### Working with Text Data

In [8]:
# Load the entire sample file into a single string.
with open('sample.txt', mode='r', encoding='utf-8') as source_file:
    text_data = source_file.read()

# Echo what was read (print() puts a space between its two arguments).
print("Raw Text:\n", text_data)

# Write an unchanged copy of the text to a second file.
with open('stored_text.txt', mode='w', encoding='utf-8') as copy_file:
    copy_file.write(text_data)

Raw Text:
 Hello, this is a sample text file.
This is the second line.


### Working with CSV File

In [9]:
import pandas as pd 
# Load the reviews dataset into a DataFrame.
df = pd.read_csv('reviews.csv')

# Select the 'Review' column once; it is both previewed and exported below.
review_column = df['Review']
print("Reviews:\n", review_column.head())

# Export only the reviews: no index column and no header row.
review_column.to_csv('stored_reviews.txt', header=False, index=False)

Reviews:
 0    The product is amazing!
1     Worst experience ever!
Name: Review, dtype: object


### Working with Excel File

In [10]:
# Load the spreadsheet using the openpyxl engine.
df_excel = pd.read_excel('reviews.xlsx', engine='openpyxl')

# Take the leading two rows once; they are both previewed and exported.
first_two_rows = df_excel.head(2)
print("First two rows:\n", first_two_rows)

# Write those rows to a text file in CSV form, without the index column.
first_two_rows.to_csv('extracted_excel.txt', index=False)

First two rows:
    ID                   Review
0   1  The product is amazing!
1   2   Worst experience ever!


### Working with JSON File

In [11]:
import json 
 
# Deserialize the JSON document from disk.
with open('social_data.json', mode='r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Look the city up once and reuse it for display and storage.
city = data['city']
print("Extracted City:", city)

# Persist the city value to its own text file.
with open('stored_city.txt', mode='w', encoding='utf-8') as city_file:
    city_file.write(city)

Extracted City: New York


### Working with XML File

In [12]:
import xml.etree.ElementTree as ET 
 
# Parse the XML file
tree = ET.parse('news.xml')
root = tree.getroot()

# Walk the <article> children of the root once, collecting their titles so
# the same list feeds both the printout and the output file.
# findtext() returns None when an <article> has no <title> child and '' when
# the <title> has no text, so the truthiness check skips both cases instead
# of failing on `.text` or on `None + '\n'`.
titles = []
for article in root.findall('article'):
    title = article.findtext('title')
    if title:
        titles.append(title)
        print("Extracted Title:", title)

# Store the extracted titles to a file, one title per line
with open('stored_titles.txt', 'w', encoding='utf-8') as file:
    file.writelines(stored_title + '\n' for stored_title in titles)

Extracted Title: AI is transforming industries


### Working with PDF File

In [13]:
pip install PyPDF2

Note: you may need to restart the kernel to use updated packages.


In [14]:
import PyPDF2 
# Read the PDF file and pull the text out of every page.
with open('document.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    # Call extract_text() exactly once per page; using it in both the filter
    # and the joined value would extract every page twice.
    page_texts = [page.extract_text() for page in reader.pages]
    # Keep only pages that produced some text (drops None / empty strings).
    text = "\n".join(page_text for page_text in page_texts if page_text)
# Print the extracted text
print("Extracted PDF Text:\n", text)
# Store the extracted text in a file
with open('stored_pdf_text.txt', 'w', encoding='utf-8') as output:
    output.write(text)

Extracted PDF Text:
 This is a sample PDF document.  
AI is transforming industries and automation.  


### Exercise Lab 1

#### 1. Extract text from all pages of Business_Proposal.pdf and save it in business_proposal_all.txt

In [15]:
# Read the PDF file and pull the text out of every page.
with open('Business_Proposal.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    # Call extract_text() exactly once per page; using it in both the filter
    # and the joined value would extract every page twice.
    page_texts = [page.extract_text() for page in reader.pages]
    # Keep only pages that produced some text (drops None / empty strings).
    text = "\n".join(page_text for page_text in page_texts if page_text)

# Print the extracted text
print("Extracted PDF Text:\n", text)
# Store the extracted text in the file the exercise asks for. Writing to
# 'stored_pdf_text.txt' here would overwrite the output of the earlier
# document.pdf example and never create business_proposal_all.txt.
with open('business_proposal_all.txt', 'w', encoding='utf-8') as output:
    output.write(text)

Extracted PDF Text:
 Business Proposal  
The Revolution is Coming  
Leverage agile frameworks to provide a robust synopsis for high level  
overviews. Iterative approaches to corporate strategy foster collaborative  
thinking to further the overall value proposition. Organically grow the  
holistic world view of disruptive innovation via workplace diversity and  
empowerment.  
Bring to the table win -win survival strategies to ensure proactive  
domination. At the end of the day, going forward, a new normal that has  
evolved from generation X is on the runway heading towards a streamlined  
cloud solution. User generated content in real -time will have multi ple 
touchpoints for offshoring.  
Capitalize on low hanging fruit to identify a ballpark value added activity to  
beta test. Override the digital divide with additional clickthroughs from  
DevOps. Nanotechnology immersion along the information highway will  
close the loop on focusing solely on the bottom line.  
Podcasting op

#### 2. Extract text from only page 2 of Business_Proposal.pdf and save it in business_proposal_page_2.txt

In [18]:
import PyPDF2

# Open the PDF and take the text of page 2, if the document has one.
with open('Business_Proposal.pdf', 'rb') as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)

    # Pages are zero-indexed, so page 2 lives at index 1. A document with
    # fewer than two pages yields an empty string instead of an IndexError.
    if len(reader.pages) > 1:
        page_2_text = reader.pages[1].extract_text()
    else:
        page_2_text = ""

# Show the extracted text
print("Extracted PDF Text:\n", page_2_text)

# Save the extracted text from page 2 into a text file
with open('business_proposal_page_2.txt', 'w', encoding='utf-8') as page_2_file:
    page_2_file.write(page_2_text)

print("Page 2 text extraction completed.")


Extracted PDF Text:
 AUTHORS:  
Amy Baker, Finance Chair, x345, abaker@ourcompany.com  
Chris Donaldson, Accounting Dir., x621, cdonaldson@ourcompany.com  
Erin Freeman, Sr. VP, x879, efreeman@ourcompany.com  
Page 2 text extraction completed.
