# Working with Text Files

In [1]:
# Two equivalent ways to interpolate a variable into a string.
name = "Fred"

# 1) str.format() with a named placeholder
print("His name is {var}".format(var=name))

# 2) f-string literal: the expression is written directly inside the braces
print(f"His name is {name}")

His name is Fred
His name is Fred


In [3]:
# The !r conversion applies repr() to the value, so the string is printed with its quotes.
print(f"His name is {name!r}")

His name is 'Fred'


In [4]:
# Build the lookup table with keyword arguments instead of a dict literal.
d = dict(a=123, b=456)

# The key uses single quotes because the f-string itself is double-quoted
# (re-using the same quote type inside the braces only works from Python 3.12 on).
print(f"Address : {d['a']} Main Street")

Address : 123 Main Street


In [5]:
# First tuple is the header row, the rest are (author, topic, page count) records.
library = [
    ('Author', 'Topic', 'Pages'),
    ('Twain', 'Rafting', 601),
    ('Feynman', 'Physics', 95),
    ('Hamilton', 'Mythology', 144),
]

# A width of 10 pads every field: strings are left-aligned by default,
# numbers right-aligned, which is why the page counts sit on the right.
for author, topic, pages in library:
    print(f"{author:10}, {topic:10}, {pages:10}")

Author    , Topic     , Pages     
Twain     , Rafting   ,        601
Feynman   , Physics   ,         95
Hamilton  , Mythology ,        144


In [10]:
# Same table as before: header row followed by (author, topic, page count) records.
library = [
    ('Author', 'Topic', 'Pages'),
    ('Twain', 'Rafting', 601),
    ('Feynman', 'Physics', 95),
    ('Hamilton', 'Mythology', 144),
]

# In the spec ".>10" the "." is the fill character, ">" forces right alignment
# and 10 is the field width, so the last column is right-aligned with dot padding.
for author, topic, pages in library:
    print(f"{author:10} {topic:10} {pages:.>10}")

Author     Topic      .....Pages
Twain      Rafting    .......601
Feynman    Physics    ........95
Hamilton   Mythology  .......144


In [12]:
# f-string format-spec reference:
# https://docs.python.org/3/reference/lexical_analysis.html#f-strings

from datetime import datetime

# Positional form of datetime(year, month, day).
today = datetime(2018, 1, 27)

# Everything after the colon is passed to the object's __format__;
# for datetime that means strftime codes (%B month name, %Y year, %d day).
print(f"{today:%B %Y %d}")

January 2018 27


## Files

In [13]:
%%writefile test.txt
Hello, this is a quick test file.
This is the second line of the file.

Writing test.txt


In [14]:
!pwd
!ls

/content
sample_data  test.txt


In [15]:
# Open test.txt read-only. The handle stays open across the following cells,
# so it has to be closed explicitly once the reading demos are done.
myfile = open(file = "test.txt", mode = "r")

In [17]:
myfile.read()

'Hello, this is a quick test file.\nThis is the second line of the file.'

In [18]:
# The previous read() left the cursor at end-of-file, so reading again returns ''.
myfile.read()

''

In [19]:
# seek(0) moves the cursor back to the start so the full text can be read again.
myfile.seek(0)
myfile.read()

'Hello, this is a quick test file.\nThis is the second line of the file.'

In [20]:
# Rewind, then read the file as a list of lines (each line keeps its "\n").
myfile.seek(0)
lines = myfile.readlines()

# This is the last use of the read-only handle: close it so it is not leaked
# when test.txt is reopened for writing further down.
myfile.close()

lines

['Hello, this is a quick test file.\n', 'This is the second line of the file.']

In [21]:
# "w+" opens for writing and reading and truncates the file on open:
# the original two lines of test.txt are discarded here.
my_file = open("test.txt","w+")

In [22]:
# write() returns the number of characters written (24 for this string).
my_file.write("This is a new first line")

24

In [23]:
my_file.seek(0)
my_file.read()

'This is a new first line'

In [24]:
my_file.close()

In [25]:
# "a+" opens for appending and reading: existing content is kept and every
# write lands at the end of the file.
# NOTE(review): the message below says "text.txt" although the file is test.txt.
my_file = open("test.txt","a+")
my_file.write("\nThis line is being appended to text.txt")
my_file.write("\nAnd another line here.")

23

In [26]:
my_file.seek(0)
print(my_file.read())

This is a new first line
This line is being appended to text.txt
And another line here.


In [27]:
my_file.close()

In [28]:
%%writefile -a test.txt

This is more text being appended to test.txt
And another line here.

Appending to test.txt


In [29]:
# readline() reads just the first line instead of loading the whole file into a
# list and discarding everything but element 0.
with open("test.txt","r") as txt:
    first_line = txt.readline()

# The line keeps its trailing "\n", so print() shows an extra blank line.
print(first_line)

This is a new first line



In [30]:
# The with-block above has already closed `txt`, so reading from it raises
# ValueError. Catch and display the error so the demonstration does not abort
# a "Restart & Run All".
try:
    txt.read()
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: ignored

In [33]:
# A file object is an iterator over its lines; collecting them gives the same
# list readlines() would return (newline characters included).
with open("test.txt", "r") as txt:
    print([line for line in txt])

['This is a new first line\n', 'This line is being appended to text.txt\n', 'And another line here.\n', 'This is more text being appended to test.txt\n', 'And another line here.']


In [31]:
# Iterate over the file line by line without loading it all at once.
with open("test.txt","r") as txt:
    for line in txt:
        # Each line already ends with "\n", so suppress print()'s own newline.
        print(line, end = "")

This is a new first line
This line is being appended to text.txt
And another line here.
This is more text being appended to test.txt
And another line here.

# Working With PDF Files

In [36]:
%%capture
!pip install PyPDF2
import PyPDF2
import zipfile

from google.colab import drive
drive.mount("/content/drive")

In [37]:
# Unpack the course archive into the current working directory. The context
# manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile("/content/drive/MyDrive/NLP_Vol1/UPDATED_NLP_COURSE.zip","r") as zip_file:
    zip_file.extractall()

In [38]:
# rb = read binary
f = open("/content/UPDATED_NLP_COURSE/00-Python-Text-Basics/US_Declaration.pdf", "rb")

In [39]:
# Wrap the open binary handle in a reader; numPages is the page count of the document.
pdf_reader = PyPDF2.PdfFileReader(f)

pdf_reader.numPages

5

In [41]:
# Pages are zero-indexed: getPage(0) is the first page.
page_one = pdf_reader.getPage(0)
# extractText() returns the page's text as a single string.
page_one_text = page_one.extractText()

In [42]:
page_one_text

"Declaration of IndependenceIN CONGRESS, July 4, 1776. The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the\npolitical bands which have connected them with another, and to assume among the powers of the\nearth, the separate and equal station to which the Laws of Nature and of Nature's God entitle\n\nthem, a decent respect to the opinions of mankind requires that they should declare the causes\n\nwhich impel them to the separation. \nWe hold these truths to be self-evident, that all men are created equal, that they are endowed by\n\ntheir Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit\nof Happiness.ŠThat to secure these rights, Governments are instituted among Men, deriving\n\ntheir just powers from the consent of the governed,ŠThat whenever any Form of Government\nbecomes destructive of these ends, it is the Right of the People to alter or 

In [43]:
f.close()

In [56]:
# Example !!!

# rb = read binary
f = open("/content/UPDATED_NLP_COURSE/00-Python-Text-Basics/US_Declaration.pdf", "rb")
pdf_reader = PyPDF2.PdfFileReader(f)

In [57]:
first_page = pdf_reader.getPage(0)

In [58]:
pdf_writer = PyPDF2.PdfFileWriter()

pdf_writer.addPage(first_page)

In [59]:
pdf_writer

<PyPDF2.pdf.PdfFileWriter at 0x7f81d1290090>

In [60]:
pdf_output = open("Some_New_Doc.pdf","wb")

pdf_writer.write(pdf_output)

In [61]:
pdf_output.close()
f.close()

In [62]:
# Read the newly written PDF back as raw bytes to confirm it was created.
# The with-block closes the handle, which was previously left open.
with open("Some_New_Doc.pdf","rb") as pdf_output:
    print(pdf_output.read())

b'%PDF-1.3\n1 0 obj\n<<\n/Type /Pages\n/Count 1\n/Kids [ 3 0 R ]\n>>\nendobj\n2 0 obj\n<<\n/Producer (PyPDF2)\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 1 0 R\n/Contents 5 0 R\n/MediaBox [ 0 0 612 792 ]\n/Resources <<\n/Font <<\n/F9 6 0 R\n/F6 10 0 R\n>>\n/ProcSet [ /PDF /Text ]\n>>\n>>\nendobj\n4 0 obj\n<<\n/Type /Catalog\n/Pages 1 0 R\n>>\nendobj\n5 0 obj\n<<\n/Length 4539\n>>\nstream\nBT\r0 0 0 rg\r1 i \r/RelativeColorimetric ri\r/F6 1 Tf 0.00333 Tc 0 Tw 0 Tr \r18 0 0 18 201.72 703.2 Tm\r(Declaration of Independence)Tj\r0.01717 Tc -0.00858 Tw \r13.98 0 0 13.98 216.84 671.28 Tm\r(IN CONGRESS, July 4, 1776.)Tj\r0.015 Tc -0.01999 Tw \r12 0 0 12 392.28 671.28 Tm\r( )Tj\r/F9 1 Tf 0 Tc 0 Tw \r-26.685 -2.425 Td\r(The unanimous Declaration of the thirteen united States of America,)Tj\r/F6 1 Tf \r29.2 0 Td\r( )Tj\r-29.195 -2.36 Td\r(When in the Course of human events, it becomes necessary for one people \\\rto dissolve the)Tj\r1.175 TL T*(political bands which have connected them with an

In [68]:
# Simple example: collect the text of every page of the PDF, one string per page.

pdf_text = []

# "rb" = read binary. The context manager closes the file even if PyPDF2 raises
# while parsing a page.
with open("/content/UPDATED_NLP_COURSE/00-Python-Text-Basics/US_Declaration.pdf", "rb") as f:

    pdf_reader = PyPDF2.PdfFileReader(f)

    for p in range(pdf_reader.numPages):

        page = pdf_reader.getPage(p)

        pdf_text.append(page.extractText())

In [69]:
print(len(pdf_text))

5


In [70]:
print(pdf_text[2])

to render it at once an example and fit instrument for introducing the sameabsolute rule into these Colonies:For taking away our Charters, abolishing our most valuable Laws, and alteringfundamentally the Forms of our Governments:For suspending our own Legislatures, and declaring themselves invested withpower to legislate for us in all cases whatsoever.He has abdicated Government here, by declaring us out of his Protection andwaging War against us.He has plundered our seas, ravaged our Coasts, burnt our towns, and destroyed thelives of our people.He is at this time transporting large Armies of foreign Mercenaries to compleatthe works of death, desolation and tyranny, already begun with circumstances ofCruelty & perfidy scarcely paralleled in the most barbarous ages, and totallyunworthy of the Head of a civilized nation.He has constrained our fellow Citizens taken Captive on the high Seas to bearArms against their Country, to become the executioners of their friends and
Brethren, or to f

# Regular Expressions

In [71]:
text = "The agent's phone number is 408-555-1234. Call soon!"

In [72]:
"phone" in text

True

In [73]:
import re

pattern = "phone"

# search() scans the string and returns a Match object for the first
# occurrence of the pattern (or None when there is none).
re.search(pattern, text)

<re.Match object; span=(12, 17), match='phone'>

In [74]:
match = re.search(pattern,text)

In [76]:
match.span()

(12, 17)

In [78]:
match.start(),match.end()

(12, 17)

In [79]:
# Rebind `text` to a sentence that contains the word twice.
text = "my phone is a new phone"

# search() stops at the first occurrence, so only that one is reported.
match = re.search(pattern="phone", string=text)

match.span()

(3, 8)

In [80]:
matches = re.findall(pattern, text)

matches

['phone', 'phone']

In [81]:
len(matches)

2

In [83]:
# finditer() yields one Match object per occurrence, in order.
# Note: the loop rebinds `match`; after the loop it refers to the last occurrence.
for match in re.finditer("phone",text):
    print(match.span())

(3, 8)
(18, 23)


In [86]:
match.group()

'phone'

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >\d</span></td><td>A digit</td><td>file_\d\d</td><td>file_25</td></tr>

<tr ><td><span >\w</span></td><td>Alphanumeric</td><td>\w-\w\w\w</td><td>A-b_1</td></tr>



<tr ><td><span >\s</span></td><td>White space</td><td>a\sb\sc</td><td>a b c</td></tr>



<tr ><td><span >\D</span></td><td>A non digit</td><td>\D\D\D</td><td>ABC</td></tr>

<tr ><td><span >\W</span></td><td>Non-alphanumeric</td><td>\W\W\W\W\W</td><td>*-+=)</td></tr>

<tr ><td><span >\S</span></td><td>Non-whitespace</td><td>\S\S\S\S</td><td>Yoyo</td></tr></table>

In [87]:
text = "My telephone number is 408-555-1234"

In [93]:
# The r prefix keeps the backslashes literal so the regex engine sees \d (one digit).
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d',text)

In [95]:
phone.group()

'408-555-1234'

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr></table>

In [97]:
# Same phone pattern written with {n} quantifiers. The raw-string prefix is
# required: in a plain string "\d" is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning from 3.12).
re.search(r"\d{3}-\d{3}-\d{4}",text).group()

'408-555-1234'

In [106]:
# Each pair of parentheses is a capture group: three digits, three digits, four digits.
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

# Call search() on the compiled pattern itself rather than passing it to re.search().
results = phone_pattern.search(text)

In [111]:
# groups() returns every capture group as one tuple: (group(1), group(2), group(3)).
results.groups()

('408', '555', '1234')

In [112]:
results.group()

'408-555-1234'

In [113]:
# | is alternation: the pattern matches either "man" or "women".
re.search(r"man|women","This man was here"),re.search(r"man|women","This women was here")

(<re.Match object; span=(5, 8), match='man'>,
 <re.Match object; span=(5, 10), match='women'>)

In [114]:
# . matches any single character, so ".at" is one character followed by "at".
re.findall(r".at","The bat went splat")

['bat', 'lat']

In [115]:
# Mind the space: with two wildcards the match for "bat" includes the space before it.
re.findall(r"..at","The bat went splat")

[' bat', 'plat']

In [119]:
# \S matches any non-whitespace character (see the character table above),
# so \S+at captures the whole run of non-space characters ending in "at".
re.findall(r"\S+at","The bat went splat")

['bat', 'splat']

In [122]:
# ^ anchors the pattern at the start of the string
# $ anchors the pattern at the end of the string

re.findall(r"\d$" , "This ends with a number 2")

['2']

In [123]:
# ^\d only matches a digit that is the very first character of the string.
re.findall(r"^\d" , "1 is the loneliest number.")

['1']

In [124]:
# Inside brackets a leading ^ negates the class: [^\d] matches any single
# character that is NOT a digit, so this returns every non-digit character
# of the phrase one by one (the digits are dropped, not extracted).

phrase = "there are 3 numbers 34 inside 5 this sentence."

re.findall(r"[^\d]", phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e',
 '.']

In [129]:
# Adding + groups consecutive non-digit characters into runs, i.e. the text between the numbers.
re.findall(r"[^\d]+", phrase)

['there are ', ' numbers ', ' inside ', ' this sentence.']

In [130]:
# Without the ^ the class matches digits, so this extracts the numbers themselves.
re.findall(r"[\d]+", phrase)

['3', '34', '5']

In [133]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

# Keep every run of characters that is not "!", ".", "?" or a space ...
words = re.findall(r"[^!.? ]+", test_phrase)

# ... and glue the runs back together with single spaces.
" ".join(words)

'This is a string But it has punctuation How can we remove it'

In [135]:
text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are'

# One or more word characters, a literal hyphen, then one or more word characters.
re.findall(r"[\w]+-[\w]+",text)

['hypen-words', 'long-ish']

In [136]:
text = 'Hello, would you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

In [137]:
re.search(r"cat(fish|nap|claw)",text)

<re.Match object; span=(27, 34), match='catfish'>

In [138]:
re.search(r"cat(fish|nap|claw)",texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [139]:
# None of the alternatives (fish|nap|claw) follows "cat" in "caterpillar",
# so search() returns None and the cell displays nothing.
re.search(r"cat(fish|nap|claw)",textthree)

# Exercise

In [140]:
abbr = 'NLP'
full_text = 'Natural Language Processing'

# Exercise solution: interpolate both variables with an f-string.
print(f'{abbr} stands for {full_text}')

NLP stands for Natural Language Processing


In [141]:
%%writefile contacts.txt
First_Name Last_Name, Title, Extension, Email

Writing contacts.txt


In [142]:
# Read contacts.txt back in full to check the header line it contains.
with open("contacts.txt") as c:
    fields = c.read()

fields

'First_Name Last_Name, Title, Extension, Email'

In [143]:
import PyPDF2

# Extract the text of the second page (index 1) of the proposal. The with-block
# guarantees the file is closed even if PyPDF2 raises while parsing.
with open("/content/UPDATED_NLP_COURSE/00-Python-Text-Basics/Business_Proposal.pdf", "rb") as f:

    pdf_reader = PyPDF2.PdfFileReader(f)

    page_two_text = pdf_reader.getPage(1).extractText()

print(page_two_text)

AUTHORS:
 
Amy Baker, Finance Chair, x345, abaker@ourcompany.com
 
Chris Donaldson, Accounting Dir., x621, cdonaldson@ourcompany.com
 
Erin Freeman, Sr. VP, x879, efreeman@ourcompany.com
 



In [144]:
# First attempt: append the page text verbatim. Because the header line has no
# trailing newline, the "AUTHORS:" heading ends up fused to it (see the output).
with open("contacts.txt","a+") as c:
    c.write(page_two_text)
    # After writing, the cursor is at the end of the file; rewind to read it all back.
    c.seek(0)
    print(c.read())

First_Name Last_Name, Title, Extension, EmailAUTHORS:
 
Amy Baker, Finance Chair, x345, abaker@ourcompany.com
 
Chris Donaldson, Accounting Dir., x621, cdonaldson@ourcompany.com
 
Erin Freeman, Sr. VP, x879, efreeman@ourcompany.com
 



In [147]:
%%writefile contacts.txt
First_Name Last_Name, Title, Extension, Email

Overwriting contacts.txt


In [148]:
# Append only the contact rows: slice off the leading "AUTHORS:" heading,
# spelling the offset as the heading's length instead of a bare 8.
with open("contacts.txt", "a+") as c:
    c.write(page_two_text[len("AUTHORS:"):])
    # The write left the cursor at the end of the file; rewind before reading back.
    c.seek(0)
    print(c.read())

First_Name Last_Name, Title, Extension, Email
 
Amy Baker, Finance Chair, x345, abaker@ourcompany.com
 
Chris Donaldson, Accounting Dir., x621, cdonaldson@ourcompany.com
 
Erin Freeman, Sr. VP, x879, efreeman@ourcompany.com
 



In [149]:
print(page_two_text)

AUTHORS:
 
Amy Baker, Finance Chair, x345, abaker@ourcompany.com
 
Chris Donaldson, Accounting Dir., x621, cdonaldson@ourcompany.com
 
Erin Freeman, Sr. VP, x879, efreeman@ourcompany.com
 



In [151]:
# Extract the e-mail addresses: word characters, "@", word characters, a literal
# dot and a three-character suffix. The dot must be escaped; an unescaped "."
# matches any character, so the pattern would also accept e.g. "user@host_com".
re.findall(r"\w+@\w+\.\w{3}",page_two_text)

['abaker@ourcompany.com',
 'cdonaldson@ourcompany.com',
 'efreeman@ourcompany.com']