# 3.2.1 Introduction to Language Processing

In [1]:
'''
Pattern within texts - are different for languages/authors

Properties of individual books - various authors and languages
    - book lengths
    - number of Unique words
    - How attributes clustere by language/author
    
Project Gutenberg - oldest online digital library
    gutenberg.org
    
'''

'\nPattern within texts - are different for languages/authors\n\nProperties of individual books - various authors and languages\n    - book lengths\n    - number of Unique words\n    - How attributes clustere by language/author\n    \nProject Gutenberg - oldest online digital library\n    gutenberg.org\n    \n'

# 3.2.2. Counting Words

In [2]:
text = "This is my text. We're keeping this text short to keep things manageable."

In [3]:
def count_words(text):
    '''
    Count the number of times each word occures in text (str). 
    Returns dictionary where keys are unique words and values are word counts.
    '''
    word_counts = {}
    for word in text.split(" "):
        #known word
        if word in word_counts:
            word_counts[word] += 1
        
        #unknown word
        else:
            word_counts[word] = 1
            
    return word_counts

In [4]:
count_words(text)

{'This': 1,
 "We're": 1,
 'is': 1,
 'keep': 1,
 'keeping': 1,
 'manageable.': 1,
 'my': 1,
 'short': 1,
 'text': 1,
 'text.': 1,
 'things': 1,
 'this': 1,
 'to': 1}

In [5]:
#punctuation with word :  text and text. 
#lower case : This, this
def count_words(text):
    '''
    Count the number of times each word occures in text (str). skip punctuation
    Returns dictionary where keys are unique words and values are word counts.
    
    '''
    text=text.lower()
    
    skips = [".", ",", ":", ";", "'", '"']
    for ch in skips:
        text = text.replace(ch,"")
    
    word_counts = {}
    
    for word in text.split(" "):
        #known word
        if word in word_counts:
            word_counts[word] += 1
        
        #unknown word
        else:
            word_counts[word] = 1
            
    return word_counts

In [7]:
count_words(text)

{'is': 1,
 'keep': 1,
 'keeping': 1,
 'manageable': 1,
 'my': 1,
 'short': 1,
 'text': 2,
 'things': 1,
 'this': 2,
 'to': 1,
 'were': 1}

In [11]:
#using counter object
from collections import Counter

def count_words_fast(text):
    '''
    Count the number of times each word occures in text (str). skip punctuation
    Returns dictionary where keys are unique words and values are word counts.
    
    '''
    text=text.lower()
    
    skips = [".", ",", ":", ";", "'", '"']
    for ch in skips:
        text = text.replace(ch,"")
    
    word_counts = Counter(text.split(" "))

    return word_counts

In [13]:
count_words(text) == count_words_fast(text)

True

In [14]:
len(count_words("This comprehension check is to check for comprehension."))

6

In [15]:
count_words(text) is count_words_fast(text)

False

# 3.2.3 Reading in a Book

In [61]:

def read_book(title_path):
    '''
    Read a book and return it as a string
    '''
#    with open(title_path, "r", encoding="utf-8") as current_file:
    with open(title_path, "r", encoding="ISO-8859-1") as current_file:
        text = current_file.read()
        text = text.replace("\n","").replace("\r","")
        
    return text    

In [62]:
text = read_book("./python_case_studies/books/English/Shakespeare/Romeo and Juliet.txt")

In [63]:
len(text)

161333

In [64]:
ind = text.find("What's in a name?")

In [65]:
ind

51240

In [38]:
sample_text = text[ind : ind + 1000]

In [39]:
sample_text

"What's in a name? That which we call a rose    By any other name would smell as sweet.    So Romeo would, were he not Romeo call'd,    Retain that dear perfection which he owes    Without that title. Romeo, doff thy name;    And for that name, which is no part of thee,    Take all myself.  Rom. I take thee at thy word.    Call me but love, and I'll be new baptiz'd;    Henceforth I never will be Romeo.  Jul. What man art thou that, thus bescreen'd in night,    So stumblest on my counsel?  Rom. By a name    I know not how to tell thee who I am.    My name, dear saint, is hateful to myself,    Because it is an enemy to thee.    Had I it written, I would tear the word.  Jul. My ears have yet not drunk a hundred words    Of that tongue's utterance, yet I know the sound.    Art thou not Romeo, and a Montague?  Rom. Neither, fair saint, if either thee dislike.  Jul. How cam'st thou hither, tell me, and wherefore?    The orchard walls are high and hard to climb,    And the place death, consid

# 3.2.4 Computing Word Frequency Statistics

In [67]:
def word_stats(word_counts):
    '''
    return number of unique word and word frequecy
    '''
    num_unique = len(word_counts)
    counts = word_counts.values()
    return(num_unique,counts)

In [76]:
text = read_book("./python_case_studies/books/English/Shakespeare/Romeo and Juliet.txt")

In [77]:
word_counts = count_words_fast(text)

In [79]:
type(word_counts)

collections.Counter

In [70]:
(num_unique, counts) = word_stats(word_counts)

In [71]:
num_unique

4920

In [72]:
sum(counts)

39424

Romeo Juliet German

In [83]:
text = read_book("./python_case_studies/books/German/Shakespeare/Romeo und Juliette.txt")

In [84]:
word_counts = count_words_fast(text)

In [85]:
(num_unique, counts) = word_stats(word_counts)

In [86]:
num_unique

8186

In [87]:
sum(counts)

26136

# 3.2.5 Reading Multiple Files

In [88]:
import os
book_dir = "./python_case_studies/books"

In [89]:
os.listdir(book_dir)

['English', 'German']

In [91]:
for language in os.listdir(book_dir):
    for author in os.listdir(book_dir+"/"+language):
        for title in os.listdir(book_dir+"/"+language+"/"+author):
            inputfile = book_dir+"/"+language+"/"+author+"/"+title
            print(inputfile)

./python_case_studies/books/English/Shakespeare/hamlets.csv
./python_case_studies/books/English/Shakespeare/Romeo and Juliet.txt
./python_case_studies/books/German/Shakespeare/Romeo und Juliette.txt


In [92]:
for language in os.listdir(book_dir):
    for author in os.listdir(book_dir+"/"+language):
        for title in os.listdir(book_dir+"/"+language+"/"+author):
            inputfile = book_dir+"/"+language+"/"+author+"/"+title
            print(inputfile)
            text = read_book(inputfile)
            (num_unique, counts) = word_stats(count_words(text))

./python_case_studies/books/English/Shakespeare/hamlets.csv
./python_case_studies/books/English/Shakespeare/Romeo and Juliet.txt
./python_case_studies/books/German/Shakespeare/Romeo und Juliette.txt


In [93]:
'''
Pandas:
    Panel data (Multi Dimensional Struct data sets) Additional data s tructure and data analysis
    Manipulate numerical table and time series data

'''

'\nPandas:\n    Panel data (Multi Dimensional Struct data sets) Additional data s tructure and data analysis\n    Manipulate numerical table and time series data\n\n'

In [95]:
import pandas as pd

In [96]:
# dataframe - R - table
table = pd.DataFrame(columns=("name","age"))

table.loc[1] = "James", 22 #insert row 1
table.loc[2] = "Jess", 32

table

Unnamed: 0,name,age
1,James,22
2,Jess,32


In [97]:
table.columns

Index(['name', 'age'], dtype='object')

In [98]:
import os
book_dir = "./python_case_studies/books"

import pandas as pd
stats = pd.DataFrame(columns=("language","author","title","length", "unique"))
title_num = 1

for language in os.listdir(book_dir):
    for author in os.listdir(book_dir+"/"+language):
        for title in os.listdir(book_dir+"/"+language+"/"+author):
            inputfile = book_dir+"/"+language+"/"+author+"/"+title
            print(inputfile)
            text = read_book(inputfile)
            (num_unique, counts) = word_stats(count_words(text))
            
            stats.loc[title_num] = language, author, title, sum(counts), num_unique
            title_num += 1

./python_case_studies/books/English/Shakespeare/hamlets.csv
./python_case_studies/books/English/Shakespeare/Romeo and Juliet.txt
./python_case_studies/books/German/Shakespeare/Romeo und Juliette.txt


In [99]:
stats

Unnamed: 0,language,author,title,length,unique
1,English,Shakespeare,hamlets.csv,265408,19854
2,English,Shakespeare,Romeo and Juliet.txt,39424,4920
3,German,Shakespeare,Romeo und Juliette.txt,26136,8186


In [100]:
stats.head()

Unnamed: 0,language,author,title,length,unique
1,English,Shakespeare,hamlets.csv,265408,19854
2,English,Shakespeare,Romeo and Juliet.txt,39424,4920
3,German,Shakespeare,Romeo und Juliette.txt,26136,8186


In [101]:
stats.tail()

Unnamed: 0,language,author,title,length,unique
1,English,Shakespeare,hamlets.csv,265408,19854
2,English,Shakespeare,Romeo and Juliet.txt,39424,4920
3,German,Shakespeare,Romeo und Juliette.txt,26136,8186


In [103]:
import os
book_dir = "./python_case_studies/books"

import pandas as pd
stats = pd.DataFrame(columns=("language", "author", "title", "length", "unique"))
title_num = 1

for language in os.listdir(book_dir):
    for author in os.listdir(book_dir+"/"+language):
        for title in os.listdir(book_dir+"/"+language+"/"+author):
            inputfile = book_dir+"/"+language+"/"+author+"/"+title
#            print(inputfile)
            text = read_book(inputfile)
            (num_unique, counts) = word_stats(count_words(text))
            
            stats.loc[title_num] = language, author.capitalize(), title.replace(".txt","").replace(".csv",""), sum(counts), num_unique
            title_num += 1

In [105]:
stats

Unnamed: 0,language,author,title,length,unique
1,English,Shakespeare,hamlets,265408,19854
2,English,Shakespeare,Romeo and Juliet,39424,4920
3,German,Shakespeare,Romeo und Juliette,26136,8186


SyntaxError: invalid syntax (<ipython-input-107-00983b6acaa5>, line 2)