# Language Processing

In [2]:
from collections import Counter

### Creating a word counter function

In [9]:
def word_count(text):
    '''
    Counts the number of time each word occurs in the given text. 
    - First remove the poncuation
    - Second return a dictionary with keys as word and values are the number of occurence of the corresponding word.
    '''
    # Clean the text --- Lower case and removing the ponctuation
    text = text.lower()
    ponct = ['.', ',', ':', ';', "'", '"', '!']
    for ch in ponct:
        text = text.replace(ch, '')
    
    word_counts = Counter(text.split(' '))
    return word_counts

In [14]:
text = 'Counts the number of time each word occurs in the given text. First remove the poncuation. Second return a dictionary with keys as word and values are the number of occurence of the corresponding word'

In [15]:
word_count(text)

Counter({'counts': 1,
         'the': 5,
         'number': 2,
         'of': 3,
         'time': 1,
         'each': 1,
         'word': 3,
         'occurs': 1,
         'in': 1,
         'given': 1,
         'text': 1,
         'first': 1,
         'remove': 1,
         'poncuation': 1,
         'second': 1,
         'return': 1,
         'a': 1,
         'dictionary': 1,
         'with': 1,
         'keys': 1,
         'as': 1,
         'and': 1,
         'values': 1,
         'are': 1,
         'occurence': 1,
         'corresponding': 1})

### Creating a function that reads a book and return a text

In [32]:
def read_book(path_file):
    '''
    This function reads a book and return a text
    '''
    with open(path_file, 'r', encoding='utf8') as f:
        text = f.read()
        text = text.replace('\n', "").replace('\r', "")
    return text

In [38]:
path_file = 'Books/English/shakespeare/Romeo and Juliet.txt'
text = read_book(path_file)

In [39]:
# Check if we can find the sentense "What's in a name?"

index = text.find("What's in a name?")
index

41085

In [40]:
# Let display part of the text
text[index : index + 1000]

"What's in a name? That which we call a rose    By any other name would smell as sweet.    So Romeo would, were he not Romeo call'd,    Retain that dear perfection which he owes    Without that title. Romeo, doff thy name;    And for that name, which is no part of thee,    Take all myself.  Rom. I take thee at thy word.    Call me but love, and I'll be new baptiz'd;    Henceforth I never will be Romeo.  Jul. What man art thou that, thus bescreen'd in night,    So stumblest on my counsel?  Rom. By a name    I know not how to tell thee who I am.    My name, dear saint, is hateful to myself,    Because it is an enemy to thee.    Had I it written, I would tear the word.  Jul. My ears have yet not drunk a hundred words    Of that tongue's utterance, yet I know the sound.    Art thou not Romeo, and a Montague?  Rom. Neither, fair saint, if either thee dislike.  Jul. How cam'st thou hither, tell me, and wherefore?    The orchard walls are high and hard to climb,    And the place death, consid

In [41]:
# Counts word in Romeo and Juliet text
r_j_words = word_count(text)
len(r_j_words)

4174

In [42]:
r_j_words

Counter({'\ufeff1595the': 1,
         'tragedy': 1,
         'of': 400,
         'romeo': 125,
         'and': 706,
         'julietby': 1,
         'william': 1,
         'shakespearedramatis': 1,
         'personae': 1,
         '': 11990,
         'chorus': 3,
         'escalus': 2,
         'prince': 36,
         'verona': 12,
         'paris': 33,
         'a': 459,
         'young': 23,
         'count': 5,
         'kinsman': 12,
         'to': 538,
         'the': 676,
         'montague': 28,
         'heads': 7,
         'two': 25,
         'houses': 7,
         'at': 70,
         'variance': 2,
         'with': 251,
         'each': 4,
         'other': 20,
         'capulet': 30,
         'an': 85,
         'old': 33,
         'man': 66,
         'family': 1,
         'son': 17,
         'tybalt': 52,
         'nephew': 4,
         'lady': 67,
         'mercutio': 20,
         'friend': 15,
         'benvolio': 15,
         'friar': 85,
         'laurence': 9,
         'fra

### Computing Word Frequency Statistics

In [43]:
# Creating a function
def words_stats(word_count):
    '''
    Returns number of unique words and word frequencies. 
    '''
    num_unique = len(word_count)
    counts = word_count.values()
    return (num_unique, counts)

In [46]:
(num_unique, counts) = words_stats(r_j_words)

In [47]:
# How much the book have of unique words
num_unique

4174

In [48]:
# The total numbre of words
sum(counts)

37692

### Comparing the English and the German version of Romeo & Juliet

In [51]:
Eng_path = 'Books/English/shakespeare/Romeo and Juliet.txt'
Gr_path = 'Books/German/shakespeare/Romeo und Julia.txt'

# Read the books & transform to text
eng_text = read_book(Eng_path)
gr_text = read_book(Gr_path)

# Counts words
eng_words = word_count(eng_text) 
gr_words = word_count(gr_text)

# Stats and frequencies
(eng_num_unique, eng_counts) = words_stats(eng_words)
(gr_num_unique, gr_counts) = words_stats(gr_words)

# Print the results

print(f'Unique words in the English version : {eng_num_unique} ; Total number of words in the English version : {sum(eng_counts)}.')
print(f'Unique words in the German version : {gr_num_unique} ; Total number of words in the German version : {sum(gr_counts)}.')

Unique words in the English version : 4174 ; Total number of words in the English version : 37692.
Unique words in the German version : 6491 ; Total number of words in the German version : 17869.


### Looping over the whole dircetory of books

In [56]:
dir = 'Books'
stats = pd.DataFrame(columns=('language', 'author', 'title', 'length', 'unique_words'))

idx = 1
for language in os.listdir(dir):
    for author in os.listdir(dir + '/' + language):
        for title in os.listdir(dir + '/' + language + '/' + author):
            inputfile =  dir + '/' + language + '/' + author + '/' + title
            print(inputfile)
            text = read_book(inputfile)
            (num_unique, counts) = words_stats(word_count(text))
            stats.loc[idx] = language, author.capitalize(), title.replace('.txt', ''), sum(counts), num_unique
            idx += 1
            

Books/English/shakespeare/A Midsummer Night's Dream.txt
Books/English/shakespeare/Hamlet.txt
Books/English/shakespeare/Macbeth.txt
Books/English/shakespeare/Othello.txt
Books/English/shakespeare/Richard III.txt
Books/English/shakespeare/Romeo and Juliet.txt
Books/English/shakespeare/The Merchant of Venice.txt
Books/French/chevalier/L'a╠èle de sable.txt
Books/French/chevalier/L'enfer et le paradis de l'autre monde.txt
Books/French/chevalier/La capitaine.txt
Books/French/chevalier/La fille des indiens rouges.txt
Books/French/chevalier/La fille du pirate.txt
Books/French/chevalier/Le chasseur noir.txt
Books/French/chevalier/Les derniers Iroquois.txt
Books/French/de Maupassant/Boule de Suif.txt
Books/French/de Maupassant/Claire de Lune.txt
Books/French/de Maupassant/Contes de la Becasse.txt
Books/French/de Maupassant/L'inutile beautC╠º.txt
Books/French/de Maupassant/La Main Gauche.txt
Books/French/de Maupassant/La Maison Tellier.txt
Books/French/de Maupassant/La petite roque.txt
Books/Fren

In [57]:
stats.head()

Unnamed: 0,language,author,title,length,unique_words
1,English,Shakespeare,A Midsummer Night's Dream,16103,4345
2,English,Shakespeare,Hamlet,28551,6776
3,English,Shakespeare,Macbeth,16874,4780
4,English,Shakespeare,Othello,26590,5898
5,English,Shakespeare,Richard III,48315,5259


In [58]:
stats.tail()

Unnamed: 0,language,author,title,length,unique_words
98,Portuguese,Queir┬ós,O crime do padre Amaro,128630,28580
99,Portuguese,Queir┬ós,O Mandarim,21440,7772
100,Portuguese,Queir┬ós,O Primo Bazilio,107303,26851
101,Portuguese,Queir┬ós,Os Maias,195771,39785
102,Portuguese,Shakespeare,Hamlet,30567,9589
