<a href="https://colab.research.google.com/github/dasmiq/CS6120-HW2/blob/master/LanguageModeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Name Thinh Lam


Your task is to train *character-level* language models. 
You will train unigram, bigram, and trigram character-level models on a collection of books from Project Gutenberg. You will then use these trained English language models to distinguish English documents from Brazilian Portuguese documents in the test set.

In [2]:
import pandas as pd
import httpimport

with httpimport.remote_repo(['lm_helper'], 'https://raw.githubusercontent.com/jasoriya/CS6120-PS2-support/master/utils/'):
    from lm_helper import get_train_data, get_test_data

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hongt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\hongt\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\hongt\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package mac_morpho to
[nltk_data]     C:\Users\hongt\AppData\Roaming\nltk_data...
[nltk_data]   Package mac_morpho is already up-to-date!


In [3]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

import collections
import math

In [4]:
# get the train and test data
train = get_train_data()
test, test_files = get_test_data()

In [5]:
len(train)*0.2

3.6

In [6]:
X_train, validation = train_test_split(train,test_size= 0.2, random_state=32)

In [7]:
def createNgramFeatures(corpus,n1=1, n2=1):
    cv = CountVectorizer(analyzer='char',ngram_range=(n1,n2))
    texts = cv.fit_transform([record for record in corpus]).toarray()
    char = cv.get_feature_names()
    return texts, char

# This cells is to join words of a sentence together in each book. 
def create_corpus(train_ds):
    books = []
    for i in range(len(train_ds)):
        for sent in train_ds[i]:
            books.append(" ".join(sent).lstrip().rstrip())
    return books

In [8]:
corpus = create_corpus(train)

In [9]:
unigram, unigram_char = createNgramFeatures(corpus)

In [10]:
#Check if there are any weird characters in the corpus.
unigram_vocab_count = collections.Counter(unigram_char)
print(unigram_vocab_count)

Counter({'\x1a': 1, ' ': 1, '!': 1, '"': 1, '$': 1, '%': 1, '&': 1, "'": 1, '(': 1, ')': 1, '*': 1, '+': 1, ',': 1, '-': 1, '.': 1, '/': 1, '0': 1, '1': 1, '2': 1, '3': 1, '4': 1, '5': 1, '6': 1, '7': 1, '8': 1, '9': 1, ':': 1, ';': 1, '<': 1, '=': 1, '>': 1, '?': 1, '@': 1, '[': 1, ']': 1, '_': 1, '`': 1, 'a': 1, 'b': 1, 'c': 1, 'd': 1, 'e': 1, 'f': 1, 'g': 1, 'h': 1, 'i': 1, 'j': 1, 'k': 1, 'l': 1, 'm': 1, 'n': 1, 'o': 1, 'p': 1, 'q': 1, 'r': 1, 's': 1, 't': 1, 'u': 1, 'v': 1, 'w': 1, 'x': 1, 'y': 1, 'z': 1, '}': 1, '~': 1, 'æ': 1, 'è': 1, 'é': 1, 'î': 1})


In [11]:
#Replace those weird characters with "^" which is not in the corpus.
corpus_char = []
for item in corpus:
    corpus_char.append(re.sub('[/$/&/%/>/</~/}/{/*/@/`/æ/é/è/î/=/+/\x1a]',"^" ,item))

In [12]:
unigram1, unigram_char = createNgramFeatures(corpus_char)
bigram1, bigram_char = createNgramFeatures(corpus_char,2,2)
trigram1, trigram_char = createNgramFeatures(corpus_char,3,3)


In [13]:
print("unigram rows: {}, unigram columns: {}".format(unigram1.shape[0], unigram1.shape[1]))
print("bigram rows: {}, bigram columns: {}".format(bigram1.shape[0], bigram1.shape[1]))
print("trigram rows: {}, trigram columns: {}".format(trigram1.shape[0], trigram1.shape[1]))

unigram rows: 98552, unigram columns: 52
bigram rows: 98552, bigram columns: 936
trigram rows: 98552, trigram columns: 9099


In [14]:
#Check again to ensure those weird characters have been replaced. 
unigram_vocab_count = collections.Counter(unigram_char)
print(unigram_vocab_count)

Counter({' ': 1, '!': 1, '"': 1, "'": 1, '(': 1, ')': 1, ',': 1, '-': 1, '.': 1, '0': 1, '1': 1, '2': 1, '3': 1, '4': 1, '5': 1, '6': 1, '7': 1, '8': 1, '9': 1, ':': 1, ';': 1, '?': 1, '[': 1, ']': 1, '^': 1, '_': 1, 'a': 1, 'b': 1, 'c': 1, 'd': 1, 'e': 1, 'f': 1, 'g': 1, 'h': 1, 'i': 1, 'j': 1, 'k': 1, 'l': 1, 'm': 1, 'n': 1, 'o': 1, 'p': 1, 'q': 1, 'r': 1, 's': 1, 't': 1, 'u': 1, 'v': 1, 'w': 1, 'x': 1, 'y': 1, 'z': 1})


In [15]:
def ngram_count(corpus, n1=1,n2=1):
    
    cv = CountVectorizer(analyzer='char', ngram_range = (n1,n2))
    ngram = cv.fit_transform(corpus)      #create a matrix of chars
    
    #list of characters.
    chars = cv.get_feature_names() 
    
    #add up values all rows in each columns. The first row represent the count of each char.
    count_char = np.array(np.sum(ngram,axis=0))[0]
    
    #create a dictionary of each character and its count value. 
    ngram = dict(zip(chars, count_char))
    
    return ngram, sum(count_char)
    
    
uni, TOTAL_UNI = ngram_count(corpus_char) 
bi, TOTAL_BI = ngram_count(corpus_char,2,2) 
tri, TOTAL_TRI = ngram_count(corpus_char,3,3)

In [16]:
#print(uni)
#print(bi)
#print(tri)

#TOTAL_UNI = sum(uni.values())
#TOTAL_BI = sum(bi.values())
#TOTAL_TRI = sum(tri.values())

print(TOTAL_UNI)
print(TOTAL_BI)
print(TOTAL_TRI)

12010505
11911953
11813477


In [17]:
# Your code here
def unigram_probability(uni_char):
    if uni_char in uni.keys():
        return uni[uni_char]/TOTAL_UNI 
    return uni['^']/TOTAL_UNI
    
def bigram_probability(bi_char):
    if bi_char in bi.keys():
        return bi[bi_char]/uni[bi_char[0]]
    return 0

def trigram_probability(tri_char):
    if tri_char in tri.keys():
        return tri[tri_char]/bi[tri_char[:2]]
    return 0

In [18]:
def linear_interpolation_perplexity(train_ds,L1,L2,L3):
    results=[]
    
    def create_trigram(sentence):
        return [sentence[i:i+3] for i in range(len(sentence)-3)]
    
    for book in train_ds:
        cv = CountVectorizer(analyzer='char')
        matrix = cv.fit_transform(book)
        total_char = sum(np.array(np.sum(matrix,axis=0))[0])
        
        sum_log_prob = 0
        for sentence in book: 
            for tri_gram in create_trigram(sentence):
                probability = L1 * unigram_probability(tri_gram[-1]) + L2 * bigram_probability(tri_gram[0:2]) + L3 * trigram_probability(tri_gram)
                sum_log_prob += math.log(probability,2)

        
        perplexity = 2**(-sum_log_prob/total_char)
        results.append(perplexity)
    return results

In [19]:
LAMBDA_SETS =[[0.11,0.09,0.8],
             [0.05,0.05,0.9],
             [0.9,0.05,0.05],
             [0.17,0.13,0.7],
             [0.25,0.6,0.15],
             [0.23,0.17,0.6],
             [0.13,0.27,0.6],
             [0.1,0.18,0.72],
             [0.1,0.1,0.8]]


def create_test(testset):
    results=[]
    for i in range(len(testset)):
        sentences=[]
        for j in range(len(testset[i])):
            sentences.append((" ".join(testset[i][j]).lstrip().rstrip()))
        results.append(sentences)
    return results

vals = create_test(validation)

In [20]:
for i in range(len(LAMBDA_SETS)):
    pp= linear_interpolation_perplexity(vals,LAMBDA_SETS[i][0],LAMBDA_SETS[i][1],LAMBDA_SETS[i][2])
    print("L1: {}, L2: {}, L3: {}, pp1: {}, pp2: {} , pp3: {}, pp4: {}".format(LAMBDA_SETS[i][0],
                                                                               LAMBDA_SETS[i][1],
                                                                               LAMBDA_SETS[i][2],
                                                                               pp[0],pp[1],pp[2],pp[3]))


L1: 0.11, L2: 0.09, L3: 0.8, pp1: 7.238311163593336, pp2: 6.656228381379482 , pp3: 7.877630967954666, pp4: 7.92132629371633
L1: 0.05, L2: 0.05, L3: 0.9, pp1: 7.646427035120186, pp2: 6.918109643452387 , pp3: 8.474975729008921, pp4: 8.742446669413381
L1: 0.9, L2: 0.05, L3: 0.05, pp1: 13.450811257722803, pp2: 13.25106522481023 , pp3: 13.94735015408296, pp4: 12.228459658919187
L1: 0.17, L2: 0.13, L3: 0.7, pp1: 7.0965398569076905, pp2: 6.598234462964519 , pp3: 7.643070590060275, pp4: 7.569056254016475
L1: 0.25, L2: 0.6, L3: 0.15, pp1: 7.973727202855698, pp2: 7.707689734584803 , pp3: 8.428911739065006, pp4: 8.15049942887561
L1: 0.23, L2: 0.17, L3: 0.6, pp1: 7.0951963890833785, pp2: 6.656348371609629 , pp3: 7.584790254769845, pp4: 7.423166127185495
L1: 0.13, L2: 0.27, L3: 0.6, pp1: 6.734976816695541, pp2: 6.293488937941123 , pp3: 7.2459577838664835, pp4: 7.265958670019745
L1: 0.1, L2: 0.18, L3: 0.72, pp1: 6.858886740523797, pp2: 6.3481379212831905 , pp3: 7.434660254103765, pp4: 7.516019005906

## My best Lambda_set is 0.13,0.27,0.6 

## 1.2
Calculate the perplexity for each document in the test set using the linear interpolation smoothing method. For determining λs for linear interpolation of the trigram, bigram, and unigram models, you can divide the training data into a new training set (80%) and a held-out set (20%).
Then choose ~10 random pairs of $(\lambda_3, \lambda_2)$ such that $1 > \lambda_3 > \lambda_2 > 0$ and $\sum_{i=1}^3 \lambda_i = 1$ to test on held-out data.

Some documents in the test set are in Brazilian Portuguese. Identify them as follows: 
  - Sort by perplexity and set a cut-off threshold. All the documents above this threshold score should be categorized as Brazilian Portuguese. 
  - Print the file names (from `test_files`) and perplexities of the documents above the threshold

    ```
        file name, score
        file name, score
        . . .
        file name, score
    ```

  - Copy this list of filenames and manually annotate them as being correctly or incorrectly labeled as Portuguese.




In [21]:
test_corpus = create_test(test)

#compute perplexities for the test documents to find the threshold. 
test_results=linear_interpolation_perplexity(test_corpus,0.13,0.27,0.6)
for item in test_results:
    print(item)

8.990035755187394
9.183538995430567
8.494066898830214
7.423244135292134
7.724299802362842
8.093941902921818
16.113449087468254
15.422144980722324
6.669571270843347
9.9386005059876
6.793393943861492
7.289269848050166
7.523691034925563
7.710589420243429
6.787827635854505
8.86372475666537
6.849153530803214
16.609484975479575
7.310579881198023
7.492478421056932
6.494627472042587
8.814281809223031
6.609634813179062
7.9027001691176615
6.6808534891452
7.936628955413524
7.67751963755789
7.984615697583378
7.595579033825983
8.725589922615477
7.595843932592441
7.89268370317403
8.437217867895308
7.4925201334099905
8.262544667233222
8.12252936871801
6.576473278255494
7.935411173344955
6.40457886825349
8.155366269749901
8.188886650904761
16.137268031378238
7.765881237969996
15.727599422118397
7.550451395248099
7.870753624166888
6.821253564422213
8.315113720167242
9.13132145147403
7.50652053014766
8.06679995598811
7.138020621336395
7.507867530134214
7.789421001773431
9.314093916774384
6.5624990380788

In [22]:
THRESHOLD = 12
book_list=pd.DataFrame()
book_list["filename"]=test_files
book_list["score"]= test_results
book_list=book_list.sort_values(by='score',axis=0)
book_list["language"]=book_list["score"].apply(lambda x: "English " if x<THRESHOLD else "Portugese")


print("the list of Portugese books")
display(book_list[book_list.score > THRESHOLD])

the list of Portugese books


Unnamed: 0,filename,score,language
59,ag94fe1.txt,14.664378,Portugese
7,ag94ja11.txt,15.422145,Portugese
183,ag94mr1.txt,15.48454,Portugese
122,br94ju01.txt,15.616903,Portugese
165,br94ab02.txt,15.641852,Portugese
43,ag94jl12.txt,15.727599,Portugese
98,ag94ab12.txt,15.763092,Portugese
133,br94jl01.txt,15.827872,Portugese
132,ag94no01.txt,15.894473,Portugese
100,br94de01.txt,15.956685,Portugese


In [23]:
display(book_list[book_list.score < THRESHOLD])

Unnamed: 0,filename,score,language
157,cg04,6.283744,English
38,cf27,6.404579,English
20,cf26,6.494627,English
69,cf13,6.499864,English
139,cf36,6.538029,English
...,...,...,...
78,ca31,9.744572,English
9,ca16,9.938601,English
140,ca17,10.006179,English
62,ca18,10.599886,English


## 1.3
Build a trigram language model with add-λ smoothing (use λ = 0.1).

Sort the test documents by perplexity and perform a check for Brazilian Portuguese documents as above:

  - Observe the perplexity scores and set a cut-off threshold. All the documents above this threshold score should be categorized as Brazilian Portuguese. 
  - Print the file names and perplexities of the documents above the threshold

  ```
      file name, score
      file name, score
      . . .
      file name, score
  ```

  - Copy this list of filenames and manually annotate them for correctness.

In [24]:
ADD_LAMBDA = 0.1
NEW_THRESHOLD = 20

def add_lambda_smoothing(trichar,lamda):
    if trichar not in tri.keys():
        num=lamda                    #compute numberator as 0.1 if not in trigrams
    else:
        num=lamda + tri[trichar]     
    if trichar[:2] not in bi.keys():
        den=lamda*len(uni) 
    else:
        den= lamda*len(uni) + bi[trichar[:2]]
    return num/den

def add_smoothing_perplexity(train_ds):
    results=[]
    
    def create_trigram(sentence):
        return [sentence[i:i+3] for i in range(len(sentence)-3)]
    
    for book in train_ds:
        cv = CountVectorizer(analyzer='char')
        matrix = cv.fit_transform(book)
        total_char = sum(np.array(np.sum(matrix,axis=0))[0])
        
        sum_log_prob = 0
        for sentence in book: 
            for tri_gram in create_trigram(sentence):
                probability = add_lambda_smoothing(tri_gram,ADD_LAMBDA)
                sum_log_prob += math.log(probability,2)

        
        perplexity = 2**(-sum_log_prob/total_char)
        results.append(perplexity)
    return results

test_result_2=add_smoothing_perplexity(test_corpus)

df=pd.DataFrame()
df["filename"]=test_files
df["score"]=test_result_2
df=df.sort_values(by='score',axis=0)

df["language"]=df["score"].apply(lambda x: "English " if x< NEW_THRESHOLD else "Portugese")
print('the list of Potugese files')
display(df[df["score"]>NEW_THRESHOLD])

Potugese files
         filename      score   language
59    ag94fe1.txt  28.417827  Portugese
165  br94ab02.txt  30.289170  Portugese
133  br94jl01.txt  30.634832  Portugese
122  br94ju01.txt  30.672988  Portugese
7    ag94ja11.txt  30.840945  Portugese
100  br94de01.txt  30.968355  Portugese
215  br94ja04.txt  31.020278  Portugese
98   ag94ab12.txt  31.040430  Portugese
132  ag94no01.txt  31.153239  Portugese
170   br94fe1.txt  31.348353  Portugese
41   ag94ou04.txt  31.573793  Portugese
183   ag94mr1.txt  31.606144  Portugese
43   ag94jl12.txt  31.661051  Portugese
123  br94ag01.txt  31.760117  Portugese
82   ag94ju07.txt  31.835033  Portugese
6    ag94ma03.txt  31.950743  Portugese
164  ag94ag02.txt  32.246327  Portugese
87   ag94de06.txt  32.423947  Portugese
163  ag94se06.txt  32.799927  Portugese
17   br94ma01.txt  33.250555  Portugese


In [25]:
display(df[df["score"]<NEW_THRESHOLD])

    filename      score  language
157     cg04   7.290719  English 
69      cf13   7.439201  English 
20      cf26   7.484306  English 
124     ce21   7.573957  English 
38      cf27   7.592564  English 
..       ...        ...       ...
78      ca31  15.581845  English 
9       ca16  16.995866  English 
140     ca17  17.723653  English 
167     ce09  19.104821  English 
62      ca18  19.543667  English 

[200 rows x 3 columns]


## 1.4
Based on your observation from above questions, compare linear interpolation and add-λ smoothing by listing out their pros and cons.

Add-lambda smoothing is better in computing the perplexities which shows the apparent threshold between English and Portugese in the test_files and it runs seemingly faster than linear interpolation. 