In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import re
import string
import nltk
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


In [2]:
train_pos_path = "C:\\Users\\Elina\\Desktop\\aclImdb\\train\\pos"
train_neg_path = "C:\\Users\\Elina\\Desktop\\aclImdb\\train\\neg"


In [3]:
train_pos_list = []
train_pos_list_filename = []
for root, dirs, files in os.walk(train_pos_path):
    for file in files:
        if file.endswith('.txt'):
            with open(os.path.join(root, file), 'r',encoding="utf8") as f:
                head,tail = os.path.split(f.name)
                text = f.read()
                train_pos_list.append(text)
                train_pos_list_filename.append(tail)


In [4]:
train_neg_list = []
train_neg_list_filename = []

for root, dirs, files in os.walk(train_neg_path):
    for file in files:
        if file.endswith('.txt'):
            with open(os.path.join(root, file), 'r',encoding="utf8") as f:
                text = f.read()
                head,tail = os.path.split(f.name)
                train_neg_list.append(text)
                train_neg_list_filename.append(tail)

In [5]:
train_pos_dataFrame = pd.DataFrame(list(zip(train_pos_list,train_pos_list_filename)),columns = ["review","File Name"]) 
train_pos_dataFrame["sentiment"]="pos"
train_pos_dataFrame["p.pos"]='NaN'
train_pos_dataFrame["p.neg"]='NaN'
train_pos_dataFrame["predicted_sentiment"]='NaN'

train_neg_dataFrame = pd.DataFrame(list(zip(train_neg_list,train_neg_list_filename)),columns = ["review","File Name"]) 
train_neg_dataFrame["sentiment"]="neg"
train_neg_dataFrame["p.pos"]='NaN'
train_neg_dataFrame["p.neg"]='NaN'
train_neg_dataFrame["predicted_sentiment"]='NaN'

In [6]:
traing = pd.concat([train_pos_dataFrame,train_neg_dataFrame], ignore_index=True)

# Clean the string

In [7]:
def clean_text(text):
    text = text.lower().strip()
    text = " ".join([w for w in text.split() if len(w) > 2])
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text


In [8]:
traing["review"]=traing["review"].apply(clean_text)

In [9]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')


traing["review"] = traing["review"].apply(tokenizer.tokenize)

# Remove duplicate words and rare words

In [10]:
def remove_duplicate_words(text,rare_word):
    remove_duplicate_words = list(set(text) - set(rare_word))
    return remove_duplicate_words

# Find rare words that the occurrence is less than five times

In [11]:
exploded = traing.explode('review')
total_vocabulary  = exploded.review.value_counts(ascending=True)
total_vocabulary  = total_vocabulary[total_vocabulary > 5]
rare_word = total_vocabulary[total_vocabulary <= 5].index.to_list()

In [12]:
traing['review'] = traing['review'].apply(
        lambda x: remove_duplicate_words(x,rare_word))


# Build a vocabulary for training as list with number of document containing these words

In [13]:
exploded = traing.explode('review')
total_vocabulary  = exploded.review.value_counts(ascending=True)
total_vocabulary  = total_vocabulary[total_vocabulary > 5]
print("Build a vocabulary that the occurrence is more than five times\n\n",total_vocabulary)

Build a vocabulary that the occurrence is more than five times

 morei             6
reticent          6
satisfies         6
definetely        6
raiding           6
              ...  
for           17763
that          19964
this          22417
and           24105
the           24778
Name: review, Length: 25649, dtype: int64


# Calculate the following probability
## Probability of the occurrence
### P[“the”] = num of documents containing ‘the’ / num of all documents
## Conditional probability based on the sentiment
### P[“the” | Positive]  = # of positive documents containing “the” / num of all positive review documents

In [14]:
num_of_all_documents=traing.shape[0]
num_of_documents_containing_the= total_vocabulary["the"]

P_the=num_of_documents_containing_the/num_of_all_documents
print("P[“the”] = num of documents containing ‘the’ / num of all documents\nP[“the”] = ",P_the)

pos = traing["sentiment"]=="pos"
vol_pos = traing[pos]
total_pos_vocabulary = vol_pos.explode('review')
total_pos_vocabulary  = total_pos_vocabulary.review.value_counts(ascending=True)


neg = traing["sentiment"]=="neg"
vol_neg = traing[neg]
total_neg_vocabulary  = vol_neg.explode('review')
total_neg_vocabulary  = total_neg_vocabulary.review.value_counts(ascending=True)


num_of_all_pos_documents=vol_pos.shape[0]
num_of_all_neg_documents=vol_neg.shape[0]

num_of_pos_documents_containing_the= total_pos_vocabulary["the"]

P_the=num_of_pos_documents_containing_the/num_of_all_pos_documents

print("P[“the” | Positive]  = # of positive documents containing “the” / num of all positive review documents\nP[“the”] = ",P_the)

P[“the”] = num of documents containing ‘the’ / num of all documents
P[“the”] =  0.99112
P[“the” | Positive]  = # of positive documents containing “the” / num of all positive review documents
P[“the”] =  0.98992


# Stopwords Removal

In [15]:
from nltk.corpus import stopwords

In [16]:
def remove_stopwords(word_tokens):
    stop_words = set(stopwords.words('english')) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 

    filtered_sentence = [] 

    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
            
    return filtered_sentence

In [17]:
traing['review'] = traing['review'].apply(remove_stopwords)

In [18]:
def naive_bayes(text,total_vocabulary,total_vocal_sentiment,num_of_sentiment_documents,smoothing = False):
    if smoothing != True:
        if (text in total_vocabulary) & (text in total_vocal_sentiment):
            return total_vocal_sentiment[text]/num_of_sentiment_documents

        elif (text in total_vocabulary) & (text  not in total_vocal_sentiment):
            return 0

        elif (text not in total_vocal_sentiment) & (text  in total_vocabulary): 
            return 0

        elif (text not in total_vocabulary) & (text   not in total_vocal_sentiment): 
            return 0
    else:
        if (text in total_vocabulary) & (text in total_vocal_sentiment):
            return (total_vocal_sentiment[text]+1)/(num_of_sentiment_documents+2)

        elif (text in total_vocabulary) & (text  not in total_vocal_sentiment):
            return 1/(num_of_sentiment_documents+2)

        elif (text not in total_vocal_sentiment) & (text  in total_vocabulary): 
            return 0

        elif (text not in total_vocabulary) & (text   not in total_vocal_sentiment): 
            return 0

# Calculate accuracy using dev dataset 
## Conduct five fold cross validation

In [19]:
traing=traing.sample(frac = 1)

In [20]:
def corss_validation(train,k,smoothing = False):
    dev = 1/k
    
    for i in range (1,k+1):
        dev_chunk = train.sample(
            frac=dev, replace=False, random_state=i).copy(deep=True)
        train_chunk = train.drop(dev_chunk.index, axis=0).copy(deep=True)
       
    
    
        train_data = train_chunk.explode('review')
        total_train_vocabulary  = train_data.review.value_counts(ascending=True)
        total_train_vocabulary  = total_train_vocabulary[total_train_vocabulary > 5]
    
    
    
    
        print("Cross-validation Pass", i)
        
        pos = dev_chunk["sentiment"]=="pos"
        vol_pos = dev_chunk[pos]
        num_of_all_pos_documents=vol_pos.shape[0]
        total_pos_vocabulary = vol_pos.explode('review')
        total_pos_vocabulary  = total_pos_vocabulary.review.value_counts(ascending=True)

                
        neg = dev_chunk["sentiment"]=="neg"
        vol_neg = dev_chunk[neg]
        num_of_all_neg_documents=vol_neg.shape[0]
        total_neg_vocabulary  = vol_neg.explode('review')
        total_neg_vocabulary  = total_neg_vocabulary.review.value_counts(ascending=True)

        exploded = dev_chunk.explode('review')
        

        print("processing positive")
        exploded["p.pos"] = exploded['review'].apply(
        lambda x: naive_bayes(x,total_train_vocabulary,total_pos_vocabulary,num_of_all_pos_documents,smoothing))
        print("done")

        
        print("processing negative")
        exploded["p.neg"] = exploded['review'].apply(
        lambda x: naive_bayes(x,total_train_vocabulary,total_neg_vocabulary,num_of_all_neg_documents,smoothing))
        print("done")
        

       
        group=exploded.groupby("File Name")
        
        p_pos_sentiment=num_of_all_pos_documents/(num_of_all_pos_documents+num_of_all_neg_documents)
        p_neg_sentiment=num_of_all_neg_documents/(num_of_all_pos_documents+num_of_all_neg_documents)
        
        
        for file in dev_chunk["File Name"]:
            dev_chunk.loc[dev_chunk['File Name'] == file,"p.pos"]=group["p.pos"].get_group(file).prod()*p_pos_sentiment
            dev_chunk.loc[dev_chunk['File Name'] == file,"p.neg"]=group["p.neg"].get_group(file).prod()*p_neg_sentiment

        
        dev_chunk['predicted_sentiment'] = np.where(dev_chunk["p.pos"] > dev_chunk["p.neg"], "pos", "neg")
        correct=dev_chunk["sentiment"]==dev_chunk["predicted_sentiment"]

        
        
        accuracy=0
        for check in correct:
            if check == True:
                accuracy = accuracy+1
                
        accuracy=accuracy/correct.shape[0]
        print("accuracy = ",accuracy,"\n\n\n")

        
        
    return 
        

In [21]:
corss_validation(traing,5)

Cross-validation Pass 1
processing positive
done
processing negative
done
accuracy =  0.9504 



Cross-validation Pass 2
processing positive
done
processing negative
done
accuracy =  0.9488 



Cross-validation Pass 3
processing positive
done
processing negative
done
accuracy =  0.948 



Cross-validation Pass 4
processing positive
done
processing negative
done
accuracy =  0.9526 



Cross-validation Pass 5
processing positive
done
processing negative
done
accuracy =  0.9468 





# Do following experiments
## Compare the effect of Smoothing


In [22]:
corss_validation(traing,5,smoothing = True)

Cross-validation Pass 1
processing positive
done
processing negative
done
accuracy =  0.9502 



Cross-validation Pass 2
processing positive
done
processing negative
done
accuracy =  0.9516 



Cross-validation Pass 3
processing positive
done
processing negative
done
accuracy =  0.95 



Cross-validation Pass 4
processing positive
done
processing negative
done
accuracy =  0.953 



Cross-validation Pass 5
processing positive
done
processing negative
done
accuracy =  0.9468 





## Derive Top 10 words that predicts positive and negative class
### P[Positive| word] 

In [23]:
def top_ten_words(train,k):
    dev = 1/k
    
    dev_chunk = train.sample(
        frac=dev, replace=False, random_state=1).copy(deep=True)
    train_chunk = train.drop(dev_chunk.index, axis=0).copy(deep=True)
    
    train_data = train_chunk.explode('review')
    total_train_vocabulary  = train_data.review.value_counts(ascending=True)
    total_train_vocabulary  = total_train_vocabulary[total_train_vocabulary > 5]
    total_words=train_data.shape[0]


    pos = dev_chunk["sentiment"]=="pos"
    vol_pos = dev_chunk[pos]
    num_of_all_pos_documents=vol_pos.shape[0]
    total_pos_vocabulary = vol_pos.explode('review')
    total_pos_vocabulary  = total_pos_vocabulary.review.value_counts(ascending=True)


    neg = dev_chunk["sentiment"]=="neg"
    vol_neg = dev_chunk[neg]
    num_of_all_neg_documents=vol_neg.shape[0]
    total_neg_vocabulary  = vol_neg.explode('review')
    total_neg_vocabulary  = total_neg_vocabulary.review.value_counts(ascending=True)

    exploded = dev_chunk.explode('review')


    print("processing positive")
    exploded["p.pos"] = exploded['review'].apply(
    lambda x: naive_bayes(x,total_train_vocabulary,total_pos_vocabulary,total_words))
    print("done")

    exploded["p.pos"]=exploded["p.pos"]*(num_of_all_pos_documents/(num_of_all_pos_documents+num_of_all_neg_documents))

    print("processing negative")
    exploded["p.neg"] = exploded['review'].apply(
    lambda x: naive_bayes(x,total_train_vocabulary,total_neg_vocabulary,total_words))
    print("done")

    exploded["p.neg"]=exploded["p.neg"]*(num_of_all_pos_documents/(num_of_all_pos_documents+num_of_all_neg_documents))

    group=exploded.groupby("File Name")

    p_pos_sentiment=num_of_all_pos_documents/(num_of_all_pos_documents+num_of_all_neg_documents)
    p_neg_sentiment=num_of_all_neg_documents/(num_of_all_pos_documents+num_of_all_neg_documents)


    for file in dev_chunk["File Name"]:
        dev_chunk.loc[dev_chunk['File Name'] == file,"p.pos"]=group["p.pos"].get_group(file).prod()*p_pos_sentiment
        dev_chunk.loc[dev_chunk['File Name'] == file,"p.neg"]=group["p.neg"].get_group(file).prod()*p_neg_sentiment

        
    
    return exploded

In [24]:
word = top_ten_words(traing,5)


processing positive
done
processing negative
done


In [25]:
pos=word.sort_values(by=["p.pos"],ascending=False)
pos_pred=pos.loc[pos["sentiment"]=="pos"]
top_ten_pos = pos_pred.review.unique()[:10].tolist()

In [26]:
top_ten_pos

['one',
 'film',
 'movie',
 'like',
 'good',
 'story',
 'great',
 'see',
 'time',
 'well']

In [27]:
neg=word.sort_values(by=["p.neg"],ascending=False)
neg_pred=neg.loc[neg["sentiment"]=="neg"]
top_ten_neg = neg_pred.review.unique()[:10].tolist()

In [28]:
top_ten_neg

['movie', 'one', 'film', 'like', 'even', 'good', 'would', 'bad', 'time', 'see']

# Using the test dataset
## Use the optimal hyperparameters you found in the step e, and use it to calculate the final accuracy.  
## Use five fold cross validation for final accuracy

In [29]:
test_pos_path = "C:\\Users\\Elina\\Desktop\\aclImdb\\test\\pos"
test_neg_path = "C:\\Users\\Elina\\Desktop\\aclImdb\\test\\neg"

In [30]:
test_pos_list = []
test_pos_list_filename = []
for root, dirs, files in os.walk(test_pos_path):
    for file in files:
        if file.endswith('.txt'):
            with open(os.path.join(root, file), 'r',encoding="utf8") as f:
                head,tail = os.path.split(f.name)
                text = f.read()
                test_pos_list.append(text)
                test_pos_list_filename.append(tail)

In [31]:
test_neg_list = []
test_neg_list_filename = []

for root, dirs, files in os.walk(test_neg_path):
    for file in files:
        if file.endswith('.txt'):
            with open(os.path.join(root, file), 'r',encoding="utf8") as f:
                text = f.read()
                head,tail = os.path.split(f.name)
                test_neg_list.append(text)
                test_neg_list_filename.append(tail)

In [32]:
test_pos_dataFrame = pd.DataFrame(list(zip(test_pos_list,test_pos_list_filename)),columns = ["review","File Name"]) 
test_pos_dataFrame["sentiment"]="pos"
test_pos_dataFrame["p.pos"]='NaN'
test_pos_dataFrame["p.neg"]='NaN'
test_pos_dataFrame["predicted_sentiment"]='NaN'

test_neg_dataFrame = pd.DataFrame(list(zip(test_neg_list,test_neg_list_filename)),columns = ["review","File Name"]) 
test_neg_dataFrame["sentiment"]="neg"
test_neg_dataFrame["p.pos"]='NaN'
test_neg_dataFrame["p.neg"]='NaN'
test_neg_dataFrame["predicted_sentiment"]='NaN'

In [33]:
testing = pd.concat([test_pos_dataFrame,test_neg_dataFrame], ignore_index=True)

In [34]:
testing = pd.concat([testing,traing], ignore_index=True)

# Using the test dataset with smoothing

In [35]:
corss_validation(testing,5,smoothing = True)

Cross-validation Pass 1
processing positive
done
processing negative
done
accuracy =  0.962 



Cross-validation Pass 2
processing positive
done
processing negative
done
accuracy =  0.9624 



Cross-validation Pass 3
processing positive
done
processing negative
done
accuracy =  0.9636 



Cross-validation Pass 4
processing positive
done
processing negative
done
accuracy =  0.9651 



Cross-validation Pass 5
processing positive
done
processing negative
done
accuracy =  0.9637 



