<a href="https://colab.research.google.com/github/swatidamele/Jupyter-Notebooks/blob/main/Copy_of_NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from random import randrange
import random

In [2]:
NEG = 0
POS = 1
ALL = 2

**1. Take IMDB review text file as input data**

In [3]:
df=pd.read_csv(r"https://raw.githubusercontent.com/swatidamele/Jupyter-Notebooks/main/imdb_labelled.txt",delimiter="\t",header=None,names=["IMDB Review","Sentiment"])
df.columns

Index(['IMDB Review', 'Sentiment'], dtype='object')

In [4]:
df=df.replace(',','',regex=True).replace('!','',regex=True).replace('\.','',regex=True).replace('-','',regex=True)
df   

Unnamed: 0,IMDB Review,Sentiment
0,A very very very slowmoving aimless movie abou...,0
1,Not sure who was more lost the flat character...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
743,I just got bored watching Jessice Lange take h...,0
744,Unfortunately any virtue in this film's produc...,0
745,In a word it is embarrassing,0
746,Exceptionally bad,0


**2. Divide the dataset as train, development and test**


In [5]:
split_1 = int(0.8 * len(df))
split_2 = int(0.9 * len(df))
train_data = df[:split_1]
dev_data = df[split_1:split_2]
test_data = df[split_2:]
print("Train Data")
print((train_data))
print("Dev Data")
print((dev_data))
print("Test Data")
print((test_data))

Train Data
                                           IMDB Review  Sentiment
0    A very very very slowmoving aimless movie abou...          0
1    Not sure who was more lost  the flat character...          0
2    Attempting artiness with black & white and cle...          0
3          Very little music or anything to speak of            0
4    The best scene in the movie was when Gerardo i...          1
..                                                 ...        ...
593  This film highlights the fundamental flaws of ...          1
594  The film is well paced understated and one of ...          1
595  This mostly routine factbased TV drama gets a ...          1
596                  Predictable but not a bad watch            1
597  It was clear that she had the range and abilit...          1

[598 rows x 2 columns]
Dev Data
                                           IMDB Review  Sentiment
598                       She carries the movie well            1
599  Constantine gives everythin

In [6]:
validation_data=train_data.values.tolist()
dev_data_values=dev_data.values.tolist()
test_data_values=test_data.values.tolist()

**3. Take the count of statements based on sentiment**

In [7]:
def count_sentence(data):
    label_count=[0,0, len(data)] # [neg_count, pos_count, all_count]
    for item in data:
        #label_count[ALL]+=1
        if item[1]== 0:
            label_count[NEG]+=1
        else:
            label_count[POS]+=1
            
    return label_count

In [8]:
total_count = count_sentence(validation_data)
print("Count of sentences [Negative,Positive,All] is ",total_count)

Count of sentences [Negative,Positive,All] is  [313, 285, 598]


**4. Build a vocabulary as list and omit rare words if the occurrence is less than five times**


In [9]:
def word_vocabulary(data,threshold):
   
    vocab={}
    
    for sent in data:
        count=0
        for content in sent:
            if(count%2==0): 
                word_list = content.split()
                for word in word_list:
                    if word.lower() not in vocab:
                        vocab[word.lower()]=[0,0,0]
                    else:
                        vocab[word.lower()][ALL]+=1
                        if(sent[count+1]==1):
                            vocab[word.lower()][POS]+=1
                        else:
                            if(sent[count+1]==0):
                                vocab[word.lower()][NEG]+=1
                count+=1
                    
    for key in list(vocab.keys()):
        if vocab[key][ALL]<threshold:
            del vocab[key]
                      
    return(vocab)

In [10]:
Dict = word_vocabulary(validation_data,5)
df_dict = pd.DataFrame(list(Dict.items()),columns = ['Word','Count [Negative,Positive,All]']) 
print(df_dict)

       Word Count [Negative,Positive,All]
0         a               [181, 156, 337]
1      very                  [34, 25, 59]
2     movie                 [82, 59, 141]
3     about                  [22, 14, 36]
4       man                     [4, 5, 9]
..      ...                           ...
258      am                     [6, 1, 7]
259  seeing                     [2, 3, 5]
260    each                     [2, 3, 5]
261   makes                     [1, 5, 6]
262  second                     [4, 1, 5]

[263 rows x 2 columns]


**5. Calculate the probability of the occurrence**


In [11]:
P_word = {}

for word in Dict:
    word_count = int(Dict[word][ALL])
    total_all = total_count[ALL]
    P = word_count/total_all
    P_word[word] = P
df_p_word = pd.DataFrame(list(P_word.items()),columns = ['Word','Probability(Word)'])     
print(df_p_word) 

       Word  Probability(Word)
0         a           0.563545
1      very           0.098662
2     movie           0.235786
3     about           0.060201
4       man           0.015050
..      ...                ...
258      am           0.011706
259  seeing           0.008361
260    each           0.008361
261   makes           0.010033
262  second           0.008361

[263 rows x 2 columns]


**6. Calculate the conditional probability based on the sentiment**

In [12]:
P_conditional_pos_word = {}

for word in Dict:
    word_count = int(Dict[word][POS])
    total = total_count[POS]
    P = word_count/total
    P_conditional_pos_word[word.lower()] = P
df_p_conditional_pos_word = pd.DataFrame(list(P_conditional_pos_word.items()),columns = ['Word','Probability(Word|Positive)'])      
print(df_p_conditional_pos_word)            

       Word  Probability(Word|Positive)
0         a                    0.547368
1      very                    0.087719
2     movie                    0.207018
3     about                    0.049123
4       man                    0.017544
..      ...                         ...
258      am                    0.003509
259  seeing                    0.010526
260    each                    0.010526
261   makes                    0.017544
262  second                    0.003509

[263 rows x 2 columns]


In [13]:
P_conditional_neg_word = {}

for word in Dict:
    word_count_n = int(Dict[word][NEG])
    total_n = total_count[NEG]
    P_n = word_count_n/total_n
    P_conditional_neg_word[word.lower()] = P_n
df_p_conditional_neg_word = pd.DataFrame(list(P_conditional_pos_word.items()),columns = ['Word','Probability(Word|Negative)'])     
print(df_p_conditional_neg_word)         

       Word  Probability(Word|Negative)
0         a                    0.547368
1      very                    0.087719
2     movie                    0.207018
3     about                    0.049123
4       man                    0.017544
..      ...                         ...
258      am                    0.003509
259  seeing                    0.010526
260    each                    0.010526
261   makes                    0.017544
262  second                    0.003509

[263 rows x 2 columns]


**7. Calculate accuracy using five fold cross validation and compare the effect of Smoothing**

In [14]:

k=5

folds=validation_data 
#print(validation_data)
fold_size= int((len(validation_data))/5)
#print(fold_size)
# for j in range(k):
#     fold=[]
#     for i in range(fold_size):
#         random_index=randrange(50)
#         if len(validation_data)>0:
#             #print(validation_data[random_index][0])
#             fold.append(validation_data[random_index][0])
#             #fold.append(random.choice(validation_data))
#             #print(fold)
#             #fold.append(newData_validate.pop(random_index))
#             #print(i)
#     folds.append(fold)
#     #print(folds)

train_fold = folds[:450]
test_fold = folds[450:]



# fold_count = 0    
# for fold in folds:
#     if(fold_count == 4):
#         test_fold = fold
#     else:
#         train_fold = train_fold + fold
#     fold_count+=1
    #train_fold = folds
    #train_fold.remove(fold)
    #train_fold = sum(train_fold, [])
    #test_fold = fold
#print(train_fold)
   





In [15]:
def predicted_sentiment(review, vocabulary, label_count, lam):

    probability = []

    for label in[0,1]:
        prob = 1.0
        counter = 0
        for row in review:
            if(counter%2==0):
                sentarray = row.lower().split()
                for word in sentarray:
                    if word not in vocabulary:
                        continue
                    if lam == 0 and vocabulary[word][label]==0:
                        prob = 0
                        break
                    prob = round(prob*((vocabulary[word][label]+lam) / (label_count[label]+(lam*len(vocabulary)))),5)        
                counter+=1       
        probability.append(prob)  
    return 0 if probability[NEG]>probability[POS] else 1
   

In [16]:
l = [0,1]  

occurrence_shreshold = 5

new_train_fold = []

voca = word_vocabulary(train_fold, occurrence_shreshold)

label_count = count_sentence(train_fold)

for lam in l:
    accuracy = []
    correct = 0
    for review in test_fold:
        flag1 = predicted_sentiment(review, voca, label_count, lam)
        flag2 = int(review[1])
        if(flag1 == flag2):
            correct+=1
    accuracy.append(correct/len(test_fold))
    if(lam==0):
        print("5-folds accuracy without smoothing is : ",accuracy)
    else:
        print("\n5-folds accuracy with smoothing is : ",accuracy)

5-folds accuracy without smoothing is :  [0.5540540540540541]

5-folds accuracy with smoothing is :  [0.5945945945945946]


**8. Derive Top 10 words that predict positive and negative class**


In [27]:
def prob_sentence(data,word,senti):
    countarr=count_sentence(data)
    finalarr=[]
    count=0
    count_conditional=0
    if(senti==0):
        count=countarr[1]
        snt = 0
    else:
        count=countarr[0]
        snt = 1
    
    #print(count)
    for row in data:
        if(row[1]==snt):
            sentencearr=row[0].lower().split()
            if word.lower() in sentencearr:
                count_conditional+=1
    finalarr.append(count_conditional)
    finalarr.append(countarr[0])
    finalarr.append(countarr[1])
    if(senti==0):
        cond_prob=count_conditional/countarr[0]
    else:
        cond_prob=count_conditional/countarr[1]
    #print(cond_prob)    
    return(cond_prob)

In [28]:
vocabulary = word_vocabulary(validation_data, 5)
label_count = count_sentence(validation_data)

word_predict = {}

for row in validation_data:
  for word in row[0].split():             
    if word not in vocabulary:
      continue
    neg_prob = (prob_sentence(validation_data,word,NEG) * (label_count[NEG]/label_count[ALL]))/ P_word[word]
    pos_prob = (prob_sentence(validation_data,word,POS) * (label_count[POS]/label_count[ALL]))/ P_word[word]
    pred = NEG if neg_prob > pos_prob else POS
    if word not in word_predict:
      word_predict[word] = [0,0,0]
    word_predict[word][ALL] += 1
    if pred == int(row[1]):  
      word_predict[word][pred] += 1
            
top_neg_lst = sorted(word_predict, key = lambda x: word_predict[x][NEG]/word_predict[x][ALL], reverse=True) [:10]
top_pos_lst = sorted(word_predict, key = lambda x: word_predict[x][POS]/word_predict[x][ALL], reverse=True)[:10]
finalarr=[]
finalarr.append(top_neg_lst)
finalarr.append(top_pos_lst)

In [29]:
arr = finalarr
df_arr1 = pd.DataFrame(list(arr[0]),columns = ['Word']) 
print("Top 10 words that predict negative class are as follows: \n", arr[0])
print("\nTop 10 words that predict positive class are as follows: \n", arr[1])

Top 10 words that predict negative class are as follows: 
 ['poor', 'lines', 'wasted', "can't", 'worst', 'annoying', 'these', 'dialogue', 'stupid', 'awful']

Top 10 words that predict positive class are as follows: 
 ['interesting', 'however', 'brilliant', 'wonderful', 'liked', 'actually', 'played', 'job', 'makes', 'family']


**9. Use the optimal hyperparameters to calculate the final accuracy.**

In [30]:
def fit(data,pos_lab,neg_lab):
    predict_y=[]
    for row in data:
        probs1=[]
        probs2=[]
        for word in row[0].split():
            if(word != " "):
                t1=prob_sentence(data,word,0)
                t2=prob_sentence(data,word,1)
                probs1.append(t1)
                probs2.append(t2)
            prob1=1
            prob2=1
            for i in range(len(probs1)):
                prob1= prob1*probs1[i]
                prob2= prob2*probs2[i]
            prob1=prob1*((neg_lab)/((pos_lab)+(neg_lab)))
            prob2=prob2*((pos_lab)/((pos_lab)+(neg_lab)))
            if(prob1>prob2):
                predict_y.append(0)
            else:
                predict_y.append(1)
    return predict_y  

In [31]:
sentiarr=[]
for row in test_data_values:
    sentiarr.append(row[1])
length=len(sentiarr)

In [32]:
countarr=count_sentence(test_data_values)
y_prediction=fit(test_data_values,countarr[0],countarr[1])
y_prediction=(y_prediction[:length])

In [33]:
count=0
y_val = sentiarr
for i in range(len(y_prediction)):
   if(y_prediction[i]==int(y_val[i])):
     count=count+1
final_acc = count/len(y_val)  

In [34]:
# final_acc=acc(y_prediction,sentiarr)  
print("Final accuracy is : ",final_acc)  

Final accuracy is :  0.72
