In [1]:
import re, nltk
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,precision_score,recall_score
from sklearn.ensemble import AdaBoostClassifier
import joblib

In [2]:
def normalizer(text): #### Cleaning Tweets
 re2 = re.sub("[^A-Za-z]+"," ", text) # removing numbers
 tokens = nltk.word_tokenize(re2)
 removed_letters = [word for word in tokens if len(word)>2] # removing words
 lower_case = [l.lower() for l in removed_letters]
 stop_words = set(stopwords.words('english'))
 filtered_result = list(filter(lambda l: l not in stop_words, lower_case))
 wordnet_lemmatizer = WordNetLemmatizer()
 lemmas = ' '.join([wordnet_lemmatizer.lemmatize(t, pos='v') for t in filtered_result])
 return lemmas

In [3]:
def extract_features(train_text,val_text,ngram,thresh):
    #feature extraction using tf-0idf
    vectorizer=TfidfVectorizer(analyzer='word',max_features=1500,ngram_range=(ngram),max_df=1.0,min_df=thresh)  
    train_vector=vectorizer.fit_transform(train_text) 
    val_vector=vectorizer.transform(val_text)
    return train_vector,val_vector

In [4]:
def label_encode(labels):
    #convert labels to binary 
    labels[labels<8]=0
    labels[labels>=8]=1
    return labels

In [5]:
def cross_validation(df,model,ngram,thresh):
    #apply k fold cross validation
    accuracy,f1,precision,recall=[],[],[],[]
    kf = KFold(n_splits=5, shuffle=True, random_state=1)  #k fold cross validatoon
    for train_idx,val_idx in kf.split(df):
        train=df.iloc[train_idx,:] #slice training data
        val=df.iloc[val_idx,:] #slice validation data
        
        train_text,train_labels=train['extract'].copy(),label_encode(train['score'].copy()) #slice features and labels seprate for training data
        val_text,val_labels=val['extract'].copy(),label_encode(val['score'].copy())#slice features and labels seprate for validation data
        train_vector,val_vector=extract_features(train_text,val_text,ngram,thresh)#feature extraction
        #fit the model
        model.fit(train_vector,train_labels)
        y_pred=model.predict(val_vector)
        y_pred=y_pred.astype('int') #convert float to integers
        accuracy.append(accuracy_score(val_labels,y_pred)) #accuracy
        f1.append(f1_score(val_labels,y_pred,average='macro')) #f1 score
        precision.append(precision_score(val_labels,y_pred,average='macro')) #precison
        recall.append(recall_score(val_labels,y_pred,average='macro')) #recall
    
    #save model resuls in a dataframe
    result=pd.DataFrame(data=list(zip(accuracy,precision,recall,f1)),columns=['Accuracy','Precision','Recall','F1-score'])
    result.loc['Average'] = result.mean() #calculate average of 5 fold cross validation
    return result

In [6]:
def save_model(df,model,ngram,thresh):
    #save the model
    labels=df['score']
    labels[labels<8]=0 
    labels[labels>=8]=1
    #apply tf idf
    vectorizer=TfidfVectorizer(analyzer='word',max_features=1500,ngram_range=(ngram),max_df=1.0,min_df=thresh)  
    data=vectorizer.fit_transform(df['extract'])
    #save model
    joblib.dump(vectorizer,'vec.pkl')
    model.fit(data,labels)
    joblib.dump(model,'clf.pkl')    


In [7]:
def main():
    df=pd.read_csv('user_reviews.csv') #read file
    df['extract']=df['extract'].apply(normalizer) #apply preporcessing
    model1=MultinomialNB() #naive bayes moels
    res1=cross_validation(df,model1,ngram=(1,1),thresh=10) # naive bayes unigram
    res2=cross_validation(df,model1,ngram=(1,2),thresh=20) # naive bayes bigram 
    res3=cross_validation(df,model1,ngram=(1,3),thresh=30) # naive bayes trigram
    
    #run svm model
    model2=LinearSVC()
    res12=cross_validation(df,model2,ngram=(1,1),thresh=10) #svm unigram
    res22=cross_validation(df,model2,ngram=(1,2),thresh=20) #svm bigram
    res32=cross_validation(df,model2,ngram=(1,3),thresh=30) #svm trigram
    
    #run adda boost model
    model3=AdaBoostClassifier()
    res13=cross_validation(df,model3,ngram=(1,1),thresh=10) #adaboost unigram
    res23=cross_validation(df,model3,ngram=(1,2),thresh=20) #adaboost bigram
    res33=cross_validation(df,model3,ngram=(1,3),thresh=30) # #adaboost trigram
    
    #extract average f1 score of all models
    out1f=res1.loc['Average']['F1-score']
    out2f=res2.loc['Average']['F1-score']
    out3f=res3.loc['Average']['F1-score']
    
    out4f=res12.loc['Average']['F1-score']
    out5f=res22.loc['Average']['F1-score']
    out6f=res32.loc['Average']['F1-score']
    
    out7f=res13.loc['Average']['F1-score']
    out8f=res23.loc['Average']['F1-score']
    out9f=res33.loc['Average']['F1-score']
    
    #create a list containing score of all models
    lis=[out1f,out2f,out3f,out4f,out5f,out6f,out7f,out8f,out9f]
    
    #save the model which has the best score
    if np.argmax(lis)==0:
        save_model(df,model1,(1,1),10)
        
    elif np.argmax(lis)==1:
        save_model(df,model1,(1,2),20)
   
    elif np.argmax(lis)==2:
        save_model(df,model1,(1,3),30)
        
    elif np.argmax(lis)==3:
        save_model(df,model2,(1,1),10)
        
    elif np.argmax(lis)==4:
        save_model(df,model2,(1,2),20)
        
    elif np.argmax(lis)==5:
       save_model(df,model2,(1,1),30)
  
    elif np.argmax(lis)==6:
        save_model(df,model3,(1,1),10)
 
    elif np.argmax(lis)==7:
        save_model(df,model3,(1,2),20)

    elif np.argmax(lis)==8:
        save_model(df,model3,(1,3),30)

    #add all results to dataframe
    #result=pd.concat([res1,res2,res3,res12,res22,res32,res13,res23,res33],axis=1)
    
    return res1,res2,res3,res12,res22,res32,res13,res23,res33


res1,res2,res3,res12,res22,res32,res13,res23,res33=main()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [9]:
print('Unigram Naive Bayes')
print(res1)


Unigram Naive Bayes
         Accuracy  Precision    Recall  F1-score
0         0.84755   0.842577  0.726429  0.759234
1         0.84995   0.843868  0.730195  0.763070
2         0.84895   0.845446  0.727272  0.760574
3         0.85075   0.849483  0.729948  0.763700
4         0.85065   0.844468  0.731306  0.764220
Average   0.84957   0.845168  0.729030  0.762160


In [10]:
print('bigram Naive Bayes')
print(res2)

bigram Naive Bayes
         Accuracy  Precision    Recall  F1-score
0         0.85230   0.838907  0.742696  0.773103
1         0.85655   0.844024  0.748825  0.779532
2         0.85600   0.843369  0.748693  0.779227
3         0.85435   0.843012  0.744741  0.775743
4         0.85410   0.839456  0.745164  0.775460
Average   0.85466   0.841754  0.746024  0.776613


In [11]:
print('trigram Naive Bayes')
print(res3)

trigram Naive Bayes
         Accuracy  Precision    Recall  F1-score
0         0.85185   0.837485  0.742597  0.772721
1         0.85625   0.842837  0.748963  0.779391
2         0.85560   0.841850  0.748830  0.779013
3         0.85480   0.842283  0.746649  0.777226
4         0.85445   0.838989  0.746614  0.776588
Average   0.85459   0.840689  0.746731  0.776988


In [12]:
print('Unigram SYM')
print(res12)

Unigram SYM
         Accuracy  Precision    Recall  F1-score
0         0.87085   0.841994  0.798171  0.816324
1         0.87125   0.839412  0.801595  0.817682
2         0.87160   0.841821  0.800124  0.817574
3         0.87395   0.845923  0.803020  0.820926
4         0.87050   0.839208  0.798741  0.815754
Average   0.87163   0.841672  0.800330  0.817652


In [13]:
print('bigram SYM')
print(res22)

bigram SYM
         Accuracy  Precision    Recall  F1-score
0         0.86955   0.839458  0.797238  0.814819
1         0.87220   0.841392  0.802024  0.818678
2         0.87365   0.844138  0.804041  0.820970
3         0.87280   0.843500  0.802455  0.819696
4         0.87100   0.838455  0.801778  0.817454
Average   0.87184   0.841389  0.801507  0.818323


In [14]:
print('trigram SYM')
print(res32)

trigram SYM
         Accuracy  Precision    Recall  F1-score
0         0.86950   0.839449  0.797071  0.814705
1         0.87130   0.840471  0.800143  0.817119
2         0.87390   0.844585  0.804275  0.821283
3         0.87240   0.842938  0.801854  0.819102
4         0.87075   0.838142  0.801341  0.817058
Average   0.87157   0.841117  0.800937  0.817853


In [15]:
print('Unigram Adaboost')
print(res13)


Unigram Adaboost
         Accuracy  Precision    Recall  F1-score
0         0.81910   0.784969  0.688623  0.714439
1         0.82255   0.788960  0.693547  0.719859
2         0.82300   0.788528  0.696966  0.722918
3         0.82595   0.796644  0.699227  0.726323
4         0.81940   0.778662  0.693558  0.718182
Average   0.82200   0.787553  0.694384  0.720344


In [16]:
print('bigram Adaboost')
print(res23)

bigram Adaboost
         Accuracy  Precision    Recall  F1-score
0         0.81870   0.783069  0.689292  0.714788
1         0.83000   0.791493  0.717603  0.741737
2         0.82545   0.791772  0.701956  0.728055
3         0.82570   0.796581  0.698458  0.725591
4         0.82250   0.785773  0.696362  0.721978
Average   0.82447   0.789738  0.700734  0.726430


In [17]:
print('trigram Adaboost')
print(res33)

trigram Adaboost
         Accuracy  Precision    Recall  F1-score
0         0.82055   0.785579  0.693065  0.718738
1         0.82500   0.790761  0.700171  0.726306
2         0.82425   0.791563  0.697999  0.724381
3         0.82800   0.798480  0.704211  0.731250
4         0.82270   0.783784  0.699538  0.724514
Average   0.82410   0.790033  0.698997  0.725038
