In [1]:
import re, nltk
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,precision_score,recall_score
from sklearn.ensemble import AdaBoostClassifier
import joblib


In [2]:

# NLTK resources shared by every normalizer() call; filled on first use.
_NORMALIZER_CACHE = {}


def _normalizer_resources():
    """Return ``(stop_words, lemmatizer)``, loading them from NLTK only once."""
    if not _NORMALIZER_CACHE:
        _NORMALIZER_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _NORMALIZER_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _NORMALIZER_CACHE['stop_words'], _NORMALIZER_CACHE['lemmatizer']


def normalizer(text):
    """Clean one raw text string into a space-joined string of lemmas.

    Processing order:
      1. every run of characters other than A-Z / a-z becomes a single space
         (this drops digits AND punctuation, not just numbers);
      2. the result is tokenized with ``nltk.word_tokenize``;
      3. tokens of two characters or fewer are dropped;
      4. tokens are lower-cased;
      5. English stop words are removed;
      6. each remaining token is lemmatized as a verb (``pos='v'``).

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example ``None`` or a float NaN)
        is treated as empty text instead of raising ``TypeError``.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; ``''`` when nothing
        survives or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    letters_only = re.sub("[^A-Za-z]+", " ", text)
    # The stop-word list and the lemmatizer do not depend on `text`, so they
    # are fetched from the shared cache rather than rebuilt for every row.
    stop_words, lemmatizer = _normalizer_resources()

    # Length filter runs on the original-case token, then the token is lowered.
    long_tokens = [tok.lower() for tok in nltk.word_tokenize(letters_only) if len(tok) > 2]
    content_tokens = [tok for tok in long_tokens if tok not in stop_words]
    return ' '.join(lemmatizer.lemmatize(tok, pos='v') for tok in content_tokens)


In [3]:
def extract_features(train_text,val_text,ngram,thresh):
    """Vectorize training and validation text with a shared TF-IDF model.

    The vectorizer is fitted on ``train_text`` only and then applied to
    ``val_text``, so the validation data never influences the vocabulary.

    Parameters
    ----------
    train_text, val_text : iterable of str
        Documents to vectorize.
    ngram : tuple
        Passed through as the vectorizer's ``ngram_range``.
    thresh : int or float
        Passed through as the vectorizer's ``min_df``.

    Returns
    -------
    tuple
        ``(train_vector, val_vector)`` as returned by ``fit_transform`` and
        ``transform`` respectively.
    """
    tfidf_options = dict(
        analyzer='word',
        max_features=1500,
        ngram_range=ngram,
        max_df=1.0,
        min_df=thresh,
    )
    tfidf = TfidfVectorizer(**tfidf_options)
    fitted_train = tfidf.fit_transform(train_text)
    return fitted_train, tfidf.transform(val_text)

In [4]:
def label_encode(labels, threshold=8):
    """Binarize numeric scores in place: below ``threshold`` -> 0, otherwise -> 1.

    Parameters
    ----------
    labels : pandas.Series or numpy.ndarray
        Numeric scores. The object is MODIFIED IN PLACE; pass a copy if the
        original values must be kept.
    threshold : number, default 8
        Scores ``>= threshold`` become 1, scores ``< threshold`` become 0.

    Returns
    -------
    The same ``labels`` object, after modification.
    """
    # Both masks are computed from the original scores before anything is
    # written. Thresholding sequentially would re-label the freshly written
    # 0s as 1 whenever threshold <= 0.
    below = labels < threshold
    at_or_above = labels >= threshold
    labels[below] = 0
    labels[at_or_above] = 1
    return labels


In [5]:
def cross_validation(df,model,ngram,thresh):
    """Score ``model`` with 5-fold cross-validation on TF-IDF text features.

    Folds come from a shuffled ``KFold`` with ``random_state=1``. For each
    fold the 'score' column is binarized with ``label_encode``, the 'extract'
    column is vectorized with ``extract_features`` (fitted on the training
    part only), and the same ``model`` instance is refitted and evaluated.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'extract' (text) and a 'score' (numeric) column.
    model : estimator
        Object exposing ``fit`` and ``predict``.
    ngram, thresh
        Forwarded to ``extract_features``.

    Returns
    -------
    pandas.DataFrame
        One row per fold plus a final 'Average' row, with columns
        'Accuracy', 'Precision', 'Recall' and 'F1-score' (the last three
        macro-averaged).
    """
    metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1-score']
    fold_scores = []
    splitter = KFold(n_splits=5, shuffle=True, random_state=1)

    for fit_rows, held_out_rows in splitter.split(df):
        fit_part = df.iloc[fit_rows, :]
        held_out = df.iloc[held_out_rows, :]

        # Copies keep label_encode's in-place writes away from `df`.
        fit_text = fit_part['extract'].copy()
        fit_labels = label_encode(fit_part['score'].copy())
        held_text = held_out['extract'].copy()
        held_labels = label_encode(held_out['score'].copy())
        fit_matrix, held_matrix = extract_features(fit_text, held_text, ngram, thresh)

        model.fit(fit_matrix, fit_labels)
        predicted = model.predict(held_matrix).astype('int')

        fold_accuracy = accuracy_score(held_labels, predicted)
        fold_f1 = f1_score(held_labels, predicted, average='macro')
        fold_precision = precision_score(held_labels, predicted, average='macro')
        fold_recall = recall_score(held_labels, predicted, average='macro')
        fold_scores.append((fold_accuracy, fold_precision, fold_recall, fold_f1))

    result = pd.DataFrame(data=fold_scores, columns=metric_columns)
    result.loc['Average'] = result.mean()
    return result

In [6]:
def choose_classifier(df):
    """Prompt for a classifier and cross-validate it under three feature settings.

    The user types '0' (Naive Bayes), '1' (linear SVM) or '2' (AdaBoost);
    surrounding whitespace is ignored. The chosen model is then evaluated
    with ``cross_validation`` for these (ngram range, min_df threshold)
    pairs, in order: ((1,1), 10), ((1,2), 20), ((1,3), 30).

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed unchanged to ``cross_validation``.

    Returns
    -------
    tuple
        Three result tables, one per setting, in the order listed above.

    Raises
    ------
    ValueError
        If the response is not '0', '1' or '2'. Without this, an invalid
        response would fall through to the return statement and fail with
        an UnboundLocalError on the never-assigned results.
    """
    response=input('Press 0 for Naive Bayers \n 1 for SVM and 2 for AdaBoost \n').strip()
    if response=='0':
        label,model='Naive Bayes',MultinomialNB()
    elif response=='1':
        label,model='Support vector machine',LinearSVC()
    elif response=='2':
        label,model='Adaboost classifier',AdaBoostClassifier()
    else:
        raise ValueError(
            "wrong button pressed: expected '0', '1' or '2', got {!r}".format(response)
        )
    print(label)

    # (ngram range, min_df threshold) evaluated for every classifier.
    settings=(((1,1),10),((1,2),20),((1,3),30))
    return tuple(
        cross_validation(df,model,ngram=ngram,thresh=thresh)
        for ngram,thresh in settings
    )

In [8]:
def save_model(model,ngram,thresh,vec,clf,data_path='user_reviews.csv'):
    """Fit a TF-IDF vectorizer and ``model`` on the full dataset and persist both.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``; it is fitted in place and then dumped.
    ngram : tuple
        Used as the vectorizer's ``ngram_range``.
    thresh : int or float
        Used as the vectorizer's ``min_df``.
    vec : str or path-like
        Destination for the fitted vectorizer (written with ``joblib.dump``).
    clf : str or path-like
        Destination for the fitted model (written with ``joblib.dump``).
    data_path : str or path-like, default 'user_reviews.csv'
        CSV file with an 'extract' (text) and a 'score' (numeric) column.

    Returns
    -------
    None
    """
    df=pd.read_csv(data_path)
    # Reuse the shared binarization helper on a copy, so the labelling rule
    # lives in one place and the DataFrame column is not written through.
    labels=label_encode(df['score'].copy())
    text=df['extract'].apply(normalizer)
    vectorizer=TfidfVectorizer(analyzer='word',max_features=1500,ngram_range=ngram,max_df=1.0,min_df=thresh)
    data=vectorizer.fit_transform(text)
    # The vectorizer is saved before the model is fitted, as in the original flow.
    joblib.dump(vectorizer,vec)
    model.fit(data,labels)
    joblib.dump(model,clf)
#save_model(LinearSVC(),(1,1),10,'tfidf.pkl','svm.pkl')


In [9]:
def main():
    """Load the reviews, normalize their text and run the interactive evaluation.

    Reads 'user_reviews.csv', replaces the 'extract' column with its
    ``normalizer`` output, and hands the frame to ``choose_classifier``.

    Returns
    -------
    tuple
        The three result tables from ``choose_classifier``, each rounded to
        two decimals.
    """
    reviews = pd.read_csv('user_reviews.csv')
    reviews['extract'] = reviews['extract'].apply(normalizer)
    first, second, third = choose_classifier(reviews)
    rounded = [table.round(2) for table in (first, second, third)]
    return tuple(rounded)

In [10]:

# Run the experiment. main() prompts for a classifier on stdin and returns three
# rounded cross-validation tables, for ngram ranges (1,1), (1,2) and (1,3) in that order.
out1,out2,out3=main()


Press 0 for Naive Bayers 
 1 for SVM and 2 for AdaBoost 
0
Naive Bayes


In [11]:
# out1 is the first table returned by main(), i.e. the ngram=(1,1) run,
# so it holds the unigram results.
print('unigram')
print(out1)

bigram
         Accuracy  Precision  Recall  F1-score
0            0.85       0.84    0.73      0.76
1            0.85       0.84    0.73      0.76
2            0.85       0.85    0.73      0.76
3            0.85       0.85    0.73      0.76
4            0.85       0.84    0.73      0.76
Average      0.85       0.85    0.73      0.76


In [13]:
# Second table returned by main(): the ngram=(1,2) run.
print('bigram', out2, sep='\n')

bigram
         Accuracy  Precision  Recall  F1-score
0            0.85       0.84    0.74      0.77
1            0.86       0.84    0.75      0.78
2            0.86       0.84    0.75      0.78
3            0.85       0.84    0.74      0.78
4            0.85       0.84    0.75      0.78
Average      0.85       0.84    0.75      0.78


In [15]:
# Third table returned by main(): the ngram=(1,3) run.
print('trigram', out3, sep='\n')

trigram
         Accuracy  Precision  Recall  F1-score
0            0.85       0.84    0.74      0.77
1            0.86       0.84    0.75      0.78
2            0.86       0.84    0.75      0.78
3            0.85       0.84    0.75      0.78
4            0.85       0.84    0.75      0.78
Average      0.85       0.84    0.75      0.78
