In [1]:
import ast

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [2]:
# Load the cleaned dataset.
# The "words" column holds Python literals serialized as text (presumably
# token lists -- confirm against the cleaning step). ast.literal_eval parses
# literals only, so a tampered CSV cannot execute arbitrary code the way the
# built-in eval would.

data = pd.read_csv("clean_data.csv", converters={"words": ast.literal_eval})

In [3]:
# Build a new DataFrame that keeps only the rows without any NaN value.

df1 = data.dropna(axis=0, how="any").copy()

Learning Model

In [4]:
# Inputs come from the 'text' column, targets from the 'sentiment' column.
x, y = df1['text'], df1['sentiment']

# random_state acts as a seed, so re-running this cell yields the same split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=26105112,
)

In [5]:
# Learn the unigram + bigram TF-IDF vocabulary from the training split only.
vc = TfidfVectorizer(ngram_range=(1,2))
vc.fit(x_train)
# vocabulary_ maps every learned term to its feature index, so its size is
# the feature count. It replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
print('No. of feature_words: ', len(vc.vocabulary_))

No. of feature_words:  3788


In [6]:
# Replace both text splits with their vectorized form, using the vectorizer
# fitted above.
# NOTE: the names are rebound, so this cell is not re-runnable on its own;
# re-run the split cell first if it has to be executed again.
x_train, x_test = vc.transform(x_train), vc.transform(x_test)

In [7]:
def result(model):
    """Report how a fitted classifier performs on the held-out test split.

    Reads the notebook-level ``x_test`` and ``y_test``. Prints the accuracy as
    a percentage with two decimals, prints the classification report, and
    displays the confusion matrix as a labelled DataFrame.

    Parameters
    ----------
    model : object
        A fitted estimator exposing ``predict``.

    Returns
    -------
    None
    """
    predictions = model.predict(x_test)

    accuracy = round(accuracy_score(y_test, predictions), 4)
    print(f"Accuracy = {accuracy * 100:.2f}%")

    print(classification_report(y_test, predictions))

    # The hard-coded labels assume a binary problem whose negative class
    # occupies row/column 0 of the confusion matrix.
    table = pd.DataFrame(confusion_matrix(y_test, predictions))
    table.columns = ['Predicted Negative', 'Predicted Positive']
    table = table.rename(index={0: 'Actual Negative', 1: 'Actual Positive'})
    display(table)

# Multinomial Naive Bayes

In [8]:
# Fit a multinomial Naive Bayes classifier on the training matrix, then
# report its metrics on the test split.
Mnb = MultinomialNB().fit(x_train, y_train)
result(Mnb)

Accuracy = 79.44%
              precision    recall  f1-score   support

          -1       0.92      0.13      0.23        92
           1       0.79      1.00      0.88       302

    accuracy                           0.79       394
   macro avg       0.86      0.56      0.55       394
weighted avg       0.82      0.79      0.73       394



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,12,80
Actual Positive,1,301


# Support Vector Machine

In [9]:
# Fit a linear support vector classifier on the training matrix, then report
# its metrics on the test split.
# LinearSVC's solver draws random numbers while fitting, so without a fixed
# random_state two runs can give slightly different models. The seed matches
# the one used for the train/test split.
SVCmodel = LinearSVC(random_state=26105112)
SVCmodel.fit(x_train, y_train)
result(SVCmodel)

Accuracy = 83.76%
              precision    recall  f1-score   support

          -1       0.82      0.39      0.53        92
           1       0.84      0.97      0.90       302

    accuracy                           0.84       394
   macro avg       0.83      0.68      0.72       394
weighted avg       0.83      0.84      0.81       394



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,36,56
Actual Positive,8,294


# Logistic Regression

In [10]:
# Fit a logistic regression classifier on the training matrix, then report
# its metrics on the test split.
LRmodel = LogisticRegression(C=2, max_iter=1000, n_jobs=-1).fit(x_train, y_train)
result(LRmodel)

Accuracy = 80.96%
              precision    recall  f1-score   support

          -1       0.87      0.22      0.35        92
           1       0.81      0.99      0.89       302

    accuracy                           0.81       394
   macro avg       0.84      0.60      0.62       394
weighted avg       0.82      0.81      0.76       394



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20,72
Actual Positive,3,299
