In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from string import punctuation
from sklearn import svm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from nltk import ngrams
from itertools import chain
from wordcloud import WordCloud
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, recall_score, precision_score,precision_recall_curve
from keras.preprocessing.sequence import pad_sequences
import re
from keras.preprocessing.text import Tokenizer
import warnings
warnings.filterwarnings("ignore")
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [4]:
def decontract(text):
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

In [5]:
def preprocess_text(review):
    review = re.sub(r"http\S+", "", review)             # removing website links
    review = BeautifulSoup(review, 'lxml').get_text()   # removing html tags
    review = decontract(review)                         # decontracting
    review = re.sub("\S*\d\S*", "", review).strip()     # removing the words with numeric digits
    review = re.sub('[^A-Za-z]+', ' ', review)          # removing non-word characters
    review = review.lower()                             # converting to lower case
    #review = [word for word in review.split(" ") if not word in stop_words] # removing stop words
    review = [lemmatizer.lemmatize(token, "v") for token in review] #Lemmatization
    review = "".join(review)
    review.strip()
    return review

In [6]:
save_path = '/content/drive/My Drive/Natural Language Processing/Reviews.csv'
df = pd.read_csv(save_path)

In [15]:
df['Text'] = df['Text'].apply(lambda x:preprocess_text(x))

In [17]:
data_set = df[["Text", "Score"]]
train_set, test_set = train_test_split(data_set, random_state=0)
y_map = {0:0, 1:0, 2:0, 3:1, 4:1, 5:1}

train_text = train_set["Text"]
train_score = train_set["Score"].map(y_map)

test_text = test_set["Text"]
test_score = test_set["Score"].map(y_map)

In [18]:
train_set['Result'] = train_score
df_positive = train_set[train_set["Result"]==1]
df_negative = train_set[train_set["Result"]==0]

In [19]:
df_positive_text = df_positive["Text"]
df_negative_text = df_negative["Text"]

In [20]:
df_negative_up = df_negative_text.sample(len(df_positive_text), replace=True)
df_positive_up = df_positive_text
df_positive_down = df_positive_text.sample(len(df_negative_text))
df_negative_down = df_negative_text

In [21]:
n=8
df_negative_big = df_negative_text.sample(n*len(df_negative_text), replace = True)
df_positive_big = df_positive_text.sample(n*len(df_positive_text), replace = True)

In [22]:
X_big = df_positive_big.append(df_negative_big)
y_big = np.zeros((len(X_big),1))
l = len(df_positive_big)
y_big[0:l] = 1

In [23]:
c = CountVectorizer(stop_words = 'english')
tfidf_n = TfidfVectorizer(ngram_range=(1,2),stop_words = 'english')
def text_fit(X_train, y_train, X_test, y_test, model, clf_model, coef_show=1):
    
    X_c = model.fit_transform(X_train)
    
    print('# features: {}'.format(X_c.shape[1]))
    #X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=0)
    print('# train records: {}'.format(X_train.shape[0]))
    print('# test records: {}'.format(X_test.shape[0]))
    X_test_c = model.transform(X_test)
    clf = clf_model.fit(X_c, y_train)
    model_acc = clf.score(X_c, y_train)
    print("Model Training Accuracy: {}".format(model_acc))
    pred = clf.predict(X_test_c)
    acc = clf.score(X_test_c, y_test)
    print ('Model Test Accuracy: {}'.format(acc))
    confu_mat = confusion_matrix(y_test, pred)
    print('Confusion matrix {}'.format(confu_mat))
    f1 = f1_score(y_test, pred, pos_label=0)
    recall_score_value = recall_score(y_test, pred, pos_label=0)
    precision_score_value = precision_score(y_test, pred, pos_label=0)
    print('F1 Score: {}'.format(f1))
    print('Recall: {}'.format(recall_score_value))
    print('Precision: {}'.format(precision_score_value))
    if coef_show == 1: 
        w = model.get_feature_names()
        coef = clf.coef_.tolist()[0]
        coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
        coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
        print('')
        print('-Top 20 positive-')
        print(coeff_df.head(20).to_string(index=False))
        print('')
        print('-Top 20 negative-')        
        print(coeff_df.tail(20).to_string(index=False))
    

Training With Pre-Processing Text

In [24]:
text_fit(train_text, train_score, test_text, test_score, c, LogisticRegression())

# features: 106996
# train records: 426340
# test records: 142114
Model Training Accuracy: 0.9378829103532392
Model Test Accuracy: 0.9237935741728471
Confusion matrix [[ 13068   7528]
 [  3302 118216]]
F1 Score: 0.7070280798571661
Recall: 0.6344921343950282
Precision: 0.7982895540623091

-Top 20 positive-
         Word  Coefficient
     downside     3.134629
   pleasantly     2.971120
    addicting     2.931147
       delish     2.468444
    skeptical     2.430423
       resist     2.364124
     soothing     2.350713
     drawback     2.313859
      drained     2.313372
 unmistakable     2.218932
      easiest     2.181462
    recepient     2.181208
      sandies     2.132267
      worries     2.122300
      gobbles     2.117243
 conventional     2.092833
       hooked     2.056088
    macademia     2.026714
       tastey     2.008487
        steal     2.007786

-Top 20 negative-
        Word  Coefficient
       ruins    -2.356007
       yikes    -2.384579
   lethargic    -2.385725
  d

In [25]:
text_fit(train_text, train_score, test_text, test_score, tfidf_n, LogisticRegression())

# features: 3466838
# train records: 426340
# test records: 142114
Model Training Accuracy: 0.950985129239574
Model Test Accuracy: 0.9354954473169428
Confusion matrix [[ 13351   7245]
 [  1922 119596]]
F1 Score: 0.7444311243692325
Recall: 0.6482326665371917
Precision: 0.874157009101028

-Top 20 positive-
             Word  Coefficient
            great    18.685332
             best    16.256385
        delicious    15.101592
          perfect    13.376376
            loves    12.211773
        excellent    11.573570
             love    11.351488
             good    10.863352
             nice    10.204020
        wonderful     9.923834
         favorite     9.736541
          amazing     8.703701
 highly recommend     8.306376
          awesome     8.256209
             easy     8.184009
            happy     7.994866
           highly     7.875166
            tasty     7.869253
              bit     7.859970
          pleased     7.533680

-Top 20 negative-
           Word  Coeffic

In [26]:
#n=8
text_fit(X_big, y_big, test_text, test_score, tfidf_n, LogisticRegression(max_iter = 200))

# features: 3465946
# train records: 3410720
# test records: 142114
Model Training Accuracy: 0.9939857273537552
Model Test Accuracy: 0.9498993765568488
Confusion matrix [[ 15579   5017]
 [  2103 119415]]
F1 Score: 0.8139923715972621
Recall: 0.7564090114585357
Precision: 0.8810654903291483

-Top 20 positive-
             Word  Coefficient
            great    23.180707
             best    21.862793
        delicious    21.271095
          perfect    19.766512
        excellent    16.675352
            loves    16.513593
 highly recommend    15.004442
        wonderful    14.430089
             good    14.062650
             love    13.904756
          amazing    13.704588
          awesome    13.594946
             nice    13.243553
           hooked    13.183552
         favorite    12.583761
          pleased    11.852538
            yummy    11.605677
        fantastic    11.452121
             glad    11.313977
            thank    11.226202

-Top 20 negative-
           Word  Coef

In [None]:
#n=12
text_fit(X_big, y_big, test_text, test_score, tfidf_n, LogisticRegression(max_iter = 200))

# features: 3466838
# train records: 5116080
# test records: 142114
Model Training Accuracy: 0.9976800597332333
Model Test Accuracy: 0.9508985743839453
Confusion matrix [[ 15758   4838]
 [  2140 119378]]
F1 Score: 0.818724996103289
Recall: 0.7651000194212468
Precision: 0.8804335679964241

-Top 20 positive-
             Word  Coefficient
            great    24.965708
             best    23.142507
        delicious    22.638882
          perfect    20.195739
        excellent    17.847640
            loves    17.177562
 highly recommend    16.242735
        wonderful    15.391146
             good    14.970366
          amazing    14.424317
           hooked    14.406410
             nice    13.933900
             love    13.909130
          awesome    13.662366
         favorite    12.947531
          pleased    12.362464
       just right    12.238898
        fantastic    12.201809
            yummy    12.031814
            thank    11.753579

-Top 20 negative-
           Word  Coeff

Training without Pre-Processing Text

In [None]:
text_fit(train_text, train_score, test_text, test_score, c, LogisticRegression())

# features: 105762
# train records: 426340
# test records: 142114


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Training Accuracy: 0.9354482338040062
Model Test Accuracy: 0.9234347073476223
Confusion matrix [[ 13089   7507]
 [  3374 118144]]
F1 Score: 0.7063871124423218
Recall: 0.6355117498543407
Precision: 0.7950555791775497

-Top 20 positive-
         Word  Coefficient
     downside     3.067710
   pleasantly     2.890361
    addicting     2.584897
    skeptical     2.353791
       hooked     2.244221
       resist     2.148429
     soothing     2.132844
       delish     2.043792
     drawback     1.966748
 conventional     1.959105
      worries     1.849322
      drained     1.847494
      trainer     1.797404
       brings     1.769413
    amazingly     1.750267
      gobbles     1.746297
        steal     1.730062
      easiest     1.716821
 unmistakable     1.690465
     terrific     1.668224

-Top 20 negative-
         Word  Coefficient
     vinegary    -2.071368
          ick    -2.081415
 unacceptable    -2.105773
      defeats    -2.106337
   returnable    -2.165382
        sch

In [None]:
#n=8
text_fit(X_big, y_big, test_text, test_score, tfidf_n, LogisticRegression())

# features: 3552222
# train records: 3410720
# test records: 142114


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Training Accuracy: 0.991131491298025
Model Test Accuracy: 0.9493012651814741
Confusion matrix [[ 15700   4896]
 [  2309 119209]]
F1 Score: 0.8133661442818287
Recall: 0.76228393862886
Precision: 0.8717863290576934

-Top 20 positive-
             Word  Coefficient
             best    23.667018
            great    23.271710
        delicious    21.396004
            loves    20.210369
          perfect    18.068870
        excellent    16.304537
             nice    15.552418
             love    15.050679
           hooked    14.971249
         favorite    14.569942
             good    14.512067
 highly recommend    14.457361
        wonderful    14.113683
       just right    12.674138
          amazing    12.526315
             easy    12.461788
            happy    12.453545
            tasty    12.385956
 won disappointed    12.267151
          awesome    12.224901

-Top 20 negative-
           Word  Coefficient
          bland   -11.089369
          gross   -11.560073
     

In [None]:
#n=10
text_fit(X_big, y_big, test_text, test_score, tfidf_n, LogisticRegression())

# features: 3552816
# train records: 4263400
# test records: 142114


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Training Accuracy: 0.9889332457662898
Model Test Accuracy: 0.94921682592848
Confusion matrix [[ 15743   4853]
 [  2364 119154]]
F1 Score: 0.8135286670283958
Recall: 0.7643717226645951
Precision: 0.869442756944828

-Top 20 positive-
             Word  Coefficient
            great    25.784658
             best    24.758224
        delicious    21.221235
            loves    19.844510
          perfect    19.150658
           hooked    16.830238
             good    16.418690
 highly recommend    16.338015
        excellent    16.170962
             nice    14.899069
        wonderful    14.504732
         favorite    14.327597
          amazing    14.292588
          awesome    14.093606
 won disappointed    13.864424
       just right    13.733148
             love    13.616278
        fantastic    13.559918
             beat    13.375759
            yummy    13.168810

-Top 20 negative-
           Word  Coefficient
         ruined   -11.561902
  great reviews   -11.564077
     

In [None]:
#n=14
text_fit(X_big, y_big, test_text, test_score, tfidf_n, LogisticRegression())

# features: 3552896
# train records: 5968760
# test records: 142114


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Training Accuracy: 0.992959174099813
Model Test Accuracy: 0.9501315845025824
Confusion matrix [[ 15973   4623]
 [  2464 119054]]
F1 Score: 0.8184356826275203
Recall: 0.7755389395999223
Precision: 0.8663556977816348

-Top 20 positive-
             Word  Coefficient
            great    30.705834
             best    28.533700
        delicious    24.052770
            loves    23.517018
          perfect    21.536202
           hooked    20.651479
 highly recommend    18.749281
        excellent    18.653035
             love    18.063380
             good    17.700439
             nice    17.676976
 won disappointed    17.614909
       just right    16.834990
        wonderful    16.609821
         favorite    16.541230
          awesome    16.080359
          amazing    16.005632
             beat    15.131503
        fantastic    14.650764
              bit    14.377985

-Top 20 negative-
           Word  Coefficient
  great reviews   -13.259437
          gross   -13.325364
   

In [None]:
#n=16
text_fit(X_big, y_big, test_text, test_score, tfidf_n, LogisticRegression())

In [36]:
X_down = df_positive_down.append(df_negative_down)
X_up = df_positive_up.append(df_negative_up)
y_down = np.zeros((len(df_negative_down)+len(df_positive_down),1))
y_down[0:len(df_positive_down)] = 1
y_up = np.zeros((len(df_negative_up)+len(df_positive_up),1))
y_up[0:len(df_positive_up)] = 1

In [37]:
n=1
df_positive_up_big = df_positive_up.sample(n*len(df_positive_up), replace=True)
df_negative_up_big = df_negative_up.sample(n*len(df_negative_up), replace=True)
df_positive_down_big = df_positive_down.sample(n*len(df_positive_down), replace=True)
df_negative_down_big = df_negative_down.sample(n*len(df_negative_down), replace=True)
X_up_big = df_positive_up_big.append(df_negative_up_big)
y_up_big = np.zeros((len(df_negative_up_big)+len(df_positive_up_big),1))
y_up_big[0:len(df_positive_up_big)] = 1
X_down_big = df_positive_down_big.append(df_negative_down_big)
y_down_big = np.zeros((len(df_negative_down_big)+len(df_positive_down_big),1))
y_down_big[0:len(df_positive_down_big)] = 1

In [None]:
#n=4
text_fit(X_up_big, y_up_big, test_text, test_score, tfidf_n, LogisticRegression())

# features: 3521186
# train records: 2919192
# test records: 142114


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Training Accuracy: 0.9946516022241771
Model Test Accuracy: 0.9446711794756323
Confusion matrix [[ 17858   2738]
 [  5125 116393]]
F1 Score: 0.8195690584914752
Recall: 0.8670615653524957
Precision: 0.7770090936779359

-Top 20 positive-
             Word  Coefficient
            great    21.962996
             best    20.638290
        delicious    20.455408
          perfect    19.175356
            loves    16.938529
        excellent    16.276780
 highly recommend    15.157621
        wonderful    14.622828
           hooked    13.870444
          amazing    13.551968
          awesome    13.070841
             love    13.060740
             good    12.716394
             nice    12.480202
       just right    12.363941
          pleased    11.610963
         favorite    11.533761
        fantastic    11.528493
            yummy    11.402195
             glad    11.362790

-Top 20 negative-
           Word  Coefficient
          worse   -12.666410
           poor   -12.940237
  

In [None]:
#n=6
text_fit(X_up_big, y_up_big, test_text, test_score, tfidf_n, LogisticRegression())

# features: 3547293
# train records: 4378788
# test records: 142114


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Training Accuracy: 0.996751384172972
Model Test Accuracy: 0.9466203188989122
Confusion matrix [[ 17685   2911]
 [  4675 116843]]
F1 Score: 0.8234006890771952
Recall: 0.8586618760924452
Precision: 0.7909212880143113

-Top 20 positive-
             Word  Coefficient
            great    22.641239
             best    22.250721
        delicious    22.100629
          perfect    19.651206
            loves    18.261304
        excellent    17.663906
 highly recommend    16.936113
           hooked    15.767885
        wonderful    15.211198
          amazing    14.969446
          awesome    13.905995
       just right    13.818796
             love    13.499521
             good    13.199401
             nice    13.173813
 won disappointed    13.156027
            yummy    12.871948
        fantastic    12.489488
          pleased    12.199450
         favorite    12.094527

-Top 20 negative-
           Word  Coefficient
            rip   -13.624717
           weak   -13.902542
   

In [None]:
#n=8
text_fit(X_up_big, y_up_big, test_text, test_score, tfidf_n, LogisticRegression())

# features: 3551440
# train records: 5838384
# test records: 142114


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Training Accuracy: 0.998020685175898
Model Test Accuracy: 0.9477813586275807
Confusion matrix [[ 17589   3007]
 [  4414 117104]]
F1 Score: 0.8257940327237729
Recall: 0.8540007768498737
Precision: 0.7993909921374358

-Top 20 positive-
             Word  Coefficient
            great    23.829698
             best    22.932495
        delicious    22.303568
          perfect    21.216298
            loves    18.921283
        excellent    17.811063
 highly recommend    17.728594
           hooked    16.108067
        wonderful    15.942163
          amazing    15.563808
          awesome    14.536647
             love    14.156905
       just right    14.133120
             good    14.023967
             nice    13.886421
 won disappointed    13.821081
          pleased    12.960343
        fantastic    12.899619
            yummy    12.770920
         downside    12.581235

-Top 20 negative-
           Word  Coefficient
  unfortunately   -14.222411
           poor   -14.413819
   

In [None]:
#n=10 down
text_fit(X_down_big, y_down_big, test_text, test_score, tfidf_n, LogisticRegression())

# features: 1632872
# train records: 1228820
# test records: 142114


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Training Accuracy: 0.9967212447714067
Model Test Accuracy: 0.9108532586515051
Confusion matrix [[ 18855   1741]
 [ 10928 110590]]
F1 Score: 0.748526171619127
Recall: 0.9154690231112838
Precision: 0.6330792734110062

-Top 20 positive-
             Word  Coefficient
            great    23.284070
             best    20.504184
        delicious    20.192314
          perfect    17.985714
            loves    16.095251
        excellent    15.819308
             love    14.426159
        wonderful    13.495235
 highly recommend    12.872426
         favorite    12.801760
             good    12.597622
             nice    12.512296
          amazing    12.071739
           hooked    11.535581
          awesome    11.499417
           highly    10.794829
            thank    10.769232
       just right    10.674564
            yummy    10.664066
          pleased    10.643701

-Top 20 negative-
           Word  Coefficient
          waste   -10.108279
          nasty   -10.198762
   

In [None]:
#n=20 down
text_fit(X_down_big, y_down_big, test_text, test_score, tfidf_n, LogisticRegression())

# features: 1632982
# train records: 2457640
# test records: 142114


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Training Accuracy: 0.9995992089972494
Model Test Accuracy: 0.9119017127095149
Confusion matrix [[ 18844   1752]
 [ 10768 110750]]
F1 Score: 0.7506373486297004
Recall: 0.9149349388230724
Precision: 0.6363636363636364

-Top 20 positive-
             Word  Coefficient
            great    25.955155
             best    23.482807
        delicious    22.869613
          perfect    20.775217
            loves    18.871484
        excellent    18.040395
             love    15.703490
 highly recommend    15.662180
        wonderful    15.565371
          amazing    14.780255
         favorite    14.643784
             good    14.251769
             nice    14.190326
           hooked    14.133811
          awesome    13.928174
            yummy    12.862188
 won disappointed    12.767706
          pleased    12.695205
       just right    12.628876
            thank    12.439287

-Top 20 negative-
           Word  Coefficient
          nasty   -12.245782
           poor   -12.288928
  

In [None]:
#n=40 down
text_fit(X_down_big, y_down_big, test_text, test_score, tfidf_n, LogisticRegression())

# features: 1632982
# train records: 4915280
# test records: 142114


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Training Accuracy: 0.9999035660226885
Model Test Accuracy: 0.9122465063259074
Confusion matrix [[ 18852   1744]
 [ 10727 110791]]
F1 Score: 0.7514499252615844
Recall: 0.9153233637599534
Precision: 0.6373440616653707

-Top 20 positive-
             Word  Coefficient
            great    28.932181
             best    26.045913
        delicious    25.990652
          perfect    22.923824
            loves    20.824153
        excellent    20.635775
             love    17.588041
        wonderful    17.390358
 highly recommend    17.193421
         favorite    16.225820
          amazing    16.136321
             good    15.961379
           hooked    15.691404
             nice    15.689290
          awesome    15.012204
 won disappointed    14.375725
       just right    14.132111
            yummy    13.952019
          pleased    13.872452
            thank    13.601920

-Top 20 negative-
           Word  Coefficient
         refund   -13.433673
          worse   -13.477998
  

In [None]:
text_fit(X, y, c, DummyClassifier(),0)

# features: 119939
# train records: 426340
# test records: 142114
Model Accuracy: 0.7518752550769101




In [None]:
tfidf = TfidfVectorizer(stop_words = 'english')
text_fit(X, y, tfidf, LogisticRegression())

# features: 119939
# train records: 426340
# test records: 142114


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Accuracy: 0.9224284729161096

-Top 20 positive-
       Word  Coefficient
      great    11.940613
  delicious    10.551228
       best    10.147604
    perfect     9.368369
      loves     8.790815
  excellent     8.406053
     highly     7.841667
       love     7.580810
     hooked     6.980460
  wonderful     6.808297
       good     6.584383
       nice     6.509065
    amazing     6.418014
 pleasantly     6.317414
    awesome     6.245800
   favorite     6.183629
  fantastic     5.851168
      yummy     5.821420
       beat     5.544562
       easy     5.522370

-Top 20 negative-
           Word  Coefficient
          worse    -5.023784
        useless    -5.251663
  unfortunately    -5.315098
          nasty    -5.479692
          gross    -5.563191
         return    -5.592826
           poor    -5.662447
          waste    -5.801514
    undrinkable    -6.029512
           yuck    -6.084586
      tasteless    -6.268094
     disgusting    -6.297766
          threw    -6.514

In [27]:
from sklearn.ensemble import RandomForestClassifier
tfidf = TfidfVectorizer(stop_words = 'english')
tfidf_n = TfidfVectorizer(ngram_range=(1,2),stop_words = 'english')
text_fit(train_text, train_score, test_text, test_score, tfidf, RandomForestClassifier(), coef_show = 0)

# features: 106996
# train records: 426340
# test records: 142114
Model Training Accuracy: 0.9996669324951917
Model Test Accuracy: 0.9257497502005432
Confusion matrix [[ 10261  10335]
 [   217 121301]]
F1 Score: 0.6604235051811804
Recall: 0.49820353466692563
Precision: 0.9792899408284024


AttributeError: ignored

In [34]:
train_score.sum()/train_score.count()

0.8558873199793592

In [42]:
text_fit(X_down, y_down, test_text, test_score, tfidf, RandomForestClassifier(), coef_show = 0)

# features: 61381
# train records: 122882
# test records: 142114
Model Training Accuracy: 0.9990234533943132
Model Test Accuracy: 0.8917066580350986
Confusion matrix [[ 18246   2350]
 [ 13040 108478]]
F1 Score: 0.7033653290158436
Recall: 0.8859001747912216
Precision: 0.5832001534232564


In [44]:
text_fit(train_text, train_score, test_text, test_score, tfidf, XGBClassifier(), coef_show = 0)

# features: 106996
# train records: 426340
# test records: 142114
Model Training Accuracy: 0.874829947928883
Model Test Accuracy: 0.8740447809505045
Confusion matrix [[  3014  17582]
 [   318 121200]]
F1 Score: 0.25192243396857233
Recall: 0.14633909496989705
Precision: 0.904561824729892


In [45]:
text_fit(X_up, y_up, test_text, test_score, tfidf, XGBClassifier(), coef_show = 0)

# features: 106971
# train records: 729798
# test records: 142114
Model Training Accuracy: 0.7774466359184322
Model Test Accuracy: 0.745929324345244
Confusion matrix [[16960  3636]
 [32471 89047]]
F1 Score: 0.4843845945135447
Recall: 0.8234608661876093
Precision: 0.34310452954623616
