In [1]:
import bz2
import numpy as np

In [2]:
def labels_text(x):
  label=[]
  text=[]
  for line in bz2.BZ2File(x):
    decode = line.decode("utf-8")
    label.append(int(decode[9]) - 1)
    text.append(decode[10:].strip())
  return np.array(label),text

train_label, train_text = labels_text('train.ft.txt.bz2')
test_label, test_text = labels_text('test.ft.txt.bz2')

In [3]:
from sklearn.utils import shuffle
train_text, train_label = shuffle(train_text, train_label, random_state=42)
test_text, test_label = shuffle(test_text, test_label, random_state=42)

In [4]:
train_text=train_text[0:2000]
train_label=train_label[0:2000]

In [5]:
test_text=test_text[0:500]
test_label=test_label[0:500]

In [6]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def clean(any_text):
    corpus=[]
    for i in range(len(any_text)):
        review = re.sub('".*?"', '', any_text[i]) #removing any word within quotation marks
        review = re.sub('[^a-zA-Z]', ' ', review) #keeping only letters and removing anything else
        review = review.lower()
        review = review.split()
        lem = WordNetLemmatizer()
        all_stopwords = stopwords.words('english')
        all_stopwords.remove('not')
        review = [lem.lemmatize(word) for word in review if not word in set(all_stopwords)]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Manav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Manav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
train_text = clean(train_text)
test_text = clean(test_text)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(lowercase=False)
X = tfidf.fit_transform(train_text).toarray()
y = train_label

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [10]:
X_test = tfidf.transform(test_text).toarray()
y_test = test_label

LOGISTIC REGRESSION

In [11]:
from sklearn.linear_model import LogisticRegression
classifier_logistic = LogisticRegression(C=2.75,random_state = 0)  
classifier_logistic.fit(X_train, y_train)

LogisticRegression(C=2.75, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
y_pred_logistic = classifier_logistic.predict(X_val)

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_val, y_pred_logistic)
print(cm)
acc = accuracy_score(y_val, y_pred_logistic)
print(acc)

[[209  46]
 [ 42 203]]
0.824


In [14]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier_logistic, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 83.40 %
Standard Deviation: 2.18 %


In [15]:
'''from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV
parameters = [{'penalty':['l2'],'C': [0.95,1,1.05]}]
grid_search = GridSearchCV(estimator = classifier_logistic,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 3,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy.mean()*100))
print("Best Parameters:", best_parameters)'''

'from sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import GridSearchCV\nparameters = [{\'penalty\':[\'l2\'],\'C\': [0.95,1,1.05]}]\ngrid_search = GridSearchCV(estimator = classifier_logistic,\n                           param_grid = parameters,\n                           scoring = \'accuracy\',\n                           cv = 3,\n                           n_jobs=-1)\ngrid_search.fit(X_train, y_train)\nbest_accuracy = grid_search.best_score_\nbest_parameters = grid_search.best_params_\nprint("Best Accuracy: {:.2f} %".format(best_accuracy.mean()*100))\nprint("Best Parameters:", best_parameters)'

In [16]:
y_pred_test = classifier_logistic.predict(X_test)

In [17]:
cm_final = confusion_matrix(y_test, y_pred_test)
print(cm_final)
acc_final = accuracy_score(y_test, y_pred_test)
print(acc_final)

[[213  34]
 [ 46 207]]
0.84


SVM

In [18]:
from sklearn.svm import SVC
classifier_svm = SVC(C=0.75,kernel = 'linear', gamma=0.009, random_state = 0) #83.2,13
classifier_svm.fit(X_train, y_train)

SVC(C=0.75, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.009, kernel='linear',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [19]:
y_pred_svm = classifier_svm.predict(X_val)

In [20]:
cm_svm = confusion_matrix(y_val, y_pred_svm)
print(cm_svm)
acc_svm = accuracy_score(y_val, y_pred_svm)
print(acc_svm)

[[205  50]
 [ 37 208]]
0.826


In [21]:
accuracies = cross_val_score(estimator = classifier_svm, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))#8307
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 82.53 %
Standard Deviation: 2.78 %


In [22]:
'''parameters = [{'C': [0.2,0.25,0.3,0.35,0.4], 'kernel': ['linear'], 'gamma': [0.009]}]
grid_search = GridSearchCV(estimator = classifier_svm,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 3,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy.mean()*100))
print("Best Parameters:", best_parameters)'''

'parameters = [{\'C\': [0.2,0.25,0.3,0.35,0.4], \'kernel\': [\'linear\'], \'gamma\': [0.009]}]\ngrid_search = GridSearchCV(estimator = classifier_svm,\n                           param_grid = parameters,\n                           scoring = \'accuracy\',\n                           cv = 3,\n                           n_jobs=-1)\ngrid_search.fit(X_train, y_train)\nbest_accuracy = grid_search.best_score_\nbest_parameters = grid_search.best_params_\nprint("Best Accuracy: {:.2f} %".format(best_accuracy.mean()*100))\nprint("Best Parameters:", best_parameters)'

In [23]:
y_pred_test = classifier_svm.predict(X_test)

In [24]:
cm_final = confusion_matrix(y_test, y_pred_test)
print(cm_final)
acc_final = accuracy_score(y_test, y_pred_test)
print(acc_final)

[[207  40]
 [ 44 209]]
0.832


KERNEL SVM

In [25]:
from sklearn.svm import SVC
classifier_kernel = SVC(C=1.75,kernel = 'rbf', random_state = 0, gamma=0.2) #83.6
classifier_kernel.fit(X_train, y_train)

SVC(C=1.75, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.2, kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [26]:
y_pred_kernel = classifier_kernel.predict(X_val)

In [27]:
cm_kernel = confusion_matrix(y_val, y_pred_kernel)
print(cm_kernel)
acc_kernel = accuracy_score(y_val, y_pred_kernel)
print(acc_kernel)

[[203  52]
 [ 37 208]]
0.822


In [28]:
accuracies = cross_val_score(estimator = classifier_kernel, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 82.60 %
Standard Deviation: 2.37 %


In [29]:
'''parameters = [{'C': [1.15,1.2,1.25], 'kernel': ['rbf'], 'gamma': [0.1,0.2,0.3]}]
grid_search = GridSearchCV(estimator = classifier_kernel,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 3,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy.mean()*100))
print("Best Parameters:", best_parameters)'''

'parameters = [{\'C\': [1.15,1.2,1.25], \'kernel\': [\'rbf\'], \'gamma\': [0.1,0.2,0.3]}]\ngrid_search = GridSearchCV(estimator = classifier_kernel,\n                           param_grid = parameters,\n                           scoring = \'accuracy\',\n                           cv = 3,\n                           n_jobs=-1)\ngrid_search.fit(X_train, y_train)\nbest_accuracy = grid_search.best_score_\nbest_parameters = grid_search.best_params_\nprint("Best Accuracy: {:.2f} %".format(best_accuracy.mean()*100))\nprint("Best Parameters:", best_parameters)'

In [30]:
y_pred_test = classifier_kernel.predict(X_test)

In [31]:
cm_final = confusion_matrix(y_test, y_pred_test)
print(cm_final)
acc_final = accuracy_score(y_test, y_pred_test)
print(acc_final)

[[208  39]
 [ 44 209]]
0.834


KNN

In [32]:
from sklearn.neighbors import KNeighborsClassifier
classifier_knn = KNeighborsClassifier(n_neighbors = 190, weights='distance',leaf_size=12,metric = 'minkowski', p = 2)
classifier_knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=12, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=190, p=2,
                     weights='distance')

In [33]:
y_pred_knn = classifier_knn.predict(X_val)

In [34]:
cm_knn = confusion_matrix(y_val, y_pred_knn)
print(cm_knn)
acc_knn = accuracy_score(y_val, y_pred_knn)
print(acc_knn)

[[174  81]
 [ 33 212]]
0.772


In [35]:
accuracies = cross_val_score(estimator = classifier_knn, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 81.13 %
Standard Deviation: 2.98 %


In [36]:
'''parameters = [{'n_neighbors': [175,200,225,250], 'weights': ['uniform','distance'], 'leaf_size': [7,12,15,17]}]
grid_search = GridSearchCV(estimator = classifier_knn,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 3,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy.mean()*100))
print("Best Parameters:", best_parameters)'''
#Best Accuracy: 80.80 %
#Best Parameters: {'leaf_size': 25, 'n_neighbors': 225, 'weights': 'uniform'}
#Best Accuracy: 81.07 %
#Best Parameters: {'leaf_size': 15, 'n_neighbors': 200, 'weights': 'distance'}

'parameters = [{\'n_neighbors\': [175,200,225,250], \'weights\': [\'uniform\',\'distance\'], \'leaf_size\': [7,12,15,17]}]\ngrid_search = GridSearchCV(estimator = classifier_knn,\n                           param_grid = parameters,\n                           scoring = \'accuracy\',\n                           cv = 3,\n                           n_jobs=-1)\ngrid_search.fit(X_train, y_train)\nbest_accuracy = grid_search.best_score_\nbest_parameters = grid_search.best_params_\nprint("Best Accuracy: {:.2f} %".format(best_accuracy.mean()*100))\nprint("Best Parameters:", best_parameters)'

In [37]:
y_pred_test = classifier_knn.predict(X_test)

In [38]:
cm_final = confusion_matrix(y_test, y_pred_test)
print(cm_final)
acc_final = accuracy_score(y_test, y_pred_test)
print(acc_final)

[[181  66]
 [ 38 215]]
0.792


NAIVE BAYES

In [39]:
from sklearn.naive_bayes import GaussianNB
classifier_nb = GaussianNB()
classifier_nb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [40]:
y_pred_nb = classifier_nb.predict(X_val)

In [41]:
cm_nb = confusion_matrix(y_val, y_pred_nb)
print(cm_nb)
acc_nb = accuracy_score(y_val, y_pred_nb)
print(acc_nb)

[[148 107]
 [111 134]]
0.564


In [42]:
accuracies = cross_val_score(estimator = classifier_nb, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 60.33 %
Standard Deviation: 4.08 %


DECISION TREES

In [43]:
from sklearn.tree import DecisionTreeClassifier
classifier_dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [44]:
y_pred_dt = classifier_dt.predict(X_val)

In [45]:
cm_dt = confusion_matrix(y_val, y_pred_dt)
print(cm_dt)
acc_dt = accuracy_score(y_val, y_pred_dt)
print(acc_dt)

[[176  79]
 [ 58 187]]
0.726


In [46]:
accuracies = cross_val_score(estimator = classifier_dt, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 70.47 %
Standard Deviation: 3.85 %


RANDOM FOREST CLASSIFIER

In [47]:
from sklearn.ensemble import RandomForestClassifier
classifier_rf = RandomForestClassifier(max_depth=7, max_features= 'auto', min_samples_split=7, n_estimators= 800)
classifier_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=7,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [48]:
y_pred_rf = classifier_rf.predict(X_val)

In [49]:
cm_rf = confusion_matrix(y_val, y_pred_rf) 
print(cm_rf)
acc_rf = accuracy_score(y_val, y_pred_rf)
print(acc_rf)

[[194  61]
 [ 21 224]]
0.836


In [50]:
accuracies = cross_val_score(estimator = classifier_rf, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 81.20 %
Standard Deviation: 2.99 %


In [51]:
'''parameters = [{'n_estimators': [800,825], 'max_features': ['auto'], 
               'max_depth': [8,9], 'min_samples_split':[8,9], 
               'min_samples_leaf':[0.5,1],'bootstrap':[True,False]}]
grid_search = GridSearchCV(estimator = classifier_rf,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 3,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy.mean()*100))
print("Best Parameters:", best_parameters)'''
#Best Accuracy: 81.67 %
#Best Parameters: {'bootstrap': False, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 800}

'parameters = [{\'n_estimators\': [800,825], \'max_features\': [\'auto\'], \n               \'max_depth\': [8,9], \'min_samples_split\':[8,9], \n               \'min_samples_leaf\':[0.5,1],\'bootstrap\':[True,False]}]\ngrid_search = GridSearchCV(estimator = classifier_rf,\n                           param_grid = parameters,\n                           scoring = \'accuracy\',\n                           cv = 3,\n                           n_jobs=-1)\ngrid_search.fit(X_train, y_train)\nbest_accuracy = grid_search.best_score_\nbest_parameters = grid_search.best_params_\nprint("Best Accuracy: {:.2f} %".format(best_accuracy.mean()*100))\nprint("Best Parameters:", best_parameters)'

In [52]:
y_pred_test = classifier_rf.predict(X_test)

In [53]:
cm_final = confusion_matrix(y_test, y_pred_test)
print(cm_final)
acc_final = accuracy_score(y_test, y_pred_test)
print(acc_final)

[[187  60]
 [ 31 222]]
0.818


XG BOOST

In [54]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [55]:
from xgboost import XGBClassifier
classifier_xgb = XGBClassifier()
classifier_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [56]:
y_pred_xgb = classifier_xgb.predict(X_val)

In [57]:
cm_xgb = confusion_matrix(y_val, y_pred_xgb)
print(cm_xgb)
acc_xgb = accuracy_score(y_val, y_pred_xgb)
print(acc_xgb)

[[211  44]
 [ 44 201]]
0.824


In [58]:
accuracies = cross_val_score(estimator = classifier_xgb, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 79.73 %
Standard Deviation: 3.49 %


In [59]:
y_pred_test = classifier_xgb.predict(X_test)

In [60]:
cm_final = confusion_matrix(y_test, y_pred_test)
print(cm_final)
acc_final = accuracy_score(y_test, y_pred_test)
print(acc_final)

[[201  46]
 [ 49 204]]
0.81


CATBOOST

In [61]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [None]:
from catboost import CatBoostClassifier
classifier_ctb = CatBoostClassifier()
classifier_ctb.fit(X_train, y_train)

In [63]:
y_pred_ctb = classifier_ctb.predict(X_val)

In [64]:
cm_ctb = confusion_matrix(y_val, y_pred_ctb)
print(cm_ctb)
acc_ctb = accuracy_score(y_val, y_pred_ctb)
print(acc_ctb)

[[211  44]
 [ 36 209]]
0.84


In [None]:
accuracies = cross_val_score(estimator = classifier_ctb, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

#Accuracy: 80.40 %
#Standard Deviation: 4.08 %

In [66]:
y_pred_test = classifier_ctb.predict(X_test)

In [67]:
cm_final = confusion_matrix(y_test, y_pred_test)
print(cm_final)
acc_final = accuracy_score(y_test, y_pred_test)
print(acc_final)

[[200  47]
 [ 49 204]]
0.808


# PREDICTIONS (LOGISTIC REGRESION) #BEST ACCURACY

In [71]:
#1 star review
new_review = "I took four photo prints on glossy A4 sheets and \
            two photo prints on normal A4 sheets: that means total six copies. \
            And there you go, it says the cartridge is exhausted! What a nonsense! \
            Rs. 685 for six photos, that too excluding the photo paper costs! \
            They are telling lies about the number of prints possible. Maybe \
            it may be possible to take 150 colour prints with this with a lot \
            of white space, like an MS word document with a few heading lines \
            in colours. I have HP Deskjet Ink Advantage 2135 which I bought a \
            couple of weeks ago. Neither the printer nor this cartridge is meant \
            for photo printing."
new_review = re.sub('".*?"', '', new_review)
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
lem = WordNetLemmatizer()
new_review = [lem.lemmatize(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = tfidf.transform(new_corpus).toarray()
new_y_pred = classifier_logistic.predict(new_X_test)
print(new_y_pred)

[0]


In [72]:
#4 star review
new_review = "I like these headphones. They are really sharp and very futuristic. \
            The quality is very well-made and quite sturdy. The sound quality for\
            when connected to blutooth is amazing. Better than the Bose Soundlink \
            I bought. BUT my personal downside about this headphone is there is way \
            to much muffled bass. I mean a LOT of bass. So if a lot of bass in your\
            trunk is what you want, get these. However, if you want the clarity and\
            crispness with good bass, this is not for you. Dont get me wrong, these\
            are quality made headphones that many people will be very pleased with,\
            but being a bit of an audiohile, I wanted something with more crispness. \
            I want to hear every intrument in great detail, but these headphones tend\
            to push the music to the back and the bass to the front. I used these \
            headphones with my iphone 5s and 6s and a Sony Experia Z3. It might be \
            better if you were using them plugged straight into an AMP or for Dj-ing.\
            V-moda supplies Dj accessories and a microphone. In fact, if you are a DJ,\
            I would say these are definitely for you. If you are not a DJ and want a \
            higher quality headphone, get the high-end Sennheisers. The will defintely\
            be a better choice for your lsitening, but As far as I am aware, they are \
            all wired and dont supply a blutooth headphone. They also dont fold flat \
            so the case is quite large. Get the XL cushions from V-MOda unless you have\
            small ears. The XL cushions from V moda will set you back about 20 bucks but\
            are definitely worth the money. Additionally, the guard plates on both headphones\
            are interchangeable and customisable direct from V-moda. Some available even in\
            actual gold."
new_review = re.sub('".*?"', '', new_review)
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
lem = WordNetLemmatizer()
new_review = [lem.lemmatize(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = tfidf.transform(new_corpus).toarray()
new_y_pred = classifier_logistic.predict(new_X_test)
print(new_y_pred)

[1]


In [75]:
#3 star review
new_review = "Build quality is good and there is no Flex. Screen is a TN panel so not\
            that good as an IPS panel, there is slight bluish tinge in the panel. Sound quality is\
            OK. Keyboard keys are bit compressed so not the same experience in typing as for other\
            high ranging thinkpad. There is no heating issue. Performance for normal task is fine \
            because of SSD. Just looking at it feels that this thing is going to last long"
new_review = re.sub('".*?"', '', new_review)
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
lem = WordNetLemmatizer()
new_review = [lem.lemmatize(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = tfidf.transform(new_corpus).toarray()
new_y_pred = classifier_logistic.predict(new_X_test)
print(new_y_pred)

[0]
