In [1]:
import pandas as pd
import numpy as np

In [92]:
df_train = pd.read_csv('./sentiments_train_test_reviews/reviews_training_26000.csv')
df_test = pd.read_csv('./sentiments_train_test_reviews/reviews_test_4000.csv')

In [93]:
df_train.head(5)

Unnamed: 0,review_id,review,sentiment
0,2,I thought this was a wonderful way to spend ti...,positive
1,3,Basically there's a family where a little boy ...,negative
2,7,"This show was an amazing, fresh & innovative i...",negative
3,8,Encouraged by the positive comments about this...,negative
4,10,Phil the Alien is one of those quirky films wh...,negative


In [94]:
df_test.head(5)

Unnamed: 0,review_id,review,sentiment
0,T_0,I have to confess that I am severely disappoin...,negative
1,T_9,I have never understood the appeal of this sho...,negative
2,T_12,This is supposed to be based on Wilkie Collins...,negative
3,T_13,Of all the British imperialist movies like Fou...,positive
4,T_15,I loved this film. Not being a swooning Ed Woo...,positive


In [95]:
df_train['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [96]:
d = {'positive': 1, 'negative':0}
# df_train['sentiment'] = df_train['sentiment'].map(d)

In [None]:
df_train.drop('review_id', inplace=True, axis=1)

In [97]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/anil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/anil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anil/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# remove stop words
def clean_data(text):

    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]

    lower_words = [word for word in words]

    return ' '.join(lower_words)

In [98]:
df_train['cleaned_review'] = df_train['review'].apply(clean_data)

In [99]:
df_test['cleaned_review'] = df_test['review'].apply(clean_data)

In [None]:
# create embeddings out of text
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=15000)

X_train = vectorizer.fit_transform(df_train['cleaned_review'])
vectorizer.get_feature_names_out()

y_train = df_train['sentiment']

print(X_train.shape)

In [93]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

# create the model, train it, print scores
clf = RandomForestClassifier(n_estimators=200)

clf.fit(X_train, y_train)

print("train score:", clf.score(X_train, y_train))

In [None]:
X_test = vectorizer.transform(df_test['review'])
df_test['sentiment'] = df_test['sentiment'].map(d)
y_test = df_test['sentiment']

In [None]:
X_test[0]

In [None]:
print("test score:", clf.score(X_test, y_test))

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred,  digits=5))
plot_confussion_matrix(y_test, y_pred)
plot_roc_curve(y_test, y_pred)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, auc, roc_curve
import matplotlib.pyplot as plt
import seaborn as sn

# Create the confussion matrix
def plot_confussion_matrix(y_test, y_pred):
    ''' Plot the confussion matrix for the target labels and predictions '''
    cm = confusion_matrix(y_test, y_pred)

    # Create a dataframe with the confussion matrix values
    df_cm = pd.DataFrame(cm, range(cm.shape[0]),
                  range(cm.shape[1]))
    #plt.figure(figsize = (10,7))
    # Plot the confussion matrix
    sn.set(font_scale=1.4) #for label size
    sn.heatmap(df_cm, annot=True,fmt='.0f',annot_kws={"size": 10})# font size
    plt.show()
    
# ROC Curve
# plot no skill
# Calculate the points in the ROC curve
def plot_roc_curve(y_test, y_pred):
    ''' Plot the ROC curve for the target labels and predictions'''
    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
    roc_auc= auc(fpr,tpr)

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    

In [100]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# X, y = make_classification(random_state=0)
# X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    # random_state=0)

X_train = df_train['cleaned_review']
df_train['sentiment'] = df_train['sentiment'].map(d)
y_train = df_train['sentiment']

X_test = df_test['cleaned_review']
df_test['sentiment'] = df_test['sentiment'].map(d)
y_test = df_test['sentiment']

pipe = Pipeline([('vectorizer', TfidfVectorizer(max_features=5000)), ('clf', RandomForestClassifier(n_estimators=200))])
# The pipeline can be used as any other estimator
# and avoids leaking the test set into the train set
pipe.fit(X_train, y_train)

pipe.score(X_test, y_test)

0.634

In [101]:
from joblib import dump, load
dump(pipe, 'model.joblib') 

['model.joblib']

In [108]:
import pickle
f = open('model.pkl', 'wb')
pickle.dumps(pipe, f) 

TypeError: '_io.BufferedWriter' object cannot be interpreted as an integer

In [110]:
pipe.predict(['bad'])

array([0])