In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
def convert_sentiment (score):
    score = int(score)
    if score > 2:
        label = 1
    elif score == 2:
        label = 0
    else:
        label = -1

    return label

In [9]:
dataset = pd.read_csv('./data/train.tsv', sep='\t', header=0)

dataset = dataset.sample(frac =.50)
# Convert text to lowercase
dataset['Phrase'] = dataset['Phrase'].str.strip().str.lower()
dataset['Sentiment'] = dataset['Sentiment'].map(lambda a: convert_sentiment(a))
dataset.info()

dataset.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78030 entries, 151417 to 36575
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PhraseId    78030 non-null  int64 
 1   SentenceId  78030 non-null  int64 
 2   Phrase      78030 non-null  object
 3   Sentiment   78030 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 3.0+ MB


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
151417,151418,8256,big time .,0
71131,71132,3630,those on both sides of the issues,0
98922,98923,5188,the heavy doses of weird performances and dire...,0
59134,59135,2983,without clobbering the audience over the head,1
26002,26003,1190,"are so unmemorable , despite several attempts ...",-1


In [73]:
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
x = dataset['Phrase']
y = dataset['Sentiment']


x, x_test, y, y_test = train_test_split(x,y,test_size=0.25, random_state=101)
print(x_test)

61745                                      seems obligatory
5164                                 more sophisticated and
2352                                    a corny examination
6021                 was essentially , by campaign 's end ,
45636                      squabbling working-class spouses
                                ...                        
115575                                        among sequels
36939                           is truly gorgeous to behold
88248                                                  pics
123369    half of dragonfly is worse : the part where no...
68758                                       olympia , wash.
Name: Phrase, Length: 19508, dtype: object


In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Vectorizing the text data
X_train_vec = vectorizer.fit_transform(x)
X_test_vec = vectorizer.transform(x_test)

In [75]:
from sklearn.linear_model import LogisticRegression
#Training the model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_vec, y)

LogisticRegression(max_iter=1000)

In [76]:
lr_score = lr.score(X_test_vec, y_test)
print("Results for Logistic Regression with TfidfVectorizer")
print(lr_score)


Results for Logistic Regression with CountVectorizer
0.7006868976830019


In [87]:
import joblib
import pickle

# Save model
joblib.dump(lr, './model/linear_regression.pkl')


['./model/linear_regression.pkl']

In [79]:
review = """Do not purchase this product. My cell phone blast when I switched the charger"""
review_vector = vectorizer.transform([review])
print(vectorizer.transform([review]))
print(lr.predict(review_vector))


  (0, 8980)	0.2763957880161891
  (0, 8217)	0.20065196500451543
  (0, 8177)	0.10951245356872427
  (0, 6243)	0.37351600915643696
  (0, 5942)	0.42134554891777465
  (0, 5515)	0.2192565980553573
  (0, 5357)	0.3031549853914983
  (0, 2363)	0.26907652402159377
  (0, 1288)	0.4054556089145027
  (0, 887)	0.42134554891777465
[0]


In [81]:
from sklearn import svm
from sklearn.metrics import classification_report

svm_model = svm.SVC(kernel='linear')
svm_model.fit(X_train_vec, y)

prediction_linear = svm_model.predict(X_test_vec)
report = classification_report(y_test, prediction_linear, output_dict=True)

In [92]:
import joblib
import pickle

# Save model
joblib.dump(svm_model, './model/svm_linear_model.pkl')

file = open('./temp/vec_svm', 'wb')
pickle.dump(vectorizer, file)
file.close()

In [82]:
print(report)

{'-1': {'precision': 0.6757078986587183, 'recall': 0.5251331943479268, 'f1-score': 0.5909801876955162, 'support': 4317}, '0': {'precision': 0.6919732441471572, 'recall': 0.8294247344157146, 'f1-score': 0.7544899261555292, 'support': 9978}, '1': {'precision': 0.7495826377295493, 'recall': 0.6029157874544409, 'f1-score': 0.6682968318094834, 'support': 5213}, 'accuracy': 0.701558335042034, 'macro avg': {'precision': 0.7057545935118082, 'recall': 0.6524912387393608, 'f1-score': 0.6712556485535096, 'support': 19508}, 'weighted avg': {'precision': 0.7037684190636745, 'recall': 0.701558335042034, 'f1-score': 0.69527339233567, 'support': 19508}}


In [93]:
loaded_model = joblib.load('./model/svm_linear_model.pkl')
review = """I love this movie"""
review_vector = vectorizer.transform([review]) # vectorizing
print(loaded_model.predict(review_vector))

[1]
