In [1]:
import pandas as pd
# train Data
Data = pd.read_csv("https://imdb-reviews-data.s3.us-east-2.amazonaws.com/IMDB%2BDataset+(1).csv")
Data.head()

Unnamed: 0,sentiment,review
0,positive,One of the other reviewers has mentioned that ...
1,positive,A wonderful little production. <br /><br />The...
2,positive,I thought this was a wonderful way to spend ti...
3,negative,Basically there's a family where a little boy ...
4,positive,"Petter Mattei's ""Love in the Time of Money"" is..."


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             ngram_range = (2,2), 
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(Data['review'])

In [3]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, Data['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(train_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(Data['sentiment'], prediction_linear, output_dict=True)
print('positive: ', report['positive'])
print('negative: ', report['negative'])

Training time: 6426.417742s; Prediction time: 1788.998567s
positive:  {'precision': 0.9898105969791416, 'recall': 0.9908003679852806, 'f1-score': 0.9903052351730066, 'support': 25001}
negative:  {'precision': 0.9907911595131326, 'recall': 0.9898004079836806, 'f1-score': 0.9902955359465354, 'support': 25001}


In [5]:
review = """good"""
review_vector = vectorizer.transform([review]) # vectorizing
print(classifier_linear.predict(review_vector))

['negative']


In [6]:
review = """not good"""
review_vector = vectorizer.transform([review]) # vectorizing
print(classifier_linear.predict(review_vector))

['negative']


In [7]:
review = """not bad"""
review_vector = vectorizer.transform([review]) # vectorizing
print(classifier_linear.predict(review_vector))

['positive']


In [8]:
review = """bad"""
review_vector = vectorizer.transform([review]) # vectorizing
print(classifier_linear.predict(review_vector))

['negative']


In [9]:
from joblib import dump, load
dump(classifier_linear, 'svm_model_4.joblib')

['svm_model_4.joblib']