In [19]:
import pandas as pd
# Training data: the IMDB reviews CSV, fetched over HTTPS from S3.
IMDB_REVIEWS_URL = "https://imdb-reviews-data.s3.us-east-2.amazonaws.com/IMDB%2BDataset+(1).csv"
Data = pd.read_csv(IMDB_REVIEWS_URL)
Data.head()  # preview the first five rows

Unnamed: 0,sentiment,review
0,positive,One of the other reviewers has mentioned that ...
1,positive,A wonderful little production. <br /><br />The...
2,positive,I thought this was a wonderful way to spend ti...
3,negative,Basically there's a family where a little boy ...
4,positive,"Petter Mattei's ""Love in the Time of Money"" is..."


In [20]:
# Peek at 5 randomly chosen rows. Sampling n=5 directly avoids shuffling the
# whole frame just to keep its first five rows, and the fixed seed keeps the
# preview identical across Restart & Run All.
Data.sample(n=5, random_state=42)

Unnamed: 0,sentiment,review
27245,positive,A fun romp...a lot of good twists and turns! (...
34095,positive,"The romance of the movie, which is also its ma..."
37508,negative,"Maybe it's just that it was made in 1997, or m..."
24991,negative,"i think this one sucked on ice, because it lef..."
18476,negative,The scariest thing about freshman director Car...


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build TF-IDF features from the raw review text.
tfidf_options = {
    'min_df': 5,           # ignore terms that appear in fewer than 5 reviews
    'max_df': 0.8,         # ignore terms that appear in more than 80% of reviews
    'sublinear_tf': True,  # scale term frequency as 1 + log(tf)
    'use_idf': True,       # weight terms by inverse document frequency
}
vectorizer = TfidfVectorizer(**tfidf_options)
# Learn the vocabulary and IDF weights, and vectorize the reviews, in one call.
train_vectors = vectorizer.fit_transform(Data['review'])

In [22]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
# Train a linear-kernel SVM on the TF-IDF features and time both phases.
# Elapsed time is taken from time.perf_counter(), the standard-library clock
# intended for measuring intervals; time.time() is wall-clock time and can
# jump if the system clock is adjusted while the (long) fit is running.
classifier_linear = svm.SVC(kernel='linear')
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, Data['sentiment'])
t1 = time.perf_counter()
# NOTE(review): predictions are made on the same rows the model was fitted
# on, so the report below measures fit to the training set, not performance
# on unseen reviews. TODO: evaluate on a held-out split.
prediction_linear = classifier_linear.predict(train_vectors)
t2 = time.perf_counter()
time_linear_train = t1 - t0
time_linear_predict = t2 - t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
# output_dict=True returns the per-class metrics as a nested dict keyed by label.
report = classification_report(Data['sentiment'], prediction_linear, output_dict=True)

Training time: 2623.096130s; Prediction time: 986.158912s


In [23]:
# Show the per-class metrics (precision / recall / f1-score / support).
for label in ('positive', 'negative'):
    print(label + ': ', report[label])

positive:  {'precision': 0.954001034578807, 'recall': 0.9589616415343386, 'f1-score': 0.9564749062475065, 'support': 25001}
negative:  {'precision': 0.9587471352177235, 'recall': 0.953761849526019, 'f1-score': 0.9562479948668592, 'support': 25001}


In [24]:
# Sanity-check the freshly trained classifier on one hand-written review.
review = "I did not like it that much"
# transform() takes a collection of documents, hence the one-element list.
review_vector = vectorizer.transform([review])
predicted_label = classifier_linear.predict(review_vector)
print(predicted_label)

['negative']


In [25]:
import pickle
# Serialize the fitted classifier to an in-memory bytes object.
# Only the classifier is captured here; `vectorizer` is not part of these bytes.
reviews_svm = pickle.dumps(classifier_linear)

In [26]:
# Rebuild a classifier object from the in-memory pickle bytes.
# Unpickling can execute arbitrary code; only load bytes from a trusted source
# (here they were produced in this same notebook session).
model = pickle.loads(reviews_svm)

In [27]:
# Confirm the unpickled model classifies the sample review the same way.
review = 'I did not like it that much'
review_vector = vectorizer.transform([review])  # one-document batch
restored_prediction = model.predict(review_vector)
print(restored_prediction)

['negative']


In [28]:
from joblib import dump, load
# Persist BOTH fitted artifacts. The classifier only accepts vectors produced
# by this exact fitted TF-IDF vectorizer (same vocabulary and IDF weights), so
# saving the model alone would leave it unusable in a fresh process.
dump(vectorizer, 'tfidf_vectorizer.joblib')
# Kept as the last expression so the cell still displays ['svm_model.joblib'].
dump(classifier_linear, 'svm_model.joblib')

['svm_model.joblib']