## TFIDF with Linear SVC


In [36]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from sklearn.svm import LinearSVC

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

import numpy as np
import joblib

In [3]:
# Load the tab-separated training phrases from the Kaggle input directory.
train_path = '/kaggle/input/privatetrain/train.tsv'
df = pd.read_csv(train_path, sep='\t')
# Preview the first rows to sanity-check the parsed columns.
df.head()

In [4]:
# Bar chart of how many phrases carry each sentiment label.
sentiment_counts = df.groupby("Sentiment").Sentiment.count()
sentiment_counts.plot.bar(ylim=0)

In [5]:
# Coarsen the label set in one pass: label 0 is merged into 1 and label 4
# into 3.  The two substitutions do not interact, so a single mapping is
# equivalent to applying them one after the other.
df['Sentiment'] = df['Sentiment'].replace({0: 1, 4: 3})

In [6]:
# Re-draw the class distribution to see the effect of the label merge.
merged_sentiment_counts = df.groupby("Sentiment").Sentiment.count()
merged_sentiment_counts.plot.bar(ylim=0)

In [7]:
# Strip digits and punctuation from every phrase before vectorising.
# regex=True is passed explicitly: pandas >= 2.0 defaults Series.str.replace
# to literal matching, which would turn both calls into silent no-ops.
# The patterns are raw strings so that \d, \w and \s are not parsed as
# (invalid) string escape sequences.
df['Phrase'] = df['Phrase'].str.replace(r'\d+', '', regex=True)  # remove digits
df['Phrase'] = df['Phrase'].str.replace(r'[^\w\s]', '', regex=True)  # remove punctuation

In [8]:
# Extend scikit-learn's English stop words with a few movie-domain words.
# The union is materialised as a list because scikit-learn releases that
# validate constructor parameters only accept 'english', a list or None for
# `stop_words` and reject set/frozenset values.  Sorting keeps the order
# reproducible between runs.
my_stop_words = sorted(
    ENGLISH_STOP_WORDS.union(['film', 'movie', 'cinema', 'theatre', 'hollywood'])
)

In [9]:
# Fit a unigram TF-IDF model on the phrases to inspect the vocabulary.
vectorizer = TfidfVectorizer(stop_words=my_stop_words, ngram_range=(1, 1))
vectors = vectorizer.fit_transform(df.Phrase)
# get_feature_names() was removed in scikit-learn 1.2.  Prefer its
# replacement and fall back to the old method on releases that predate it.
# .tolist() keeps `feature_names` a plain list of str, as before.
try:
    feature_names = vectorizer.get_feature_names_out().tolist()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

In [10]:
# Report the dimensions of the TF-IDF matrix produced by fit_transform.
print(vectors.shape)

In [11]:
# Copy the vocabulary into an independent list.  Reusing `feature_names`
# avoids a second call to vectorizer.get_feature_names(), which no longer
# exists in scikit-learn >= 1.2, and replaces the index-by-index fill loop.
terms = list(feature_names)

## Build model

In [14]:
# Baseline pipeline: TF-IDF features feeding a linear SVM.  The step names
# 'vect' and 'clf' are the prefixes used to address each step's parameters.
tfidf_step = ('vect', TfidfVectorizer(stop_words=my_stop_words))
svc_step = ('clf', LinearSVC(max_iter=10000))
text_clf_svc_tf = Pipeline([tfidf_step, svc_step])

In [15]:
# Hyper-parameter grid for the grid search.  Keys follow the
# '<step>__<parameter>' naming of the pipeline steps 'vect' and 'clf'.
ngram_range_options = [(1, 1), (1, 3), (1, 5)]
max_df_options = [0.8, 0.9, 1.0]
c_options = [0.5, 1.0]
class_weight_options = ['balanced', None]

param_grid = {
    'vect__ngram_range': ngram_range_options,
    'vect__max_df': max_df_options,
    'clf__C': c_options,
    'clf__class_weight': class_weight_options,
}

In [16]:
# Features are the phrase text and targets the sentiment classes.  Hold out
# 20% of the rows as a test set, with a fixed seed so the split is repeatable.
X, y = df['Phrase'], df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# 3-fold cross-validated search over every combination in param_grid,
# fitted on the training split only.
grid = GridSearchCV(estimator=text_clf_svc_tf, param_grid=param_grid, cv=3)
grid.fit(X_train, y_train)

In [18]:
# Summarise the search: the best configuration first, then the mean and
# standard deviation of the cross-validated score for every candidate.
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))

search_results = grid.cv_results_
means, stds, params = (
    search_results[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for avg_score, score_std, candidate in zip(means, stds, params):
    print("%f (%f) with: %r" % (avg_score, score_std, candidate))

In [19]:
# Rebuild the pipeline with the best configuration reported by the search:
# Best: 0.726419 using {'clf__C': 0.5, 'clf__class_weight': None, 'vect__max_df': 0.8, 'vect__ngram_range': (1, 3)}
best_vectorizer = TfidfVectorizer(
    stop_words=my_stop_words,
    max_df=0.8,
    ngram_range=(1, 3),
)
best_classifier = LinearSVC(max_iter=10000, C=0.5, class_weight=None)
text_clf_svc_tf = Pipeline([
    ('vect', best_vectorizer),
    ('clf', best_classifier),
])

In [24]:
# Compute learning-curve data for the tuned pipeline with 3-fold
# cross-validation on the training split (plotted further down).
learning_curve_output = learning_curve(text_clf_svc_tf, X_train, y_train, cv=3)
train_sizes, train_scores, validation_scores = learning_curve_output

In [25]:
# Display the raw learning_curve outputs: the training sizes and the
# train/validation score arrays.
train_sizes, train_scores, validation_scores 

In [26]:
# Collapse the score arrays along axis 1 so that each training-set size is
# described by one mean and one standard deviation.
# NOTE(review): axis 1 is presumably the cross-validation fold axis of the
# learning_curve output - confirm against the scikit-learn documentation.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = validation_scores.mean(axis=1), validation_scores.std(axis=1)

In [29]:
# Plot mean training and validation accuracy against training-set size,
# with a shaded band of one standard deviation around each curve.
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training Accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', marker='+', markersize=5, linestyle='--', label='Validation Accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
plt.title('Learning Curve')
plt.xlabel('Training Data Size')
plt.ylabel('Model accuracy')
plt.grid()
plt.legend(loc='lower right')
# Save before show(): once show() returns, the current figure may already
# have been closed, in which case savefig() would write an empty image.
plt.savefig('lrning.png')
plt.show()

# Train the final model: fit the tuned pipeline on the training split.
text_clf_svc_tf.fit(X_train, y_train)

In [30]:
# Fit the tuned pipeline on the training split.
# NOTE(review): the end of the previous cell already performs this identical
# fit, so one of the two calls looks redundant.
text_clf_svc_tf.fit(X_train, y_train)

In [31]:
# Predict sentiment classes for the held-out test phrases.
predictions_svc_tf = text_clf_svc_tf.predict(X_test)

In [32]:
# Confusion matrix of true test labels against the predicted labels.
confusion = metrics.confusion_matrix(y_test, predictions_svc_tf)
print(confusion)

In [33]:
# Per-class precision, recall, F1 and support on the test split.
report = metrics.classification_report(y_test, predictions_svc_tf)
print(report)

In [34]:
# Overall accuracy on the held-out test split.
test_accuracy = metrics.accuracy_score(y_test, predictions_svc_tf)
print(test_accuracy)
# Results recorded from an earlier run: accuracy - .744
#
#              precision    recall  f1-score   support
#
#           1       0.75      0.64      0.69      6943
#           2       0.73      0.82      0.77     15639
#           3       0.78      0.69      0.74      8630

In [35]:
# 3-fold cross-validated score of the tuned pipeline on the training split.
cv_scores = cross_val_score(estimator=text_clf_svc_tf, X=X_train, y=y_train, cv=3)

cv_summary = "%0.2f accuracy with a standard deviation of %0.2f" % (cv_scores.mean(), cv_scores.std())
print(cv_summary)

# Output recorded from an earlier run:
# 0.73 accuracy with a standard deviation of 0.00

## Save and retrieve model

In [38]:
# Persist the fitted pipeline (vectoriser and classifier together) to disk.
joblib.dump(text_clf_svc_tf, 'SVM_tf_model.pkl')

In [39]:
# Open the saved model file in binary read mode; the handle is passed to
# joblib.load in the next cell.
model = open('SVM_tf_model.pkl','rb')

In [40]:
# Load the pipeline back from the open file handle, then always close the
# handle so the file descriptor is not leaked for the rest of the session.
try:
    clf = joblib.load(model)
finally:
    model.close()

In [41]:
# Hand-written review snippets used to try out the reloaded model.
data = [
    "plenty of funny quotes but ultimately fell flat",
    "why did Spielberg make this crap?",
    "restores your faith in the world",
]

In [42]:
import re

# Clean the sample strings the same way as the training phrases: drop
# digits, then drop every character that is neither a word character nor
# whitespace.  Python's str.replace matches literal text only, so the regex
# patterns have to go through re.sub to have any effect.
new_data = [
    re.sub(r'[^\w\s]', '', re.sub(r'\d+', '', text))
    for text in data
]

print(new_data)

In [43]:
# Predict on the cleaned strings rather than the raw ones, so that inference
# sees the same preprocessing as the phrases the model was trained on.
my_prediction = clf.predict(new_data)

In [46]:
# Human-readable names for the three sentiment classes.
out = {
    1: 'Negative',
    2: 'Neutral',
    3: 'Positive',
}

In [48]:
# Print the sentiment name of each prediction, in input order.
for predicted_class in my_prediction:
    print(out[predicted_class])
# Output recorded from an earlier run:
#   Negative
#   Negative
#   Neutral   <- questionable for "restores your faith in the world"