In [78]:
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [79]:
# https://www.kaggle.com/ashishpatel26/sentimental-analysis-nlp
#Movie review --> 1 - positive, 0 - Negative
sentimental_data = pd.read_csv("datasets/sentimental_analysis_train_data.csv", header=None,
                               names=['Label','Text'],sep='\t')
sentimental_data.head()

Unnamed: 0,Label,Text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [80]:
sentimental_data.shape

(6918, 2)

In [81]:
x = sentimental_data['Text']

y = sentimental_data['Label']

In [82]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [83]:
x_train.shape, x_test.shape

((5534,), (1384,))

In [84]:
# TfidfVectorizer. Tfidf - Term Frequence and inverse document frequence score
# TfidfVectorizer - Captures significant word in particular document as well as in whole corpus
tfidf_vect = TfidfVectorizer(max_features=15) # Limiting size of feature vector to represent each document to 15

x_trans = tfidf_vect.fit_transform(x_train)

In [85]:
print(x_trans[0:3]) # sparse metrics for 1st 3 document(review)

  (0, 10)	0.7071067811865476
  (0, 2)	0.7071067811865476
  (1, 10)	0.7071067811865476
  (1, 2)	0.7071067811865476
  (2, 12)	0.442051110368997
  (2, 7)	0.5563366139393721
  (2, 10)	0.4975341133088318
  (2, 2)	0.4975341133088318


In [86]:
# We have selected only 15 most common words in document
x_trans.shape

(5534, 15)

In [87]:
classifier = LinearSVC(C=1.0,max_iter=1000, tol=1e-3)
linear_svc_model = classifier.fit(x_trans, y_train)
linear_svc_model

LinearSVC(tol=0.001)

In [88]:
x_test_trans = tfidf_vect.fit_transform(x_test)
x_test_trans.shape

(1384, 15)

In [89]:
y_pred = classifier.predict(x_test_trans)
y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [90]:
pred_results = pd.DataFrame({'y_test':y_test, 'y_pred':y_pred})
pred_results

Unnamed: 0,y_test,y_pred
1296,1,1
5956,0,0
85,1,1
5562,0,0
1375,1,1
...,...,...
3737,1,1
4189,0,0
2880,1,1
3425,1,1


In [91]:
pred_results.loc[pred_results['y_pred'] == 1]

Unnamed: 0,y_test,y_pred
1296,1,1
85,1,1
1375,1,1
1230,1,1
535,1,1
...,...,...
1900,1,1
3737,1,1
2880,1,1
3425,1,1


In [92]:
pred_results.loc[pred_results['y_pred'] == 0]

Unnamed: 0,y_test,y_pred
5956,0,0
5562,0,0
5252,0,0
5462,0,0
5547,0,0
...,...,...
5596,0,0
2661,1,0
5228,0,0
5665,0,0


In [93]:
print(pred_results[pred_results['y_test'] == pred_results['y_pred']])

      y_test  y_pred
1296       1       1
5956       0       0
85         1       1
5562       0       0
1375       1       1
...      ...     ...
3737       1       1
4189       0       0
2880       1       1
3425       1       1
161        1       1

[1237 rows x 2 columns]


In [94]:
from sklearn.metrics import accuracy_score
print(type(y_test),type(y_pred))
accuracy_score = accuracy_score(y_test,y_pred)
print(accuracy_score)

<class 'pandas.core.series.Series'> <class 'numpy.ndarray'>
0.8937861271676301


In [95]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
tn, fp, fn, tp = cm.reshape(-1)
print(cm)
print("tn = {}, fp = {}, fn = {}, tp = {}".format(tn, fp, fn, tp))
total = tn+fp+fn+tp
print(total)
accuracy = (tp+tn)/total
print('accuracy : ', accuracy)
precision_0 = tn/(tn+fn) #negative
print('precision_0 : ',precision_0)
precision_1 = tp/(tp+fp) #positive
print('precision_1 : ',precision_1)
recall_0 = tn/(tn+fp) #negative
print('recall_0 : ',recall_0)
recall_1 = tp/(tp+fn) #positive
print('recall_1 : ',recall_1)

f1_score_0 = 2*precision_0*precision_0/(precision_0+precision_0)
f1_score_1 = 2*precision_1*precision_1/(precision_1+precision_1)
print('f1_score_0 : ', f1_score_0)
print('f1_score_1 : ', f1_score_1)

[[569  20]
 [127 668]]
tn = 569, fp = 20, fn = 127, tp = 668
1384
accuracy :  0.8937861271676301
precision_0 :  0.8175287356321839
precision_1 :  0.9709302325581395
recall_0 :  0.966044142614601
recall_1 :  0.8402515723270441
f1_score_0 :  0.8175287356321839
f1_score_1 :  0.9709302325581395


In [96]:
from sklearn.metrics import classification_report
cp = classification_report(y_test,y_pred)
print(cp)

              precision    recall  f1-score   support

           0       0.82      0.97      0.89       589
           1       0.97      0.84      0.90       795

    accuracy                           0.89      1384
   macro avg       0.89      0.90      0.89      1384
weighted avg       0.91      0.89      0.89      1384



### We need to save preprocessing details of the model since we need to use same object for performing transformation in x_test before predicting. In this case, we are saving tfidf_vect

In [97]:
text_clf_params = {}

text_clf_params['preprocessing'] = tfidf_vect
text_clf_params['model'] = linear_svc_model
text_clf_params['sklearn_version'] = sklearn.__version__ 
text_clf_params['accuracy'] = accuracy_score 

In [98]:
text_clf_params

{'preprocessing': TfidfVectorizer(max_features=15),
 'model': LinearSVC(tol=0.001),
 'sklearn_version': '0.23.2',
 'accuracy': 0.8937861271676301}

### Serializing to disk using JobLib

In [99]:
import joblib

In [100]:
filename = "models/txt_clf_checkpoint.joblib"
joblib.dump(text_clf_params,filename)

['models/txt_clf_checkpoint.joblib']

In [101]:
clf_checkpoint = joblib.load(filename)

In [102]:
clf_checkpoint

{'preprocessing': TfidfVectorizer(max_features=15),
 'model': LinearSVC(tol=0.001),
 'sklearn_version': '0.23.2',
 'accuracy': 0.8937861271676301}

In [113]:
reloaded_vect = clf_checkpoint['preprocessing']
reloaded_vect

TfidfVectorizer(max_features=15)

In [114]:
clf_model = clf_checkpoint['model']
clf_model

LinearSVC(tol=0.001)

In [123]:
x_test_trans_new = reloaded_vect.fit_transform(x_test)

In [124]:
y_pred = clf_model.predict(x_test_trans_new)
y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [141]:
acc = accuracy_score(y_test,y_pred)
print(acc)

0.8930635838150289


In [133]:
text_clf_params['accuracy']

0.8937861271676301

# sklearn Pipeline

Sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods. The final estimator only needs to implement fit. The transformers in the pipeline can be cached using memory argument.

In [134]:
from sklearn.pipeline import Pipeline

In [135]:
# steps(list) - List of (name, transform) tuples (implementing fit/transform) that are chained, 
# in the order in which they are chained, with the last object an estimator.
clf_pipeline = Pipeline(steps=[('tfidf_vect',tfidf_vect), ('classifier',classifier)])
pipeline_model = clf_pipeline.fit(x_train,y_train)

In [136]:
y_pred = pipeline_model.predict(x_test)
y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [153]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8930635838150289

In [154]:
print(acc)

0.8930635838150289


In [155]:
pipe_clf_params = {}

pipe_clf_params['pipeline_clf'] = pipeline_model
pipe_clf_params['sklearn_version'] = sklearn.__version__ 
pipe_clf_params['accuracy'] = accuracy 

In [156]:
filename = "models/pipe_clf_checkpoint.joblib"
joblib.dump(text_clf_params,filename)

clf_checkpoint = joblib.load(filename)

clf_checkpoint

{'pipeline_clf': Pipeline(steps=[('tfidf_vect', TfidfVectorizer(max_features=15)),
                 ('classifier', LinearSVC(tol=0.001))]),
 'sklearn_version': '0.23.2',
 'accuracy': <function sklearn.metrics._classification.accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None)>}

In [157]:
reloaded_pipeline = pipe_clf_params['pipeline_clf']
reloaded_pipeline

Pipeline(steps=[('tfidf_vect', TfidfVectorizer(max_features=15)),
                ('classifier', LinearSVC(tol=0.001))])

In [159]:
y_pred = reloaded_pipeline.predict(x_test)
y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [160]:
accuracy_score(y_test,y_pred)

0.8930635838150289

In [161]:
pipe_clf_params['accuracy']

0.8930635838150289