In [1]:
import sklearn
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
scikit_version = sklearn.__version__

scikit_version

'1.3.0'

In [3]:
sentimental_data = pd.read_csv('sentimental_analysis_data.csv', 
                               header=None, 
                               names=['Label', 'Text'], 
                               sep='\t')

sentimental_data.sample(10)

Unnamed: 0,Label,Text
2929,1,I love Harry Potter.
4996,0,and mission impossible was pretty crappy aside...
3053,1,I LOVE BROKEBACK MOUNTAIN..
2319,1,I love Harry Potter.
2008,1,"I absolutely LOVE Harry Potter, as you can tel..."
4252,0,"by the way, the Da Vinci Code sucked, just let..."
577,1,The Da Vinci Code is awesome..
525,1,DA VINCI CODE IS AWESOME!!
2436,1,Harry Potter is AWESOME I don't care if anyone...
4718,0,Da Vinci Code sucks be...


In [4]:
sentimental_data.shape

(6918, 2)

In [5]:
X = sentimental_data['Text']

Y = sentimental_data['Label']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [7]:
x_train.shape, x_test.shape

((5534,), (1384,))

In [8]:
y_train.shape, y_test.shape

((5534,), (1384,))

In [9]:
tfidf_vect = TfidfVectorizer(max_features=15)

x_trans = tfidf_vect.fit_transform(x_train)

In [10]:
tfidf_vect

In [11]:
print(x_trans[0:3])

  (0, 1)	0.4815954400643128
  (0, 6)	0.4869721732971091
  (0, 9)	0.4869721732971091
  (0, 0)	0.4201358431053717
  (0, 12)	0.34244402508436333
  (1, 8)	0.2725279386731572
  (1, 7)	0.2797898015398152
  (1, 11)	0.5034312487408347
  (1, 5)	0.5034312487408347
  (1, 0)	0.5404196642611973
  (1, 12)	0.22024243837002258
  (2, 7)	0.45972159749471897
  (2, 1)	0.5089285405607112
  (2, 6)	0.5146104319772619
  (2, 9)	0.5146104319772619


In [12]:
x_trans.shape

(5534, 15)

In [13]:
classifier = LinearSVC(C=1.0, max_iter=1000, tol=1e-3)
linear_svc_model = classifier.fit(x_trans, y_train)

linear_svc_model



In [14]:
x_test_trans = tfidf_vect.fit_transform(x_test)

In [15]:
x_test_trans.shape

(1384, 15)

In [16]:
y_pred = linear_svc_model.predict(x_test_trans)

y_pred

array([0, 1, 1, ..., 1, 1, 1])

In [17]:
pred_results = pd.DataFrame({'y_test': y_test,
                             'y_pred': y_pred})

pred_results.sample(5)

Unnamed: 0,y_test,y_pred
2235,1,1
83,1,1
5038,0,0
6458,0,0
6656,0,0


In [18]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8865606936416185

In [19]:
text_clf_param = {}

text_clf_param['preprocessing'] = tfidf_vect
text_clf_param['model'] = linear_svc_model
text_clf_param['sklearn_version'] = scikit_version
text_clf_param['accuracy'] = accuracy

In [20]:
text_clf_param

{'preprocessing': TfidfVectorizer(max_features=15),
 'model': LinearSVC(tol=0.001),
 'sklearn_version': '1.3.0',
 'accuracy': 0.8865606936416185}

In [21]:
import joblib

In [22]:
filename = 'models/text_clf_checkpoint.joblib'

In [23]:
joblib.dump(text_clf_param, filename)

['models/text_clf_checkpoint.joblib']

In [24]:
clf_checkpoint = joblib.load(filename)

In [25]:
reloaded_vect = clf_checkpoint['preprocessing']

reloaded_vect

In [26]:
clf_model = clf_checkpoint['model']

clf_model

In [27]:
x_test_trans_new = reloaded_vect.fit_transform(x_test)

In [28]:
y_pred = clf_model.predict(x_test_trans_new)

y_pred

array([0, 1, 1, ..., 1, 1, 1])

In [29]:
accuracy_score(y_test, y_pred)

0.8865606936416185

In [30]:
clf_checkpoint['accuracy']

0.8865606936416185

In [31]:
from sklearn.pipeline import Pipeline

In [32]:
clf_pipeline = Pipeline(steps=[('tfidf_vect', tfidf_vect), ('classifier', classifier)])

pipeline_model = clf_pipeline.fit(x_train, y_train)



In [33]:
pipeline_model

In [34]:
y_pred = pipeline_model.predict(x_test)

In [35]:
accuracy = accuracy_score(y_test, y_pred)

accuracy

0.8865606936416185

In [36]:
pipe_clf_param = {}

pipe_clf_param['pipeline_clf'] = pipeline_model
pipe_clf_param['sklearn_version'] = scikit_version
pipe_clf_param['accuracy'] = accuracy

In [37]:
pipe_clf_param

{'pipeline_clf': Pipeline(steps=[('tfidf_vect', TfidfVectorizer(max_features=15)),
                 ('classifier', LinearSVC(tol=0.001))]),
 'sklearn_version': '1.3.0',
 'accuracy': 0.8865606936416185}

In [38]:
filename = 'models/pipe_clf_checkpoint.joblib'

In [39]:
joblib.dump(pipe_clf_param, filename)

['models/pipe_clf_checkpoint.joblib']

In [40]:
pipe_clf_checkpoint = joblib.load(filename)

In [41]:
reloaded_pipeline = pipe_clf_checkpoint['pipeline_clf']

reloaded_pipeline

In [42]:
y_pred = reloaded_pipeline.predict(x_test)

In [43]:
accuracy_score(y_test, y_pred)

0.8865606936416185

In [44]:
pipe_clf_checkpoint['accuracy']

0.8865606936416185