In [55]:
import pickle

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [17]:
def _parse_embedding(cell):
    """Turn the text of one 'embedding' CSV cell into a 1-D numpy array.

    The first and last characters are dropped before parsing (presumably the
    '[' and ']' of a printed array -- confirm against the CSV), and the rest
    is parsed as whitespace-separated numbers.
    """
    inner = cell[1:-1]
    return np.fromstring(inner, sep="  ")


# Parse the 'embedding' column while reading, so it holds arrays, not strings.
converters = {"embedding": _parse_embedding}
df = pd.read_csv('science_preprocessed_embedding.csv', converters=converters)

In [18]:
# Sanity check after loading: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8396 entries, 0 to 8395
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         8396 non-null   object
 1   Comment    8396 non-null   object
 2   Topic      8396 non-null   object
 3   embedding  8396 non-null   object
dtypes: object(4)
memory usage: 262.5+ KB


In [38]:
# Features are the parsed embedding arrays; the labels are the 'Topic' values.
# Both are materialised as plain Python lists.
X = df['embedding'].tolist()
Y = df['Topic'].tolist()

In [39]:
# Hold out 15% of the samples with a fixed seed so the split is reproducible.
# The *_test part is reserved for the final validation only -- all model
# selection below must use the *_train part.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=7,
)

In [32]:
# Baseline random forest: 100 trees, seeded for reproducibility.
rfc = RandomForestClassifier(
    n_estimators=100,
    random_state=7,
)

In [37]:
# Cross-validated accuracy of the baseline random forest on the training split
# (cross_validate's default splitting is used, as no `cv` is passed).
# The result is bound to a descriptive name rather than `_`: in IPython `_` is
# the last-output cache, and rebinding it by hand stops IPython from updating it.
rfc_cv_results = cross_validate(rfc, x_train, y_train, scoring=['accuracy'])
rfc_cv_results

{'fit_time': array([6.43953133, 6.32897615, 6.40564036, 6.51249909, 7.78329659]),
 'score_time': array([0.09704113, 0.04700089, 0.05152059, 0.05600262, 0.06805444]),
 'test_accuracy': array([0.69677871, 0.70567624, 0.69586545, 0.67904695, 0.6923616 ])}

In [42]:
# Baseline support-vector classifier with a linear kernel, seeded like the forest.
sv = SVC(
    kernel='linear',
    random_state=7,
)

In [44]:
# Cross-validated accuracy of the baseline linear SVM on the training split
# (cross_validate's default splitting is used, as no `cv` is passed).
# The result is bound to a descriptive name rather than `_`: in IPython `_` is
# the last-output cache, and rebinding it by hand stops IPython from updating it.
sv_cv_results = cross_validate(sv, x_train, y_train, scoring=['accuracy'])
sv_cv_results

{'fit_time': array([1.99537277, 1.60805035, 1.60653186, 1.55415797, 1.58533883]),
 'score_time': array([0.20796824, 0.21499157, 0.22251701, 0.21225548, 0.20900178]),
 'test_accuracy': array([0.67086835, 0.70567624, 0.6706377 , 0.66433076, 0.67133847])}

In [57]:
# Hyper-parameter search for the SVM: every combination of C and kernel below
# is scored by cross-validated accuracy on the training split.
svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
}

# The estimator's own kernel='linear' is only a starting value; the grid's
# 'kernel' entry sets it for each candidate.
sv_ = SVC(kernel='linear', random_state=7)
search = GridSearchCV(
    estimator=sv_,
    param_grid=svm_param_grid,
    scoring='accuracy',
    n_jobs=-1,   # use all available cores
    verbose=3,
)
search.fit(x_train, y_train);

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [58]:
# Raw per-candidate results of the SVM grid search (timings, parameters, scores).
search.cv_results_

{'mean_fit_time': array([ 6.46514149, 10.13927932,  7.81648836,  7.82645345, 10.49346156,
         7.76989942, 37.88509836, 13.39023499]),
 'std_fit_time': array([1.05580651, 1.07526714, 1.32887965, 0.53453762, 0.3503068 ,
        0.65662611, 1.32560183, 1.11862119]),
 'mean_score_time': array([0.80857987, 3.58352532, 0.79644217, 2.62824035, 0.70867729,
        2.37821674, 0.43724585, 2.56178641]),
 'std_score_time': array([0.03852799, 0.66234223, 0.15778997, 0.08374678, 0.07610315,
        0.18194912, 0.16786083, 0.43851958]),
 'param_C': masked_array(data=[0.1, 0.1, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf',
                    'linear', 'rbf'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C':

In [59]:
# Best mean cross-validated accuracy found by the SVM grid search.
search.best_score_

0.6984307247776476

In [60]:
# Parameter combination that achieved the best SVM cross-validation score.
search.best_params_

{'C': 10, 'kernel': 'rbf'}

In [52]:
# Hyper-parameter search for the random forest: only the number of trees is
# varied, scored by cross-validated accuracy on the training split.
rf_param_grid = {'n_estimators': [1, 5, 10, 50, 100, 300]}

rf_ = RandomForestClassifier(random_state=7)
searchf = GridSearchCV(
    estimator=rf_,
    param_grid=rf_param_grid,
    scoring='accuracy',
    n_jobs=-1,   # use all available cores
    verbose=3,
)
searchf.fit(x_train, y_train);

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [54]:
# Best mean cross-validated accuracy found by the random-forest grid search.
searchf.best_score_

0.6992716497951669

In [None]:
#

In [61]:
# Final check on the held-out split: predict with the fitted SVM grid search
# and compare the predictions against the true topics.
test_predictions = search.predict(x_test)
accuracy_score(y_test, test_predictions)

0.7285714285714285

In [62]:
# Persist the fitted SVM grid-search object to disk.
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically; the original passed a bare open(...) that was never closed.
# `pickle` is imported in the notebook's import cell at the top.
# NOTE: only load this file from a trusted location -- unpickling executes
# arbitrary code.
with open('science_svm.pkl', 'wb') as model_file:
    pickle.dump(search, model_file)