In [22]:
import os
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report

from run_multilabel_classifier import _load_comments, run

In [23]:
train_comments_path = os.path.join('../', 'data/train.csv')
test_comments_path = os.path.join('../', 'data/test_final.csv')

## Dummy classifier

In [24]:
param_grid = [{
        'estimator__bag_of_words__stop_words': ['english'],
        'estimator__bag_of_words__ngram_range': [(1, 2)],
        'estimator__bag_of_words__max_features': [500],
        'estimator__dim_reduct__n_components': [300],
        'estimator__normalizer__norm': ['l2'],
        'estimator__classifier__strategy': ['uniform']
}]

clf = DummyClassifier()

#### Train phase

In [25]:
trained_clf = run(param_grid, clf, comments_file=train_comments_path)

0 15294
1 1595
2 8449
3 478
4 7877
5 1405
0 5849
[15294  1595  8449   478  7877  1405]
0 9445
1 4254
2 8449
3 5371
4 7877
5 4444
143346
39840
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s finished
  str(classes[c]))
  str(classes[c]))


{'mean_fit_time': array([0.07897167]),
 'mean_score_time': array([0.00678205]),
 'mean_test_score': array([0.41428571]),
 'mean_train_score': array([0.41428571]),
 'param_estimator__bag_of_words__max_features': masked_array(data=[500],
             mask=[False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__ngram_range': masked_array(data=[(1, 2)],
             mask=[False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__stop_words': masked_array(data=['english'],
             mask=[False],
       fill_value='?',
            dtype=object),
 'param_estimator__classifier__strategy': masked_array(data=['uniform'],
             mask=[False],
       fill_value='?',
            dtype=object),
 'param_estimator__dim_reduct__n_components': masked_array(data=[300],
             mask=[False],
       fill_value='?',
            dtype=object),
 'param_estimator__normalizer__norm': masked_array(data=['l2'],
             mask=[

  'precision', 'predicted', average, warn_for)


#### Saving the model

In [26]:
with open('./saved_models/dummy_trained_mlutils.pkl', 'wb') as saved_model:
    pickle.dump(trained_clf, file=saved_model)

#### Loading the model and testing it

In [27]:
with open('./saved_models/dummy_trained_mlutils.pkl', 'rb') as saved_model:
    loaded_clf = pickle.load(saved_model)
    print(loaded_clf)
    X_test, y_test = _load_comments(test_comments_path)
    y_test_predict = loaded_clf.predict(X_test)

    print(classification_report(y_test, y_test_predict))

OneVsRestClassifier(estimator=Pipeline(memory=None,
     steps=[('bag_of_words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=500, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=T... norm='l2')), ('classifier', DummyClassifier(constant=None, random_state=None, strategy='uniform'))]),
          n_jobs=1)
0 6090
1 367
2 3691
3 211
4 3427
5 712
0 2416
[6090  367 3691  211 3427  712]
0 3674
1 2049
2 1275
3 2205
4 3427
5 1704
57735
14334
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       100
          1       0.00      0.00      0.00         4
          2       0.00      0.00      0.00        59
          3       0.00      0.00      0.00         1
          4       0.00      0.00      0.00        49
          5       0.00      0.00      0.00         9

avg / total 

  'precision', 'predicted', average, warn_for)


## Logistic regression

In [28]:
param_grid = [{
        'estimator__bag_of_words__stop_words': ['english'],
        'estimator__bag_of_words__ngram_range': [(1, 2)],
        'estimator__bag_of_words__max_features': [500],
        'estimator__dim_reduct__n_components': [300],
        'estimator__normalizer__norm': ['l2'],
        'estimator__classifier__C': [10., 100.]
}]


clf = LogisticRegression()

#### Train phase

In [29]:
trained_clf = run(param_grid, clf, comments_file=train_comments_path)

0 15294
1 1595
2 8449
3 478
4 7877
5 1405
0 5849
[15294  1595  8449   478  7877  1405]
0 9445
1 4254
2 8449
3 5371
4 7877
5 4444
143346
39840
Fitting 5 folds for each of 2 candidates, totalling 10 fits


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


{'mean_fit_time': array([0.10530896, 0.10531878]),
 'mean_score_time': array([0.00778255, 0.00897546]),
 'mean_test_score': array([0.42857143, 0.4       ]),
 'mean_train_score': array([0.92142857, 1.        ]),
 'param_estimator__bag_of_words__max_features': masked_array(data=[500, 500],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__ngram_range': masked_array(data=[(1, 2), (1, 2)],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__stop_words': masked_array(data=['english', 'english'],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__classifier__C': masked_array(data=[10.0, 100.0],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__dim_reduct__n_components': masked_array(data=[300, 300],
             mask=[False, False],
       fill_value='?',
   

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.3s finished
  str(classes[c]))
  'precision', 'predicted', average, warn_for)


#### Saving the model

In [30]:
with open('./saved_models/logreg_trained_mlutils.pkl', 'wb') as saved_model:
    pickle.dump(trained_clf, file=saved_model)

#### Loading the model and testing it

In [31]:
with open('./saved_models/logreg_trained_mlutils.pkl', 'rb') as saved_model:
    loaded_clf = pickle.load(saved_model)
    print(loaded_clf)
    X_test, y_test = _load_comments(test_comments_path)
    y_test_predict = loaded_clf.predict(X_test)

    print(classification_report(y_test, y_test_predict))

OneVsRestClassifier(estimator=Pipeline(memory=None,
     steps=[('bag_of_words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=500, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=T...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          n_jobs=1)
0 6090
1 367
2 3691
3 211
4 3427
5 712
0 2416
[6090  367 3691  211 3427  712]
0 3674
1 2049
2 1275
3 2205
4 3427
5 1704
57735
14334
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       100
          1       0.00      0.00      0.00         6
          2       0.72      0.60      0.65        60
          3       0.00      0.00      0.00         7
          4       0.55      0.44      0.49        48
          5       0.00      0.00      0.00         9

avg / total 

  'precision', 'predicted', average, warn_for)


## SVM

In [32]:
param_grid = {
        'estimator__bag_of_words__stop_words': ['english'],
        'estimator__bag_of_words__ngram_range': [(1, 2)],
        'estimator__bag_of_words__max_features': [500],
        'estimator__dim_reduct__n_components': [300],
        'estimator__normalizer__norm': ['l2'],
        'estimator__classifier__C': [100., 0.1, 0.0001]
}

clf = SVC()

#### Train pahse

In [33]:
trained_clf = run(param_grid, clf, comments_file=train_comments_path)

0 15294
1 1595
2 8449
3 478
4 7877
5 1405
0 5849
[15294  1595  8449   478  7877  1405]
0 9445
1 4254
2 8449
3 5371
4 7877
5 4444
143346
39840
Fitting 5 folds for each of 3 candidates, totalling 15 fits


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    1.7s finished
  str(classes[c]))
  str(classes[c]))


{'mean_fit_time': array([0.08636236, 0.08058505, 0.08775525]),
 'mean_score_time': array([0.00897617, 0.008183  , 0.00777864]),
 'mean_test_score': array([0.25714286, 0.41428571, 0.41428571]),
 'mean_train_score': array([0.99285714, 0.41428571, 0.41428571]),
 'param_estimator__bag_of_words__max_features': masked_array(data=[500, 500, 500],
             mask=[False, False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__ngram_range': masked_array(data=[(1, 2), (1, 2), (1, 2)],
             mask=[False, False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__stop_words': masked_array(data=['english', 'english', 'english'],
             mask=[False, False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__classifier__C': masked_array(data=[100.0, 0.1, 0.0001],
             mask=[False, False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__dim_reduct

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


#### Saving the model

In [34]:
with open('./saved_models/svm_trained_mlutils.pkl', 'wb') as saved_model:
    pickle.dump(trained_clf, file=saved_model)

#### Loading the model and testing it

In [35]:
with open('./saved_models/logreg_trained_mlutils.pkl', 'rb') as saved_model:
    loaded_clf = pickle.load(saved_model)
    print(loaded_clf)
    X_test, y_test = _load_comments(test_comments_path)
    y_test_predict = loaded_clf.predict(X_test)

    print(classification_report(y_test, y_test_predict))

OneVsRestClassifier(estimator=Pipeline(memory=None,
     steps=[('bag_of_words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=500, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=T...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          n_jobs=1)
0 6090
1 367
2 3691
3 211
4 3427
5 712
0 2416
[6090  367 3691  211 3427  712]
0 3674
1 2049
2 1275
3 2205
4 3427
5 1704
57735
14334
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       100
          1       0.00      0.00      0.00         7
          2       0.74      0.64      0.69        58
          3       0.00      0.00      0.00         2
          4       0.59      0.52      0.55        52
          5       0.00      0.00      0.00        12

avg / total 

  'precision', 'predicted', average, warn_for)
