In [1]:
import os
import pickle

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from run_multilabel_classifier import _load_comments, run

In [2]:
train_comments_path = os.path.join('../', 'data/train.csv')
test_comments_path = os.path.join('../', 'data/test_final.csv')

## Logistic regression

In [3]:
param_grid = [{
        'estimator__bag_of_words__stop_words': ['english'],
        'estimator__bag_of_words__ngram_range': [(1, 2)],
        'estimator__bag_of_words__max_features': [500],
        'estimator__dim_reduct__n_components': [300],
        'estimator__normalizer__norm': ['l2'],
        'estimator__classifier__C': [10., 100.]
}]


clf = LogisticRegression()

#### Train phase

In [4]:
multilabel_clf = run(param_grid, clf, comments_file=train_comments_path)

0 15294
1 1595
2 8449
3 478
4 7877
5 1405
0 5849
[15294  1595  8449   478  7877  1405]
0 9445
1 4254
2 8449
3 5371
4 7877
5 4444
143346
39840
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 17.0min finished


{'mean_fit_time': array([76.8087019 , 83.87642398]),
 'mean_score_time': array([4.23123312, 4.25043154]),
 'mean_test_score': array([0.64150605, 0.64147019]),
 'mean_train_score': array([0.64772299, 0.6480771 ]),
 'param_estimator__bag_of_words__max_features': masked_array(data=[500, 500],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__ngram_range': masked_array(data=[(1, 2), (1, 2)],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__stop_words': masked_array(data=['english', 'english'],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__classifier__C': masked_array(data=[10.0, 100.0],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__dim_reduct__n_components': masked_array(data=[300, 300],
             mask=[False, False],
       fill_value='?',
 

#### Saving the model

In [5]:
with open('./saved_models/logreg_trained_mlutils.pkl', 'wb') as saved_model:
    pickle.dump(multilabel_clf, file=saved_model)

#### Loading the model and testing it

In [6]:
with open('./saved_models/logreg_trained_mlutils.pkl', 'rb') as saved_model:
    loaded_clf = pickle.load(saved_model)
    print(loaded_clf)
    X_test, y_test = _load_comments(test_comments_path)
    y_test_predict = loaded_clf.predict(X_test)

    print(classification_report(y_test, y_test_predict))

OneVsRestClassifier(estimator=Pipeline(memory=None,
     steps=[('bag_of_words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=500, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=T...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          n_jobs=1)
0 6090
1 367
2 3691
3 211
4 3427
5 712
0 2416
[6090  367 3691  211 3427  712]
0 3674
1 2049
2 1275
3 2205
4 3427
5 1704
57735
14334
             precision    recall  f1-score   support

          0       0.89      0.86      0.88     14102
          1       0.57      0.44      0.50      3528
          2       0.87      0.81      0.84     10848
          3       0.76      0.61      0.68      2884
          4       0.82      0.78      0.80     11030
          5       0.82      0.54      0.65      4179

avg / total 

## Naive bayes

In [7]:
param_grid = [{
        'estimator__bag_of_words__stop_words': ['english'],
        'estimator__bag_of_words__ngram_range': [(1, 2)],
        'estimator__bag_of_words__max_features': [500],
        'estimator__dim_reduct__n_components': [300],
        'estimator__normalizer__norm': ['l2'],
        'estimator__classifier__alpha': [1.0],
        'estimator__classifier__binarize': [0.0]
}]

#### Train phase

In [8]:
multilabel_clf = run(param_grid, BernoulliNB(), comments_file=train_comments_path)

0 15294
1 1595
2 8449
3 478
4 7877
5 1405
0 5849
[15294  1595  8449   478  7877  1405]
0 9445
1 4254
2 8449
3 5371
4 7877
5 4444
143346
39840
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.4min finished


{'mean_fit_time': array([63.33680687]),
 'mean_score_time': array([5.07463994]),
 'mean_test_score': array([0.50657104]),
 'mean_train_score': array([0.50900493]),
 'param_estimator__bag_of_words__max_features': masked_array(data=[500],
             mask=[False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__ngram_range': masked_array(data=[(1, 2)],
             mask=[False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__stop_words': masked_array(data=['english'],
             mask=[False],
       fill_value='?',
            dtype=object),
 'param_estimator__classifier__alpha': masked_array(data=[1.0],
             mask=[False],
       fill_value='?',
            dtype=object),
 'param_estimator__classifier__binarize': masked_array(data=[0.0],
             mask=[False],
       fill_value='?',
            dtype=object),
 'param_estimator__dim_reduct__n_components': masked_array(data=[300],
             mask=[False

#### Saving the model

In [9]:
with open('./saved_models/naiveB_multilabel.pkl', 'wb') as saved_model:
	pickle.dump(multilabel_clf, file=saved_model)

#### Loading the model and testing it

In [10]:
with open('./saved_models/naiveB_multilabel.pkl', 'rb') as saved_model:
    loaded_clf = pickle.load(saved_model)
    print(loaded_clf)
    X_test, y_test = _load_comments(test_comments_path)
    y_test_predict = loaded_clf.predict(X_test)

    print(classification_report(y_test, y_test_predict))

OneVsRestClassifier(estimator=Pipeline(memory=None,
     steps=[('bag_of_words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=500, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=T...norm='l2')), ('classifier', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))]),
          n_jobs=1)
0 6090
1 367
2 3691
3 211
4 3427
5 712
0 2416
[6090  367 3691  211 3427  712]
0 3674
1 2049
2 1275
3 2205
4 3427
5 1704
57735
14334
             precision    recall  f1-score   support

          0       0.82      0.80      0.81     14103
          1       0.35      0.64      0.45      3538
          2       0.74      0.80      0.77     10884
          3       0.51      0.63      0.57      2891
          4       0.73      0.77      0.75     11044
          5       0.44      0.55      0.49      4087

avg / total 

## Decision tree

In [11]:
param_grid = [{
        'estimator__bag_of_words__stop_words': ['english'],
        'estimator__bag_of_words__ngram_range': [(1, 2)],
        'estimator__bag_of_words__max_features': [500],
        'estimator__dim_reduct__n_components': [300],
        'estimator__normalizer__norm': ['l2'],
        'estimator__classifier__max_depth': [5, 10, 15]
}]

#### Train phase

In [12]:
multilabel_clf = run(param_grid, DecisionTreeClassifier(), comments_file=train_comments_path)

0 15294
1 1595
2 8449
3 478
4 7877
5 1405
0 5849
[15294  1595  8449   478  7877  1405]
0 9445
1 4254
2 8449
3 5371
4 7877
5 4444
143346
39840
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 38.5min finished


{'mean_fit_time': array([100.42878318, 131.28491473, 162.20315762]),
 'mean_score_time': array([4.52870488, 4.32165112, 4.42007117]),
 'mean_test_score': array([0.60455401, 0.66752129, 0.71788436]),
 'mean_train_score': array([0.61279695, 0.71593008, 0.7996459 ]),
 'param_estimator__bag_of_words__max_features': masked_array(data=[500, 500, 500],
             mask=[False, False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__ngram_range': masked_array(data=[(1, 2), (1, 2), (1, 2)],
             mask=[False, False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__stop_words': masked_array(data=['english', 'english', 'english'],
             mask=[False, False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__classifier__max_depth': masked_array(data=[5, 10, 15],
             mask=[False, False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__dim_r

#### Saving the model

In [13]:
with open('./saved_models/dec_tree_multilabel.pkl', 'wb') as saved_model:
	pickle.dump(multilabel_clf, file=saved_model)

#### Loading the model and testing it

In [14]:
with open('./saved_models/dec_tree_multilabel.pkl', 'rb') as saved_model:
    loaded_clf = pickle.load(saved_model)
    print(loaded_clf)
    X_test, y_test = _load_comments(test_comments_path)
    y_test_predict = loaded_clf.predict(X_test)

    print(classification_report(y_test, y_test_predict))

OneVsRestClassifier(estimator=Pipeline(memory=None,
     steps=[('bag_of_words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=500, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=T...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
          n_jobs=1)
0 6090
1 367
2 3691
3 211
4 3427
5 712
0 2416
[6090  367 3691  211 3427  712]
0 3674
1 2049
2 1275
3 2205
4 3427
5 1704
57735
14334
             precision    recall  f1-score   support

          0       0.89      0.79      0.84     14124
          1       0.45      0.31      0.37      3474
          2       0.84      0.76      0.80     10942
          3       0.69      0.38      0.49      2873
          4       0.80      0.70      0.75     11120
          5       0.71      0.37      0.48      4105

avg / total 