In [1]:
import os
import pickle

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import run_binary_classifier
import run_multilabel_classifier

In [2]:
# Locations of the CSV inputs, three directory levels above the working directory.
# Binary task: train / test files.
train_binary, test_binary = (
    os.path.join('../../../', csv_name)
    for csv_name in ('data/train_binary.csv', 'data/test_clean_binary.csv')
)

# Multilabel task: train / test files.
train_multilabel, test_multilabel = (
    os.path.join('../../../', csv_name)
    for csv_name in ('data/train.csv', 'data/test_clean.csv')
)

## Logistic regression

In [3]:
# Hyper-parameter grid handed to the binary classifier run.
# Keys follow the `<step>__<parameter>` convention; only the classifier's C
# has more than one candidate value.
binary_param_grid = dict(
    bag_of_words__stop_words=['english'],
    bag_of_words__ngram_range=[(1, 2)],
    bag_of_words__max_features=[500],
    dim_reduct__n_components=[300],
    normalizer__norm=['l2'],
    classifier__C=[5., 10.],
)

# The multilabel grid holds the same candidates, with every key prefixed by
# `estimator__`. Each value list is copied so the two grids share no
# mutable state.
multilabel_param_grid = [{
    'estimator__' + param_name: list(candidates)
    for param_name, candidates in binary_param_grid.items()
}]

#### Train binary

In [4]:
# Fit the binary classifier over `binary_param_grid` using the binary
# training file; `run` returns the fitted object used for prediction later.
binary_clf = run_binary_classifier.run(
    binary_param_grid,
    LogisticRegression(),
    comments_file=train_binary,
)

# Persist the fitted binary model so it can be reloaded without retraining.
with open('./saved_models/log_reg_joint_binary.pkl', 'wb') as saved_model:
    pickle.dump(binary_clf, saved_model)



Fitting 5 folds for each of 2 candidates, totalling 10 fits
             precision    recall  f1-score   support

          0       0.69      0.73      0.71        15
          1       0.71      0.67      0.69        15

avg / total       0.70      0.70      0.70        30

{'bag_of_words__max_features': 500, 'bag_of_words__ngram_range': (1, 2), 'bag_of_words__stop_words': 'english', 'classifier__C': 5.0, 'dim_reduct__n_components': 300, 'normalizer__norm': 'l2'}


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.6s finished


#### Train multilabel

In [5]:
# Fit the multilabel classifier over `multilabel_param_grid` using the
# multilabel training file.
multilabel_clf = run_multilabel_classifier.run(
    multilabel_param_grid,
    LogisticRegression(),
    comments_file=train_multilabel,
)

# Persist the fitted *multilabel* model. This previously dumped `binary_clf`,
# so the multilabel pickle file silently contained the binary model.
with open('./saved_models/log_reg_joint_multilabel.pkl', 'wb') as saved_model:
    pickle.dump(multilabel_clf, file=saved_model)

  str(classes[c]))
  str(classes[c]))


Fitting 5 folds for each of 2 candidates, totalling 10 fits


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


{'mean_fit_time': array([0.11554632, 0.11701474]),
 'mean_score_time': array([0.00883236, 0.00886106]),
 'mean_test_score': array([0.45714286, 0.42857143]),
 'mean_train_score': array([0.78928571, 0.95714286]),
 'param_estimator__bag_of_words__max_features': masked_array(data=[500, 500],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__ngram_range': masked_array(data=[(1, 2), (1, 2)],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__bag_of_words__stop_words': masked_array(data=['english', 'english'],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__classifier__C': masked_array(data=[5.0, 10.0],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_estimator__dim_reduct__n_components': masked_array(data=[300, 300],
             mask=[False, False],
       fill_value='?',
     

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.5s finished
  str(classes[c]))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


#### Predict binary

In [6]:
# Read the binary test file (presumably inputs and 0/1 labels — confirm in
# run_binary_classifier.load_comments) and predict with the fitted binary model.
X_binary_test, y_binary_test = run_binary_classifier.load_comments(test_binary)
y_binary_test_predict = binary_clf.predict(X_binary_test)

# Per-class precision / recall / F1 on the binary test file.
print(
    classification_report(
        y_true=y_binary_test,
        y_pred=y_binary_test_predict,
    )
)

             precision    recall  f1-score   support

          0       0.72      0.82      0.77        50
          1       0.79      0.68      0.73        50

avg / total       0.75      0.75      0.75       100



#### Predict multilabel

In [7]:
# Read the multilabel test file (presumably inputs and a per-label indicator
# matrix — confirm in run_multilabel_classifier.load_comments) and predict
# with the fitted multilabel model.
X_multilabel_test, y_multilabel_test = run_multilabel_classifier.load_comments(test_multilabel)
y_multilabel_test_predict = multilabel_clf.predict(X_multilabel_test)

# Per-label precision / recall / F1 on the multilabel test file.
print(
    classification_report(
        y_true=y_multilabel_test,
        y_pred=y_multilabel_test_predict,
    )
)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       100
          1       0.20      0.14      0.17         7
          2       0.62      0.78      0.69        59
          3       0.00      0.00      0.00         7
          4       0.59      0.64      0.62        50
          5       0.00      0.00      0.00        12

avg / total       0.71      0.76      0.74       235



  'precision', 'predicted', average, warn_for)


### Final joint prediction

In [8]:
# Joint prediction: the binary model decides clean vs. toxic per comment;
# clean comments get an all-zero label row, toxic comments take the
# multilabel model's predicted row.
#
# NOTE(review): this indexes the multilabel predictions with row positions
# taken from the binary predictions, so it assumes both test files list the
# same comments in the same order — confirm against the data files.
if len(y_binary_test_predict) != len(y_multilabel_test_predict):
    raise ValueError(
        'binary and multilabel predictions must cover the same rows: '
        '{} != {}'.format(len(y_binary_test_predict), len(y_multilabel_test_predict))
    )

# Start from a -1 placeholder so any row claimed by neither branch stays visible.
final_predictions = np.full_like(y_multilabel_test_predict, -1)

non_toxic_indices = np.argwhere(y_binary_test_predict == 0).flatten()
toxic_indices = np.argwhere(y_binary_test_predict == 1).flatten()

# place binary classifier's prediction of clean comments
# (scalar 0 broadcasts across every label column, so the number of labels
# is no longer hard-coded to six)
final_predictions[non_toxic_indices] = 0

# comments flagged toxic keep the multilabel classifier's prediction
multilabel_toxic_predictions = y_multilabel_test_predict[toxic_indices]
final_predictions[toxic_indices] = multilabel_toxic_predictions

print(classification_report(y_multilabel_test, final_predictions))

             precision    recall  f1-score   support

          0       1.00      0.43      0.60       100
          1       0.25      0.14      0.18         7
          2       0.56      0.31      0.40        59
          3       0.00      0.00      0.00         7
          4       0.46      0.26      0.33        50
          5       0.00      0.00      0.00        12

avg / total       0.67      0.32      0.43       235



  'precision', 'predicted', average, warn_for)
