In [1]:
import os
import pickle

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

import run_binary_classifier
import run_multilabel_classifier

In [2]:
train_binary = os.path.join('../../../', 'data/train_binary.csv')
test_binary = os.path.join('../../../', 'data/test_clean_binary.csv')

train_multilabel = os.path.join('../../../', 'data/train.csv')
test_multilabel = os.path.join('../../../', 'data/test_clean.csv')

In [9]:
import pandas as pd

In [18]:
trb = pd.read_csv(train_binary)
tstb = pd.read_csv(test_binary)
trm = pd.read_csv(train_multilabel)
tstm = pd.read_csv(test_multilabel)

In [19]:
trb = trb.values[:, 2:]
tstb = tstb.values[:, 2:]
trm = trm.values[:, 2:]
tstm = tstm.values[:, 2:]

In [20]:
print(trb.view())
print(tstb.view())
print(trm.view())
print(tstm.view())

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 ...
 [0]
 [1]
 [0]]
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ...
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ...
 [0 0 0 0 0 0]
 [1 0 1 0 1 0]
 [0 0 0 0 0 0]]


In [13]:
tstb.view()

array([[0,
        'Thank you for understanding. I think very highly of you and would not revert without discussion.',
        0],
       [1, ':Dear god this site is horrible.', 0],
       [2,
        '"::: Somebody will invariably try to add Religion?  Really??  You mean, the way people have invariably kept adding ""Religion"" to the Samuel Beckett infobox?  And why do you bother bringing up the long-dead completely non-existent ""Influences"" issue?  You\'re just flailing, making up crap on the fly. \n ::: For comparison, the only explicit acknowledgement in the entire Amos Oz article that he is personally Jewish is in the categories!    \n\n "',
        0],
       ...,
       [63975,
        '==shame on you all!!!== \n\n You want to speak about gays and not about romanians...',
        0],
       [63976,
        'MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MOVIES. HE HAS SO MUCH BUTTSEX THAT HIS ASSHOLE IS NOW BIG ENOUGH TO BE CONSIDERED A COUNTRY.',
        1],
       [63977,
     

## Logistic regression

In [3]:
binary_param_grid = {
        'bag_of_words__stop_words': ['english'],
        'bag_of_words__ngram_range': [(1, 2)],
        'bag_of_words__max_features': [500],
        'dim_reduct__n_components': [300],
        'normalizer__norm': ['l2'],
        'classifier__C': [5., 10.]
}


multilabel_param_grid  = [{
        'estimator__bag_of_words__stop_words': ['english'],
        'estimator__bag_of_words__ngram_range': [(1, 2)],
        'estimator__bag_of_words__max_features': [500],
        'estimator__dim_reduct__n_components': [300],
        'estimator__normalizer__norm': ['l2'],
        'estimator__classifier__C': [5., 10.]
}]

#### Train binary

In [4]:
binary_clf = run_binary_classifier.run(binary_param_grid, LogisticRegression(), comments_file=train_binary)

with open('./saved_models/log_reg_joint_binary.pkl', 'wb') as saved_model:
	pickle.dump(binary_clf, file=saved_model)



Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.6min finished


             precision    recall  f1-score   support

          0       0.81      0.92      0.86      4883
          1       0.90      0.78      0.84      4852

avg / total       0.86      0.85      0.85      9735

{'bag_of_words__max_features': 500, 'bag_of_words__ngram_range': (1, 2), 'bag_of_words__stop_words': 'english', 'classifier__C': 10.0, 'dim_reduct__n_components': 300, 'normalizer__norm': 'l2'}


#### Train multilabel

In [5]:
multilabel_clf = run_multilabel_classifier.run(multilabel_param_grid, LogisticRegression(), comments_file=train_multilabel)
with open('./saved_models/log_reg_joint_multilabel.pkl', 'wb') as saved_model:
	pickle.dump(multilabel_clf, file=saved_model)

0 15294
1 1595
2 8449
3 478
4 7877
5 1405
0 5849
[15294  1595  8449   478  7877  1405]
0 9445
1 4254
2 8449
3 5371
4 7877
5 4444
143346
39840
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 68.1min finished


             precision    recall  f1-score   support

          0       0.99      1.00      0.99     73851
          1       0.85      0.76      0.80     25075
          2       0.95      0.98      0.97     55793
          3       0.84      0.68      0.75      7915
          4       0.94      0.99      0.96     69097
          5       0.93      0.79      0.85     22484

avg / total       0.94      0.94      0.94    254215

{'estimator__bag_of_words__max_features': 500,
 'estimator__bag_of_words__ngram_range': (1, 2),
 'estimator__bag_of_words__stop_words': 'english',
 'estimator__classifier__C': 10.0,
 'estimator__dim_reduct__n_components': 300,
 'estimator__normalizer__norm': 'l2'}


#### Predict binary

In [6]:
X_binary_test, y_binary_test = run_binary_classifier.load_comments(test_binary)
y_binary_test_predict = binary_clf.predict(X_binary_test)
print(classification_report(y_binary_test, y_binary_test_predict))

             precision    recall  f1-score   support

          0       0.84      0.84      0.84      6242
          1       0.84      0.84      0.84      6243

avg / total       0.84      0.84      0.84     12485



#### Predict multilabel

In [7]:
X_multilabel_test, y_multilabel_test = run_multilabel_classifier.load_comments(test_multilabel)
y_multilabel_test_predict = multilabel_clf.predict(X_multilabel_test)
print(classification_report(y_multilabel_test, y_multilabel_test_predict))

0 6090
1 367
2 3691
3 211
4 3427
5 712
0 2416
[6090  367 3691  211 3427  712]
0 3674
1 2049
2 1275
3 2205
4 3427
5 1704
57735
14334
             precision    recall  f1-score   support

          0       0.49      1.00      0.66     14108
          1       0.50      0.44      0.47      3466
          2       0.51      0.94      0.66     10889
          3       0.65      0.67      0.66      2880
          4       0.43      0.98      0.60     11049
          5       0.71      0.67      0.69      4142

avg / total       0.51      0.89      0.63     46534



### Final joint prediction

In [8]:
final_predictions = np.full_like(y_multilabel_test_predict, -1)


non_toxic_indices = np.argwhere(y_binary_test_predict == 0).flatten()
toxic_indices = np.argwhere(y_binary_test_predict == 1).flatten()

# place binary classifier's prediction of clean comments
final_predictions[non_toxic_indices] = np.array([0, 0, 0, 0, 0, 0])

multilabel_toxic_predictions = y_multilabel_test_predict[toxic_indices]
final_predictions[toxic_indices] = multilabel_toxic_predictions

print(classification_report(y_multilabel_test, final_predictions))

ValueError: Mix type of y not allowed, got types {'multilabel-indicator', 'multiclass-multioutput'}

## Naive Bayes

In [None]:
binary_param_grid = {
        'bag_of_words__stop_words': ['english'],
        'bag_of_words__ngram_range': [(1, 2)],
        'bag_of_words__max_features': [500],
        'dim_reduct__n_components': [300],
        'normalizer__norm': ['l2'],
        'classifier__alpha': [1.0],
        'classifier__binarize': [0.0]
}

multilabel_param_grid = [{
        'estimator__bag_of_words__stop_words': ['english'],
        'estimator__bag_of_words__ngram_range': [(1, 2)],
        'estimator__bag_of_words__max_features': [500],
        'estimator__dim_reduct__n_components': [300],
        'estimator__normalizer__norm': ['l2'],
        'estimator__classifier__alpha': [1.0],
        'estimator__classifier__binarize': [0.0]
}]

#### Train binary

In [None]:
binary_clf = run_binary_classifier.run(binary_param_grid, BernoulliNB(), comments_file=train_binary)

with open('./saved_models/naiveB_joint_binary.pkl', 'wb') as saved_model:
	pickle.dump(binary_clf, file=saved_model)

#### Train multilabel

In [None]:
multilabel_clf = run_multilabel_classifier.run(multilabel_param_grid, BernoulliNB(), comments_file=train_multilabel)
with open('./saved_models/naiveB_joint_binary.pkl', 'wb') as saved_model:
	pickle.dump(binary_clf, file=saved_model)

#### Predict binary

In [None]:
X_binary_test, y_binary_test = run_binary_classifier.load_comments(test_binary)
y_binary_test_predict = binary_clf.predict(X_binary_test)

print(classification_report(y_binary_test, y_binary_test_predict))

#### Predict multilabel

In [None]:
X_multilabel_test, y_multilabel_test = run_multilabel_classifier.load_comments(test_multilabel)
y_multilabel_test_predict = multilabel_clf.predict(X_multilabel_test)

print(classification_report(y_multilabel_test, y_multilabel_test_predict))

## Final joint prediction

In [None]:
final_predictions = np.full_like(y_multilabel_test_predict, -1)


non_toxic_indices = np.argwhere(y_binary_test_predict == 0).flatten()
toxic_indices = np.argwhere(y_binary_test_predict == 1).flatten()

# place binary classifier's prediction of clean comments
final_predictions[non_toxic_indices] = np.array([0, 0, 0, 0, 0, 0])

multilabel_toxic_predictions = y_multilabel_test_predict[toxic_indices]
final_predictions[toxic_indices] = multilabel_toxic_predictions

print(classification_report(y_multilabel_test, final_predictions))

## Decision tree

In [None]:
binary_param_grid = {
        'bag_of_words__stop_words': ['english'],
        'bag_of_words__ngram_range': [(1, 2)],
        'bag_of_words__max_features': [500],
        'dim_reduct__n_components': [300],
        'normalizer__norm': ['l2'],
        'classifier__max_depth': [5, 10, 15]
}

multilabel_param_grid = [{
        'estimator__bag_of_words__stop_words': ['english'],
        'estimator__bag_of_words__ngram_range': [(1, 2)],
        'estimator__bag_of_words__max_features': [500],
        'estimator__dim_reduct__n_components': [300],
        'estimator__normalizer__norm': ['l2'],
        'estimator__classifier__max_depth': [5, 10, 15]
}]

#### Train binary

In [None]:
binary_clf = run_binary_classifier.run(binary_param_grid, DecisionTreeClassifier(), comments_file=train_binary)

with open('./saved_models/dec_tree_joint_binary.pkl', 'wb') as saved_model:
	pickle.dump(binary_clf, file=saved_model)

#### Train multilabel

In [None]:
multilabel_clf = run_multilabel_classifier.run(multilabel_param_grid, DecisionTreeClassifier(), comments_file=train_multilabel)
with open('./saved_models/dec_tree_joint_binary.pkl', 'wb') as saved_model:
	pickle.dump(binary_clf, file=saved_model)

#### Predict binary

In [None]:
X_binary_test, y_binary_test = run_binary_classifier.load_comments(test_binary)
y_binary_test_predict = binary_clf.predict(X_binary_test)

print(classification_report(y_binary_test, y_binary_test_predict))

#### Predict multilabel

In [None]:
X_multilabel_test, y_multilabel_test = run_multilabel_classifier.load_comments(test_multilabel)
y_multilabel_test_predict = multilabel_clf.predict(X_multilabel_test)

print(classification_report(y_multilabel_test, y_multilabel_test_predict))

## Final joint prediction

In [None]:
final_predictions = np.full_like(y_multilabel_test_predict, -1)


non_toxic_indices = np.argwhere(y_binary_test_predict == 0).flatten()
toxic_indices = np.argwhere(y_binary_test_predict == 1).flatten()

# place binary classifier's prediction of clean comments
final_predictions[non_toxic_indices] = np.array([0, 0, 0, 0, 0, 0])

multilabel_toxic_predictions = y_multilabel_test_predict[toxic_indices]
final_predictions[toxic_indices] = multilabel_toxic_predictions

print(classification_report(y_multilabel_test, final_predictions))