In [1]:
import os
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

from sklearn.metrics import classification_report

from run_binary_classifier import _load_comments, run

In [2]:
# Input CSVs live in a shared data directory three levels above the working directory.
data_dir = os.path.join('..', '..', '..', 'data')
train_comments_path = os.path.join(data_dir, 'train_binary.csv')
test_comments_path = os.path.join(data_dir, 'test_clean_binary.csv')

## Dummy classifier

In [3]:
# Grid-search candidates, keyed with scikit-learn's `<step>__<parameter>` syntax.
# The step names (bag_of_words, dim_reduct, normalizer, classifier) are presumably
# defined by the pipeline built inside run() -- not visible in this notebook.
param_grid = dict(
    bag_of_words__stop_words=['english'],
    bag_of_words__ngram_range=[(1, 2)],
    bag_of_words__max_features=[500],
    dim_reduct__n_components=[300],
    normalizer__norm=['l2'],
    classifier__strategy=['uniform']
)

# Baseline estimator. The grid selects strategy='uniform', which draws predictions
# at random, so a fixed seed is set to make the baseline numbers repeatable.
clf = DummyClassifier(random_state=42)

#### Train phase

In [4]:
# Fit the dummy baseline on the training CSV via the project helper `run`
# (recorded output: a 5-fold grid search, then a classification report and the
# selected parameters).
trained_clf = run(
    param_grid,
    clf,
    comments_file=train_comments_path
)



Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   38.3s finished


             precision    recall  f1-score   support

          0       0.50      0.50      0.50      4825
          1       0.51      0.51      0.51      4910

avg / total       0.51      0.51      0.51      9735

{'bag_of_words__max_features': 500, 'bag_of_words__ngram_range': (1, 2), 'bag_of_words__stop_words': 'english', 'classifier__strategy': 'uniform', 'dim_reduct__n_components': 300, 'normalizer__norm': 'l2'}


#### Saving the model

In [5]:
# Persist the fitted dummy model to disk.
model_path = './saved_models/dummy_trained_binary.pkl'

# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists (e.g. on a fresh checkout) before writing.
model_dir = os.path.dirname(model_path)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

with open(model_path, 'wb') as saved_model:
    pickle.dump(trained_clf, file=saved_model)

#### Loading the model and testing it

In [6]:
# Reload the pickled dummy model. Only the unpickling needs the file handle, so
# the `with` block is limited to that step and the file is closed before the
# test data is loaded and scored.
# NOTE: pickle.load can execute arbitrary code; this file is the one written by
# the save cell above, so it is treated as trusted.
with open('./saved_models/dummy_trained_binary.pkl', 'rb') as saved_model:
    loaded_clf = pickle.load(saved_model)

# Score the reloaded model on the test CSV.
X_test, y_test = _load_comments(test_comments_path)
y_test_predict = loaded_clf.predict(X_test)

print(classification_report(y_test, y_test_predict))

             precision    recall  f1-score   support

          0       0.50      0.50      0.50      6242
          1       0.50      0.50      0.50      6243

avg / total       0.50      0.50      0.50     12485



## Logistic regression

In [7]:
# Feature-extraction settings (single candidate each), keyed with scikit-learn's
# `<step>__<parameter>` syntax; step names are presumably defined inside run().
param_grid = {
    'bag_of_words__stop_words': ['english'],
    'bag_of_words__ngram_range': [(1, 2)],
    'bag_of_words__max_features': [500],
    'dim_reduct__n_components': [300],
    'normalizer__norm': ['l2'],
}
# The only parameter actually searched: two candidate values for C.
param_grid['classifier__C'] = [5., 10.]

# Default-constructed estimator; the only classifier setting varied is C, via the
# 'classifier__C' entry of the grid above.
clf = LogisticRegression()

#### Train phase 

In [8]:
# Fit logistic regression on the training CSV via the project helper `run`
# (recorded output: a 5-fold search over 2 candidates, then a classification
# report and the selected parameters).
trained_clf = run(
    param_grid,
    clf,
    comments_file=train_comments_path
)



Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.5min finished


             precision    recall  f1-score   support

          0       0.81      0.91      0.85      4905
          1       0.89      0.78      0.83      4830

avg / total       0.85      0.84      0.84      9735

{'bag_of_words__max_features': 500, 'bag_of_words__ngram_range': (1, 2), 'bag_of_words__stop_words': 'english', 'classifier__C': 10.0, 'dim_reduct__n_components': 300, 'normalizer__norm': 'l2'}


#### Saving the model

In [9]:
# Persist the fitted logistic-regression model to disk.
model_path = './saved_models/log_reg_trained_binary.pkl'

# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists (e.g. on a fresh checkout) before writing.
model_dir = os.path.dirname(model_path)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

with open(model_path, 'wb') as saved_model:
    pickle.dump(trained_clf, file=saved_model)

#### Loading the model and testing it

In [10]:
# Reload the pickled logistic-regression model. Only the unpickling needs the
# file handle, so the `with` block is limited to that step and the file is
# closed before the test data is loaded and scored.
# NOTE: pickle.load can execute arbitrary code; this file is the one written by
# the save cell above, so it is treated as trusted.
with open('./saved_models/log_reg_trained_binary.pkl', 'rb') as saved_model:
    loaded_clf = pickle.load(saved_model)

# Score the reloaded model on the test CSV.
X_test, y_test = _load_comments(test_comments_path)
y_test_predict = loaded_clf.predict(X_test)

print(classification_report(y_test, y_test_predict))

             precision    recall  f1-score   support

          0       0.84      0.85      0.84      6242
          1       0.85      0.84      0.84      6243

avg / total       0.84      0.84      0.84     12485



## Decision tree

In [11]:
# Grid-search candidates as (name, values) pairs, using scikit-learn's
# `<step>__<parameter>` naming; step names are presumably defined inside run().
# Only the tree depth has more than one candidate.
param_grid = dict([
    ('bag_of_words__stop_words', ['english']),
    ('bag_of_words__ngram_range', [(1, 2)]),
    ('bag_of_words__max_features', [500]),
    ('dim_reduct__n_components', [300]),
    ('normalizer__norm', ['l2']),
    ('classifier__max_depth', [5, 10, 15]),
])

# scikit-learn permutes the candidate features randomly at every split, so an
# unseeded tree can differ between runs; fix the seed to make the fitted tree
# repeatable for a given training set.
clf = DecisionTreeClassifier(random_state=42)

#### Train phase

In [12]:
# Fit the decision tree on the training CSV via the project helper `run`
# (recorded output: a 5-fold search over 3 candidates, then a classification
# report and the selected parameters).
trained_clf = run(
    param_grid,
    clf,
    comments_file=train_comments_path
)



Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.3min finished


             precision    recall  f1-score   support

          0       0.75      0.89      0.82      4920
          1       0.86      0.70      0.77      4815

avg / total       0.81      0.80      0.79      9735

{'bag_of_words__max_features': 500, 'bag_of_words__ngram_range': (1, 2), 'bag_of_words__stop_words': 'english', 'classifier__max_depth': 5, 'dim_reduct__n_components': 300, 'normalizer__norm': 'l2'}


#### Saving the model

In [13]:
# Persist the fitted decision-tree model to disk.
model_path = './saved_models/dec_tree_trained_binary.pkl'

# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists (e.g. on a fresh checkout) before writing.
model_dir = os.path.dirname(model_path)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

with open(model_path, 'wb') as saved_model:
    pickle.dump(trained_clf, file=saved_model)

#### Loading the model and testing it

In [14]:
# Reload the pickled decision-tree model. Only the unpickling needs the file
# handle, so the `with` block is limited to that step and the file is closed
# before the test data is loaded and scored.
# NOTE: pickle.load can execute arbitrary code; this file is the one written by
# the save cell above, so it is treated as trusted.
with open('./saved_models/dec_tree_trained_binary.pkl', 'rb') as saved_model:
    loaded_clf = pickle.load(saved_model)

# Score the reloaded model on the test CSV.
X_test, y_test = _load_comments(test_comments_path)
y_test_predict = loaded_clf.predict(X_test)

print(classification_report(y_test, y_test_predict))

             precision    recall  f1-score   support

          0       0.77      0.87      0.82      6242
          1       0.85      0.75      0.79      6243

avg / total       0.81      0.81      0.81     12485



## Naive Bayes classifier

In [15]:
# Feature-extraction settings, keyed with scikit-learn's `<step>__<parameter>`
# syntax; step names are presumably defined inside run().
param_grid = dict(
    bag_of_words__stop_words=['english'],
    bag_of_words__ngram_range=[(1, 2)],
    bag_of_words__max_features=[500],
    dim_reduct__n_components=[300],
    normalizer__norm=['l2']
)
# Classifier settings: one candidate each, so the search evaluates a single configuration.
param_grid.update({
    'classifier__alpha': [1.0],
    'classifier__binarize': [0.0],
})

# Default-constructed estimator; alpha and binarize are set through the
# 'classifier__alpha' / 'classifier__binarize' entries of the grid above.
clf = BernoulliNB()

#### Train phase

In [16]:
# Fit Bernoulli naive Bayes on the training CSV via the project helper `run`
# (recorded output: a 5-fold search over 1 candidate, then a classification
# report and the selected parameters).
trained_clf = run(
    param_grid,
    clf,
    comments_file=train_comments_path
)



Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   41.1s finished


             precision    recall  f1-score   support

          0       0.76      0.79      0.77      4827
          1       0.78      0.75      0.77      4908

avg / total       0.77      0.77      0.77      9735

{'bag_of_words__max_features': 500, 'bag_of_words__ngram_range': (1, 2), 'bag_of_words__stop_words': 'english', 'classifier__alpha': 1.0, 'classifier__binarize': 0.0, 'dim_reduct__n_components': 300, 'normalizer__norm': 'l2'}


#### Saving the model

In [17]:
# Persist the fitted naive-Bayes model to disk.
model_path = './saved_models/naiveB_trained_binary.pkl'

# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists (e.g. on a fresh checkout) before writing.
model_dir = os.path.dirname(model_path)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

with open(model_path, 'wb') as saved_model:
    pickle.dump(trained_clf, file=saved_model)

#### Loading the model and testing it

In [18]:
# Reload the pickled naive-Bayes model. Only the unpickling needs the file
# handle, so the `with` block is limited to that step and the file is closed
# before the test data is loaded and scored.
# NOTE: pickle.load can execute arbitrary code; this file is the one written by
# the save cell above, so it is treated as trusted.
with open('./saved_models/naiveB_trained_binary.pkl', 'rb') as saved_model:
    loaded_clf = pickle.load(saved_model)

# Score the reloaded model on the test CSV.
X_test, y_test = _load_comments(test_comments_path)
y_test_predict = loaded_clf.predict(X_test)

print(classification_report(y_test, y_test_predict))

             precision    recall  f1-score   support

          0       0.80      0.72      0.76      6242
          1       0.75      0.82      0.78      6243

avg / total       0.78      0.77      0.77     12485



## SVM

In [19]:
# Grid-search candidates, keyed with scikit-learn's `<step>__<parameter>` syntax;
# step names are presumably defined inside run(). Only the vectorizer's
# `lowercase` flag has two candidates, and this grid uses 100 components for
# dim_reduct.
param_grid = dict(
    bag_of_words__stop_words=['english'],
    bag_of_words__ngram_range=[(1, 2)],
    bag_of_words__max_features=[500],
    bag_of_words__lowercase=[True, False],
    dim_reduct__n_components=[100],
    normalizer__norm=['l2']
)

# The grid above has no 'classifier__*' entries, so no SVC hyperparameter is
# searched; the estimator keeps its constructor defaults unless run() overrides
# them (not visible in this notebook).
clf = SVC()

#### Train phase

In [20]:
# Fit the SVM on the training CSV via the project helper `run`
# (recorded output: a 5-fold search over 2 candidates taking about 13 minutes,
# then a classification report and the selected parameters).
trained_clf = run(
    param_grid,
    clf,
    comments_file=train_comments_path
)



Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 13.2min finished


             precision    recall  f1-score   support

          0       0.76      0.90      0.82      4817
          1       0.88      0.72      0.79      4918

avg / total       0.82      0.81      0.81      9735

{'bag_of_words__lowercase': True, 'bag_of_words__max_features': 500, 'bag_of_words__ngram_range': (1, 2), 'bag_of_words__stop_words': 'english', 'dim_reduct__n_components': 100, 'normalizer__norm': 'l2'}


#### Saving the model

In [21]:
# Persist the fitted SVM model to disk.
model_path = './saved_models/SVM_trained_binary.pkl'

# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists (e.g. on a fresh checkout) before writing.
model_dir = os.path.dirname(model_path)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

with open(model_path, 'wb') as saved_model:
    pickle.dump(trained_clf, file=saved_model)

#### Loading the model and testing it

In [22]:
# Reload the pickled SVM model. Only the unpickling needs the file handle, so
# the `with` block is limited to that step and the file is closed before the
# test data is loaded and scored.
# NOTE: pickle.load can execute arbitrary code; this file is the one written by
# the save cell above, so it is treated as trusted.
with open('./saved_models/SVM_trained_binary.pkl', 'rb') as saved_model:
    loaded_clf = pickle.load(saved_model)

# Score the reloaded model on the test CSV.
X_test, y_test = _load_comments(test_comments_path)
y_test_predict = loaded_clf.predict(X_test)

print(classification_report(y_test, y_test_predict))

             precision    recall  f1-score   support

          0       0.79      0.85      0.82      6242
          1       0.84      0.78      0.80      6243

avg / total       0.81      0.81      0.81     12485

