In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, f1_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

In [None]:
def plot_roc(targets, probs):
    """Draw and display the ROC curve for a set of predicted scores.

    Args:
        targets: Ground-truth labels, forwarded to ``roc_curve`` and
            ``roc_auc_score``.
        probs: Predicted scores for the same samples, in the same order.

    The curve is drawn on matplotlib figure 1 together with a dashed black
    diagonal from (0, 0) to (1, 1); the legend shows the AUC rounded to three
    decimals. The figure is shown via ``plt.show()`` and nothing is returned.
    """
    false_pos, true_pos, _thresholds = roc_curve(targets, probs)
    area = roc_auc_score(targets, probs)

    # Work on an explicit Axes handle instead of the implicit pyplot state.
    axes = plt.figure(1).gca()
    axes.plot([0, 1], [0, 1], 'k--')  # chance-level reference diagonal
    axes.plot(false_pos, true_pos, label='AUC {:.3f}'.format(area))
    axes.set_title('ROC curve')
    axes.set_xlabel('False positive rate')
    axes.set_ylabel('True positive rate')
    axes.legend(loc='lower right')
    plt.show()

### Random Forest Classifier with 100 trees

In [None]:
# Load the test set used to evaluate the random forest.
# Later cells read its 'question_vector_avg' and 'target' columns.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
random_forest_test = joblib.load('random_forest_test.pkl')

In [None]:
# Load the fitted classifier; later cells call its predict / predict_proba.
# presumably the balanced 100-tree random forest named in the section
# heading -- TODO confirm against the training notebook.
# NOTE(review): joblib.load unpickles the file -- trusted artifacts only.
forest = joblib.load('RandomForestBalanced.pkl')

In [None]:
# Materialise the per-question feature vectors once and reuse them for both
# calls, instead of converting the column to a list twice.
rf_features = list(random_forest_test['question_vector_avg'])

# Hard class predictions (used for the F1-score in a later cell).
rf_result = forest.predict(rf_features)
# Second column of predict_proba, i.e. the score for forest.classes_[1]
# (presumably the positive class -- confirm against the label encoding).
rf_probs = forest.predict_proba(rf_features)[:, 1]

plot_roc(random_forest_test['target'], rf_probs)

In [None]:
# F1-score of the random forest's hard predictions against the test labels.
rf_f1 = f1_score(random_forest_test['target'], rf_result)
print('F1-score = %.6f' % rf_f1)

### LSTM

In [None]:
# Load the LSTM training and testing sets.
# The training set is used below to derive the vocabulary size and to fit
# the tokenizer; the testing set is what the model is scored on.
# NOTE(review): joblib.load unpickles these files -- trusted artifacts only.
lstm_train = joblib.load('lstm_train.pkl')
lstm_test = joblib.load('lstm_test.pkl')

In [None]:
# Load the trained LSTM classifier used for prediction below.
# NOTE(review): the model is deserialised with joblib (pickle) rather than a
# Keras-native loader; presumably it was saved with joblib.dump -- confirm.
# Unpickling can execute arbitrary code, so only load trusted artifacts.
model = joblib.load('LSTMBalanced.pkl')

In [None]:
# Number of distinct tokens in the training questions, where a token is
# whatever a plain split on a single space produces.
# NOTE(review): presumably this must mirror how vocab_size was computed when
# the model was trained -- do not change the splitting rule without checking.
vocab_size = len({
    token
    for question in lstm_train['question_text_clean']
    for token in question.split(' ')
})

In [None]:
### Create sequence
# Fit the tokenizer on the training questions so that test text can be
# converted to integer sequences; num_words is set to the vocabulary size
# computed from the training set.
vocabulary_size = vocab_size
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(lstm_train['question_text_clean'])

In [None]:
# Encode the test questions with the tokenizer fitted on the training set and
# pad/truncate every sequence to 32 tokens.
# NOTE(review): 32 presumably matches the input length the model was trained
# with -- confirm against the training notebook.
sequences_test = tokenizer.texts_to_sequences(lstm_test['question_text_clean'])
data_test = pad_sequences(sequences_test, maxlen=32)

# Run inference once and derive both outputs from the same scores.
# Sequential.predict_classes was removed from Keras in TensorFlow 2.6; for a
# single-unit output it was defined as (proba > 0.5).astype('int32'), which is
# reproduced here (same shape and dtype). The .ravel() below, fed to
# roc_curve alongside one label per sample, only works with one score per
# sample, so the single-unit case is the one that applies.
lstm_scores = model.predict(data_test)
lstm_result = (lstm_scores > 0.5).astype('int32')
lstm_probs = lstm_scores.ravel()

plot_roc(lstm_test['target'], lstm_probs)

In [None]:
# F1-score of the LSTM's hard class predictions against the test labels.
lstm_f1 = f1_score(lstm_test['target'], lstm_result)
print('F1-score = %.6f' % lstm_f1)