In [0]:
import numpy as np
import glob
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

Using TensorFlow backend.


In [0]:
def print_result(y_pred, y_test, clf_name):
    matrix = confusion_matrix(y_test, y_pred)
    TP, FP = matrix[0]
    FN, TN = matrix[1]
    PPV = (TP * 1.0) / (TP + FP)
    TPR = (TP * 1.0) / (TP + FN)
    TNR = (FP * 1.0) / (TN + FP)
    ACC = (TP + TN) * 1.0 / (TP + TN + FP + FN)
    F1 = 2.0 * PPV * TPR / (PPV + TPR)
    print("%s\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f" %
          (clf_name, PPV, TPR, TNR, ACC, F1))

In [0]:
with open('train_neg.dt', 'rb') as datafile:
    train_neg = pickle.load(datafile)
with open('train_pos.dt', 'rb') as datafile:
    train_pos = pickle.load(datafile)
with open('val_neg.dt', 'rb') as datafile:
    val_neg = pickle.load(datafile)
with open('val_pos.dt', 'rb') as datafile:
    val_pos = pickle.load(datafile)
with open('test_neg.dt', 'rb') as datafile:
    test_neg = pickle.load(datafile)
with open('test_pos.dt', 'rb') as datafile:
    test_pos = pickle.load(datafile)

In [0]:
X_data = train_neg + train_pos + val_neg + val_pos + test_neg + test_pos

In [0]:
vectorizer = TfidfVectorizer(min_df=5, sublinear_tf=True)

In [0]:
X = vectorizer.fit_transform(X_data)

In [0]:
print(X.shape)

(50000, 8242)


In [0]:
X_train = X[:30000]
X_val = X[30000:40000]
X_test = X[40000:]

In [0]:
y_train = [0]*15000 + [1]*15000
y_val = [0]*5000 + [1]*5000
y_test = [0]*5000 + [1]*5000

In [0]:
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=64, epochs=10)
y_pred = model.predict_classes(X_test)
print_result(y_pred, y_test, 'Deep learning standard: ')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               1055104   
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 1,071,745
Trainable params: 1,071,745
Non-trainable params: 0
___________

In [0]:
from sklearn.svm import LinearSVC

In [0]:
linear_svm = LinearSVC(C=1)

In [0]:
print(type(X_train))

<class 'scipy.sparse.csr.csr_matrix'>


In [0]:
linear_svm.fit(X_train, y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [0]:
y_pred_svm = linear_svm.predict(X_test)

In [0]:
print_result(y_pred_svm, y_test, 'Linear svm: ')

Linear svm: 	0.87460	0.88540	0.12389	0.88070	0.87997
