In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

In [2]:
stop_words = set(stopwords.words('english'))

train = pd.read_csv('train.csv', index_col=0)
# test = pd.read_csv('test.csv')
# test_labels = pd.read_csv('test_labels.csv')

X = train.iloc[:, 0]
y = train.iloc[:, 1:]

In [3]:
NB_pl = Pipeline([
    ('vectorizer', CountVectorizer(binary=True)),
    ('mnb', OneVsRestClassifier(BernoulliNB()))
    ])

SVM_pl = Pipeline([
    ('vectorizer', CountVectorizer(binary=True)),
    ('clf', OneVsRestClassifier(LinearSVC()))
    ])

LR_pl = Pipeline([
    ('vectorizer', CountVectorizer(binary=True)),
    ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

RF_pl = Pipeline([
    ('vectorizer', CountVectorizer(binary=True)),
    ('clf', OneVsRestClassifier(RandomForestClassifier()))
    ])

In [4]:
kf = KFold(n_splits=5, shuffle=True)

scores_NB = []

for train_idx, test_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx].values, y.iloc[train_idx, :].values
    X_test, y_test = X.iloc[test_idx].values, y.iloc[test_idx, :].values

    NB_pl.fit(X_train, y_train)
    y_pred = NB_pl.predict_proba(X_test)
    score = roc_auc_score(y_test, y_pred)
    scores_NB.append(score)
    print(score)
    
print('MEAN: {}'.format(np.mean(scores_NB)) )

0.9062120207919269
0.9119533691286069
0.910689987128161
0.908464809060519
0.9078365565129053
MEAN: 0.9090313485244238


In [6]:
kf = KFold(n_splits=5, shuffle=True)

scores_SVC = []

for train_idx, test_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx].values, y.iloc[train_idx, :].values
    X_test, y_test = X.iloc[test_idx].values, y.iloc[test_idx, :].values
    
    SVM_pl.fit(X_train, y_train)
    y_pred = SVM_pl.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    scores_SVC.append(score)
    print(score)
    
print('MEAN: {}'.format(np.mean(scores_SVC)) )

0.7564507059714446
0.7640329274865851
0.7693577129913266
0.7549248042584734
0.7592589255907579
MEAN: 0.7608050152597176


In [7]:
"""
kf = KFold(n_splits=5, shuffle=True)

scores_LR = []

for train_idx, test_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx].values, y.iloc[train_idx, :].values
    X_test, y_test = X.iloc[test_idx].values, y.iloc[test_idx, :].values
    
    LR_pl.fit(X_train, y_train)
    y_pred = LR_pl.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    scores_LR.append(score)
    print(score)
    
print('MEAN: {}'.format(np.mean(scores_LR)) )
"""

"\nkf = KFold(n_splits=5, shuffle=True)\n\nscores_LR = []\n\nfor train_idx, test_idx in kf.split(X):\n    X_train, y_train = X.iloc[train_idx].values, y.iloc[train_idx, :].values\n    X_test, y_test = X.iloc[test_idx].values, y.iloc[test_idx, :].values\n    \n    LR_pl.fit(X_train, y_train)\n    y_pred = LR_pl.predict(X_test)\n    score = roc_auc_score(y_test, y_pred)\n    scores_LR.append(score)\n    print(score)\n    \nprint('MEAN: {}'.format(np.mean(scores_LR)) )\n"

In [9]:
kf = KFold(n_splits=5, shuffle=True)

scores_RF = []

for train_idx, test_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx].values, y.iloc[train_idx, :].values
    X_test, y_test = X.iloc[test_idx].values, y.iloc[test_idx, :].values
    
    RF_pl.fit(X_train, y_train)
    y_pred = RF_pl.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    scores_RF.append(score)
    print(score)
    
print('MEAN: {}'.format(np.mean(scores_RF)) )

0.601998285330387
0.6122517539543583
0.6026882544141264
0.6085768942693336
0.6064119138966888
MEAN: 0.6063854203729788


# Pytorch implementation

In [15]:
import numpy as np
import nltk
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, RegexpStemmer
from string import punctuation

torch.manual_seed(1)

print('number of CUDA devices: ',torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
device = torch.cuda.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

number of CUDA devices:  1
Quadro K2200
<torch.cuda.device object at 0x7fd3fe800320>


In [16]:
def make_bow_vector(sentence, word_to_ix):
    vec = np.zeros(len(word_to_ix))
    for word in sentence:
        vec[word_to_ix[word]] += 1
    return vec

def make_target(label):
    tmp = []
    for lab in label:
        tmp.append(lab)
    return tuple( np.array(tmp) )

class BoWClassifier(nn.Module):  # inheriting from nn.Module!

    def __init__(self, num_labels, vocab_size, n_hidden1, n_hidden2):
        super(BoWClassifier, self).__init__()
        self.hidden1 = nn.Linear(vocab_size, n_hidden1)
        self.hidden2 = nn.Linear(n_hidden1, n_hidden2)
        self.out = nn.Linear(n_hidden2, num_labels)

    def forward(self, x_val):
        x = Variable(x_val, requires_grad=False)
        x = F.relu(self.hidden1(x))
        x = F.relu(self.hidden2(x))
        x = F.sigmoid(self.out(x))
        return x

In [18]:
stop = stopwords.words('english')

stop.append("!")
stop.append(',')
stop.append('')
stop.append('=')
stop = list(stop)
re = RegexpStemmer('[0-9]+')

 # Preprocess text
def preprocess_text(string):
    # Annoying things!
    string = string.replace("=", "")
    string = string.replace("-", "")
    string = string.replace("'", "")
    tokens = word_tokenize(str(string))
    # Punctuations
    tokens = [re.stem(token.lower()) for token in tokens if token not in punctuation and token not in stop]
    return (tokens)

In [21]:
X_nn = train.comment_text.apply(preprocess_text)
Y_nn = train.iloc[:, 1:]

# Make naive BOW-presentation
word_to_ix = {}
for sent in X_nn:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [22]:
VOCAB_SIZE = len(word_to_ix)
HIDDEN1 = 32
HIDDEN2 = 16
NUM_LABELS = 6

model = BoWClassifier(NUM_LABELS, VOCAB_SIZE, HIDDEN1, HIDDEN2)
optimizer = optim.Adam(model.parameters())
# MultiLabelSoftMarginLoss - 
epochs = 5
batch_size = 1000
num_batches = int(len(X) * 0.9) // batch_size
# critize = nn.MultiLabelSoftMarginLoss()
critize = nn.BCELoss()

X_train = X_nn[:int(len(X_nn) * 0.95)]
y_train = Y_nn[:int(len(Y_nn) * 0.95)]

X_test = X_nn[int(len(X_nn) * 0.95) + 1 :]
y_test = Y_nn[int(len(Y_nn) * 0.95) + 1 :]

loss_table = []
score_table = []
for epoch in range(epochs):
    losses = []
    for batch in range(num_batches):
        start, end = batch * batch_size, (batch + 1) * batch_size
        # Fetch the part of data neededVariable
        X_batch = X_train[start:end]
        y_batch = y_train[start:end]
        # Prepare data to BOW and make torch vectors
        X_batch = np.array([make_bow_vector(comment, word_to_ix) for comment in X_batch])
        y_batch = np.array([ make_target(y_batch.iloc[i]) for i in range(len(y_batch))])
        X_batch = torch.from_numpy(X_batch).float()
        y_batch = Variable(torch.from_numpy(y_batch).float(), requires_grad=False)
        # GD towards opt
        model.zero_grad()
        output_fw = model.forward(X_batch)

        loss = critize(output_fw, y_batch)
        loss.backward()
        optimizer.step()
        losses.append(loss.data.mean())

    X_batch = X_test
    y_batch = y_test
    X_batch = np.array([make_bow_vector(comment, word_to_ix) for comment in X_batch])
    y_batch = np.array([ make_target(y_batch.iloc[i]) for i in range(len(y_batch))])
    X_batch = torch.from_numpy(X_batch).float()
    # GD towards opt
    model.zero_grad()
    output_fw = model.forward(X_batch)
    score = roc_auc_score(y_batch, output_fw.data.numpy())

    print('[%d/%d] Training-error: %.3f' % (epoch+1, epochs, np.mean(losses)))
    print('[%d/%d] ROC-score:  %.3f' % (epoch+1, epochs, score))
    
    loss_table.append(np.mean(losses))
    score_table.append(score)

[1/5] Training-error: 0.336
[1/5] ROC-score:  0.831
[2/5] Training-error: 0.103
[2/5] ROC-score:  0.929
[3/5] Training-error: 0.064
[3/5] ROC-score:  0.940
[4/5] Training-error: 0.050
[4/5] ROC-score:  0.950
[5/5] Training-error: 0.041
[5/5] ROC-score:  0.957


In [25]:
import gensim

def read_corpus(X):
    for i, line in enumerate(X):
        line = " ".join(line)
        yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

d2v_model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=55)

corpus = list(read_corpus(X_nn))
d2v_model.build_vocab(corpus)
%time d2v_model.train(corpus, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

CPU times: user 14min 52s, sys: 39.1 s, total: 15min 31s
Wall time: 6min 35s


In [26]:
VOCAB_SIZE = 100
HIDDEN1 = 256
HIDDEN2 = 128
NUM_LABELS = 6

model = BoWClassifier(NUM_LABELS, VOCAB_SIZE, HIDDEN1, HIDDEN2)
optimizer = optim.Adam(model.parameters())

epochs = 5
batch_size = 1000
num_batches = int(len(X) * 0.8) // batch_size
#critize = nn.MultiLabelSoftMarginLoss()

critize = nn.BCELoss()

X_train = X_nn[:int(len(X_nn) * 0.95)]
y_train = Y_nn[:int(len(Y_nn) * 0.95)]

X_test = X_nn[int(len(X_nn) * 0.95) + 1 :]
y_test = Y_nn[int(len(Y_nn) * 0.95) + 1 :]

loss_table = []
score_table = []
for epoch in range(epochs):
    losses = []
    for batch in range(num_batches):
        start, end = batch * batch_size, (batch + 1) * batch_size
        # Fetch the part of data needed
        X_batch = X_train[start:end]
        y_batch = y_train[start:end]
        # Prepare data to BOW and make torch vectors
        X_batch = np.array([d2v_model.infer_vector(comment) for comment in X_batch])
        y_batch = np.array([ make_target(y_batch.iloc[i]) for i in range(len(y_batch))])
        
        X_batch = torch.from_numpy(X_batch).float()
        y_batch = Variable(torch.from_numpy(y_batch).float(), requires_grad=False)
        # GD towards opt
        model.zero_grad()
        output_fw = model.forward(X_batch)

        loss = critize(output_fw, y_batch)
        loss.backward()
        optimizer.step()
        losses.append(loss.data.mean())
        
    X_batch = X_test
    y_batch = y_test
    X_batch = np.array([d2v_model.infer_vector(comment) for comment in X_batch])
    y_batch = np.array([ make_target(y_batch.iloc[i]) for i in range(len(y_batch))])
    X_batch = torch.from_numpy(X_batch).float()
    # GD towards opt
    model.zero_grad()
    output_fw = model.forward(X_batch)
    score = roc_auc_score(y_batch, output_fw.data.numpy())

    print('[%d/%d] Training-error: %.3f' % (epoch+1, epochs, np.mean(losses)))
    print('[%d/%d] ROC-score:  %.3f' % (epoch+1, epochs, score))
    
    loss_table.append(np.mean(losses))
    score_table.append(score)

[1/5] Training-error: 0.181
[1/5] ROC-score:  0.830
[2/5] Training-error: 0.108
[2/5] ROC-score:  0.858
[3/5] Training-error: 0.099
[3/5] ROC-score:  0.873
[4/5] Training-error: 0.097
[4/5] ROC-score:  0.880
[5/5] Training-error: 0.095
[5/5] ROC-score:  0.885
