In [1]:
import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides any deprecation or convergence warnings the
# libraries used below may emit — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
import csv
import os
import pickle

import numpy as np
import pandas as pd
from mlxtend.classifier import StackingCVClassifier
from scipy import sparse
from scipy.sparse import hstack
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC
from tqdm import tqdm_notebook as tqdm

from utils import get_data, evaluate_model, WrapperClassifier, pred_for_sparce
from word2vec import Word2VecVectorizer

# Load data

In [3]:
# Submission switch. Set to True to train the classifiers on all of the
# training data without holding out a validation split. The flag is passed to
# get_data() and also selects which Word2Vec training pickle is loaded below.
to_subm = False

In [4]:
# get_data returns three groups, unpacked here as: the x_* splits
# (train/val/test), their tokenized counterparts, and the train/val targets.
# The tokens were generated using 'razdel' and 'maru'.
(
    (x_train, x_val, x_test),
    (x_tokens_train, x_tokens_val, x_tokens_test),
    (y_train, y_val),
) = get_data(to_subm)

# Make features

## TF-IDF Chars

In [6]:
# Character-level TF-IDF: each document is a list of tokens, which the
# preprocessor joins back into one space-separated string before character
# n-grams of length 1 to 5 are extracted.
char_vectorizer = TfidfVectorizer(
    # NOTE: a custom preprocessor replaces scikit-learn's built-in
    # preprocessing stage, so `strip_accents` below and the default
    # lowercasing are NOT applied to the text.
    preprocessor=lambda x: ' '.join(x),
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 5),
    norm='l2',
    # An integer min_df must be >= 1 in recent scikit-learn releases (0 is
    # rejected by parameter validation). 1 keeps every n-gram, exactly as the
    # previous value of 0 did, because every vocabulary entry occurs in at
    # least one document.
    min_df=1,
    smooth_idf=False,
    max_features=None
)

# Fit the vocabulary and IDF weights on the training tokens only, then reuse
# them for the validation and test splits.
x_train_char = char_vectorizer.fit_transform(tqdm(x_tokens_train, desc='Train:'))
x_val_char = char_vectorizer.transform(tqdm(x_tokens_val, desc='Val:'))
x_test_char = char_vectorizer.transform(tqdm(x_tokens_test, desc='Test:'))

HBox(children=(IntProgress(value=0, description='Train:', max=89973), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Val:', max=22494), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Test:', max=112466), HTML(value='')))

## TF-IDF Words

In [7]:
# Word-level TF-IDF over the pre-tokenized documents. Both the tokenizer and
# the preprocessor are pass-through functions, so each document's token list
# is used as-is and only unigrams are counted.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=lambda tokens: tokens,
    preprocessor=lambda doc: doc,
    stop_words=[],
    ngram_range=(1, 1),
    min_df=1,
)

# The vocabulary is learned from the training split only.
x_train_word = tfidf_vectorizer.fit_transform(tqdm(x_tokens_train, desc='Train:'))

# Validation and test are transformed, in that order, with the fitted vectorizer.
x_val_word, x_test_word = (
    tfidf_vectorizer.transform(tqdm(split_tokens, desc=label))
    for split_tokens, label in ((x_tokens_val, 'Val:'), (x_tokens_test, 'Test:'))
)

HBox(children=(IntProgress(value=0, description='Train:', max=89973), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Val:', max=22494), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Test:', max=112466), HTML(value='')))

## Word2Vec features

In [8]:
# The Word2Vec features are loaded from pickles instead of being recomputed.
# The commented-out code below appears to be how those pickles were produced;
# it is kept for reference only — TODO confirm before relying on it.
# vectorizer = Word2VecVectorizer('./data/word2vec.bin')
# x_train_w2v = vectorizer.transform(tqdm(x_train, desc='Train:'))
# x_val_w2v = vectorizer.transform(tqdm(x_val, desc='Val:'))
# x_test_w2v = vectorizer.transform(tqdm(x_test, desc='Test:'))


def load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes, even if unpickling raises.

    NOTE: unpickling can execute arbitrary code; only load files that were
    produced by this project.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# In submission mode the training features cover the full training set.
if to_subm:
    x_train_w2v = load_pickle('./data/trainFULL_w2v.p')
else:
    x_train_w2v = load_pickle('./data/train_w2v.p')

x_val_w2v = load_pickle('./data/val_w2v.p')
x_test_w2v = load_pickle('./data/test_w2v.p')

## Stack features

In [9]:
# For each split, concatenate the char TF-IDF, word TF-IDF and Word2Vec blocks
# column-wise (in that order), convert to CSR and rescale the rows with
# sklearn's `normalize` using its default settings.
train_features, val_features, test_features = (
    normalize(sparse.hstack(feature_blocks).tocsr())
    for feature_blocks in (
        [x_train_char, x_train_word, x_train_w2v],
        [x_val_char, x_val_word, x_val_w2v],
        [x_test_char, x_test_word, x_test_w2v],
    )
)

In [None]:
# Feature blocks used by each classifier:
#   svm: x_train_char, x_train_word, x_train_w2v
#   lr:  x_train_char, x_train_word
#   nb:  x_train_char, x_train_word

# Classifier

## SVM Classifier

In [10]:
# Linear SVM trained on the full stacked feature matrix. It is wrapped in
# CalibratedClassifierCV (3 folds), which adds probability estimates on top of
# LinearSVC.
clf_svm = CalibratedClassifierCV(LinearSVC(), cv=3)
evaluate_model(clf_svm, train_features, val_features, y_train, y_val)

0.904996888059038


## Logistic Regression

In [11]:
# WrapperClassifier lets a model inside the pipeline see only certain columns.
# stop_idx is the combined width of the char and word TF-IDF blocks, which come
# first in the stacked matrix — presumably this excludes the trailing Word2Vec
# columns (TODO confirm against utils.WrapperClassifier).
n_tfidf_cols = x_train_char.shape[1] + x_train_word.shape[1]
clf_lr = WrapperClassifier(LogisticRegression(), stop_idx=n_tfidf_cols)
evaluate_model(clf_lr, train_features, val_features, y_train, y_val)

0.8941940072908331


## Naive Bayes

In [12]:
# Multinomial Naive Bayes with a small smoothing constant. It is wrapped in
# WrapperClassifier with stop_idx set to the combined width of the char and
# word TF-IDF blocks.
n_tfidf_cols = x_train_char.shape[1] + x_train_word.shape[1]
clf_nb = WrapperClassifier(MultinomialNB(alpha=0.0005), stop_idx=n_tfidf_cols)
evaluate_model(clf_nb, train_features, val_features, y_train, y_val)

0.887792300168934


# Stacking classifiers

In [13]:
%%time
clf_meta = CalibratedClassifierCV(LinearSVC(), cv=3)

clf_stack = StackingCVClassifier(
    classifiers=[clf_svm, clf_nb, clf_lr], 
    meta_classifier=clf_meta,
    use_features_in_secondary=1,
    cv=3,
    use_probas=1,
    verbose=1,
)

n_train = train_features.shape[0]
n_val = val_features.shape[0]
clf_stack.fit(train_features[:n_train], y_train[:n_train])
pred = pred_for_sparce(clf_stack, val_features[:n_val])
print(f1_score(y_val[:n_val], pred, average='micro'))

Fitting 3 classifiers...
Fitting classifier1: calibratedclassifiercv (1/3)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
Fitting classifier2: wrapperclassifier (2/3)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
Fitting classifier3: wrapperclassifier (3/3)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...


HBox(children=(IntProgress(value=0, max=225), HTML(value='')))

0.9113096825820218
CPU times: user 1h 15min 3s, sys: 1h 45min 32s, total: 3h 36s
Wall time: 15min 27s


# Generate submission

In [None]:
# Predict the test set with the fitted stacking ensemble, using the same
# utils helper (pred_for_sparce) that produced the validation predictions.
predictions_test = pred_for_sparce(clf_stack, test_features)

In [None]:
# Write the submission file: an `id,label` header followed by one row per
# test prediction, where id is the 0-based position of the prediction.

# open() does not create missing directories, so make sure the target exists.
os.makedirs('./subm', exist_ok=True)

# newline='' hands line-ending control to the csv writer, which terminates
# every row with '\n' and quotes any label containing a delimiter.
with open('./subm/subm.csv', 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['id', 'label'])
    writer.writerows(enumerate(predictions_test))