In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, classification_report
import pandas as pd
from gensim.models import Doc2Vec
import re
import string
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import SMOTE

In [2]:
BALANCE = True

In [3]:
print('Loading data')
training_text = pd.read_csv('data/training_text', sep="\|\|", engine="python", skiprows=1, names=["ID", "Text"])
training_variants = pd.read_csv('data/training_variants')
train = pd.merge(training_text, training_variants, on='ID')

punct = []
for c in list(string.punctuation):
    if c in ['-', '%', '$']:
        continue
    punct += [c]
punct += ['′', '–', '°']

print('Cleaning data')

model = Doc2Vec.load('data/doc2vec2.model')

Loading data
Cleaning data


In [None]:
def clean(txt):
    txt = txt.strip().lower()
    txt = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', txt)
    txt = re.sub('\[[0-9]+\]', '', txt)
    txt = re.sub('/', ' ', txt)
    for p in punct:
        txt = txt.replace(p, ' ')
    txt = re.sub(' [0-9]+ ', ' ', txt)
    return txt.strip()

def show_prediction_result(classifier, X, y):
    probas = classifier.predict_proba(X)
    pred_indices = np.argmax(probas, axis=1)
    classes = np.unique(y)
    preds = classes[pred_indices]
    print('Log loss: {}'.format(log_loss(y, probas)))
    print(classification_report(y, preds))
    
def evaluate(classifier, X, y, X_test=None, y_test=None, t=False):
    classifier.fit(X, y)
    show_prediction_result(classifier, X, y)
    if t
        print('')
        print('Test set:')
        show_prediction_result(classifier, X_test, y_test)

In [4]:
print('Vectorizing data')
Xdoc = []
for doc in train['Text'].apply(clean):
    Xdoc.append(model.infer_vector(list(filter(None, doc.split()))))
Xdoc = np.array(Xdoc)
ydoc = train['Class'].values


Vectorizing data


In [5]:
if BALANCE:
    print('Balancing data')

    ratios = {
        7: 600,
        4: 500,
        1: 480,
        2: 450
    }
    ros = ClusterCentroids(random_state=8, ratio=ratios)
    Xdoc, ydoc = ros.fit_sample(Xdoc, ydoc)

    ratios = {
        6: 400,
        5: 400,
        3: 350,
        9: 300,
        8: 300
    }
    ros = SMOTE(random_state=8, ratio=ratios)
    Xdoc, ydoc = ros.fit_sample(Xdoc, ydoc)
    print('New data shape:', Xdoc.shape)

Balancing data
New data shape: (3780, 1000)


In [6]:
print('Training')
lr_clf = LogisticRegression(C=.8)

if BALANCE:
    X_train, X_test, y_train, y_test = train_test_split(Xdoc, ydoc)
    evaluate(lr_clf, X_train, y_train, X_test=X_test, y_test=y_test, t=True)
else:
    evaluate(lr_clf, Xdoc, ydoc)

print('old: train 0.02666984817734259 1.00 test 1.878469765018703 0.74')

Training


TypeError: evaluate() got an unexpected keyword argument 't'

In [4]:
print('Testing')
print('Loading data')
test_variants = pd.read_csv('data/test_variants')
test_text = pd.read_csv('data/test_text', sep="\|\|", engine="python", skiprows=1,
    names=["ID", "Text"])
test_y_data = pd.read_csv('data/stage1_solution_filtered.csv')
test_data = pd.merge(test_text, test_variants, on='ID')
test = test_data.loc[test_data['ID'].isin(test_y_data['ID'].values)]

print('Vectorizing test data')
Xt = []
for doc in test['Text'].apply(clean):
    Xt.append(model.infer_vector(list(filter(None, doc.split()))))
Xt = np.array(Xt)


Testing
Loading data
Vectorizing test data


In [13]:
yt = np.argmax(test_y_data.values[:, 1:], axis=1) + 1


In [14]:
print(Xt.shape, yt.shape, yt[1])

(368, 700) (368,) 2


In [5]:
print('Training full data')
lr_clf.fit(Xdoc, ydoc)

Training full data


LogisticRegression(C=0.8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
print('Predicting')
probas = lr_clf.predict_proba(Xt)

Predicting


In [15]:
print(probas.shape, np.unique(yt))

(368, 9) [1 2 3 4 5 6 7 8 9]


In [17]:
pred_indices = np.argmax(probas, axis=1)
classes = np.unique(ydoc)
preds = classes[pred_indices]
print('Log loss: {}'.format(log_loss(yt, probas)))
print(classification_report(yt, preds))

Log loss: 3.5841476087919077
             precision    recall  f1-score   support

          1       0.64      0.47      0.54        94
          2       0.36      0.33      0.34        46
          3       0.33      0.43      0.38         7
          4       0.52      0.63      0.57        65
          5       0.28      0.44      0.34        25
          6       0.47      0.64      0.54        22
          7       0.67      0.62      0.65       101
          8       0.00      0.00      0.00         2
          9       0.60      0.50      0.55         6

avg / total       0.55      0.53      0.53       368



In [None]:
train.apply