In [43]:
import numpy as np
import pickle
np.set_printoptions(precision=4,suppress=True)
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [56]:
documents = pickle.load(open('documents.pl','rb'))
tags = pickle.load(open('tags.pl','rb'))

In [58]:
tags[-1]

'Prio-Fall'

# FEATURES

In [49]:
# bag of words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

BoW = CountVectorizer(documents, strip_accents='unicode', ngram_range=(1,3), min_df=3)
X_bag = BoW.fit_transform(documents)
TFIDF = TfidfTransformer()
tf_transformer = TFIDF.fit(X_bag)
X_bag = tf_transformer.fit_transform(X_bag)

In [4]:
Y = np.array(np.array(tags)=='Prio-Fall', dtype=int)

In [5]:
sum(Y)/len(Y)

0.1450676982591876

In [6]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# CLASSIFIERS

In [7]:
import xgboost as xgb
# Code for cross-validation
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold



In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

## XG BOOST

In [51]:
# TEST
X_bag = BoW.transform(documents)
X_bag = tf_transformer.transform(X_bag)

In [52]:
D_test = xgb.DMatrix(X_bag)
predictions = models[1].predict(D_test)

In [54]:
predictions.round()

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
      dtype=float32)

In [12]:
# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.07
params['max_depth'] = 30
params['min_child_weight'] = 5
params['lambda'] = 1
params['nthread'] = 4

In [13]:
# Create 2 folds
k = 3
kfold = StratifiedKFold(Y, k, shuffle=True, random_state=0)

results = np.zeros((k,3))
models = []

# iterate over two folds
for i, (train_ind, test_ind) in enumerate(kfold):
    X_train, X_val, Y_train, Y_val = X_bag[train_ind], X_bag[test_ind], Y[train_ind], Y[test_ind]
    
    ros = RandomOverSampler(random_state=0)
    X_resampled, Y_resampled = ros.fit_sample(X_train, Y_train)
    print(X_resampled.shape, X_val.shape, Y_resampled.shape, Y_val.shape)
    
    D_train = xgb.DMatrix(X_resampled, label=Y_resampled)
    D_val = xgb.DMatrix(X_val, label=Y_val)

    watchlist = [(D_train, 'train'), (D_val, 'valid')]
    models.append(xgb.train(params, D_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10))
    predictions = models[-1].predict(data=D_val).round()
    f1 = f1_score(Y_val, predictions)
    acc = accuracy_score(Y_val, predictions)
    prio_acc = accuracy_score(Y_val[Y_val==1], predictions[Y_val==1])
    results[i] = [f1, acc, prio_acc]
    print('f1 score: {}'.format(f1))
    print('accuracy: {}\%'.format(acc))
    print('Prio accuracy: {}'.format(prio_acc))
    print(confusion_matrix(Y_val, predictions))

(124348, 321767) (36363, 321767) (124348,) (36363,)
[0]	train-logloss:0.671831	valid-logloss:0.675022
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.544939	valid-logloss:0.568291
[20]	train-logloss:0.486976	valid-logloss:0.522635
[30]	train-logloss:0.454343	valid-logloss:0.498988
[40]	train-logloss:0.433313	valid-logloss:0.485221
[50]	train-logloss:0.420028	valid-logloss:0.476748
[60]	train-logloss:0.41051	valid-logloss:0.470725
[70]	train-logloss:0.402963	valid-logloss:0.466284
[80]	train-logloss:0.396722	valid-logloss:0.462662
[90]	train-logloss:0.391053	valid-logloss:0.459482
[100]	train-logloss:0.385243	valid-logloss:0.456073
[110]	train-logloss:0.380326	valid-logloss:0.453364
[120]	train-logloss:0.376123	valid-logloss:0.451137
[130]	train-logloss:0.371796	valid-logloss:0.448743
[140]	train-logloss:0.366935	valid-logloss:0.445959
[150]	train-logloss:0.363093	

In [14]:
np.mean(results,0), np.var(results,0)

(array([0.5004, 0.8305, 0.585 ]), array([0.0001, 0.    , 0.0001]))

## SVC for challenge 1A

In [17]:
from sklearn.svm import LinearSVC

In [23]:
k = 3
kfold = StratifiedKFold(Y, k, shuffle=True, random_state=0)

results_svc = np.zeros((k,3))
cv = CountVectorizer(min_df = 3, max_df = 10000, ngram_range=(1, 3), lowercase =True)
tfidfTransform = TfidfTransformer()

# iterate over two folds
compareScores = 0
finalSVCModel = None
for i, (train_ind, test_ind) in enumerate(kfold):
    model = LinearSVC(dual=False,tol=1e-3, class_weight = 'balanced') 
    
    trainContent = list( documents[i] for i in train_ind )
    train_counts = cv.fit_transform(trainContent)
    
    train_tfidf = tfidfTransform.fit_transform(train_counts)
    
    testContent = list(documents[i] for i in test_ind)
    test_counts = cv.transform(testContent)
    
    test_tfidf = tfidfTransform.transform(test_counts)
    
    trainLabel = list( Y[i] for i in train_ind )
    model.fit(train_tfidf, trainLabel)
    
    predictions = model.predict(X_val).round()
    f1 = f1_score(Y_val, predictions)
    acc = accuracy_score(Y_val, predictions)
    prio_acc = accuracy_score(Y_val[Y_val==1], predictions[Y_val==1])
    results_svc[i] = [f1, acc, prio_acc]
    print('f1 score: {}'.format(f1))
    print('accuracy: {}\%'.format(acc))
    print('Prio accuracy: {}'.format(prio_acc))
    print(confusion_matrix(Y_val, predictions))
    
    if(prio_acc > compareScores):
        compareScores = prio_acc
        finalSVCModel = model

ValueError: X has 321767 features per sample; expecting 183871

In [20]:
np.mean(results_svc,0), np.var(results_svc,0)

(array([0.4885, 0.8261, 0.5723]), array([0., 0., 0.]))