In [1]:
import pandas as pd
import glob
pd.set_option("display.max_rows", 30)
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_colwidth", 1000)
pd.set_option("display.width", 2000)

In [2]:
import numpy as np
import pickle
np.set_printoptions(precision=4,suppress=True)
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [30]:
documents = pickle.load(open('documents.pl','rb'))
labels = pickle.load(open('labels.pl','rb'))
tags = pickle.load(open('tags.pl','rb'))

# Feature Analysis

In [None]:
from collections import Counter
# create lemma lists filter punctuation and numbers
# from spacy
lemmas = [word.lemma_ for words in docs for word in words if word.pos_ not in ['PUNCT','NUM']]
counts = Counter(lemmas)

In [None]:
lemma_df = pd.DataFrame.from_dict(counts, orient='index')
lemma_df.sort_values(by=0,ascending=False).head(50).plot(kind='bar', figsize=(20,10))

In [327]:
counts = Counter(labels)

In [None]:
lemmas = [len(words) for words in documents]
word_counts = Counter(lemmas)

lemma_df = pd.DataFrame.from_dict(word_counts, orient='index')
a = lemma_df.sort_values(by=0,ascending=False).head(50).plot(kind='bar', figsize=(20,10))
a.set_title('number of words per row')
a.set_ylabel('number of rows')
a.set_xlabel('number of words')

In [None]:
lemmas = ([len(document) for document in documents])
word_counts = Counter(lemmas)

lemma_df = pd.DataFrame.from_dict(word_counts, orient='index')
a = lemma_df.sort_values(by=0,ascending=False).head(50).plot(kind='bar', figsize=(20,10))
a.set_title('number of characters per row')
a.set_ylabel('number of rows')
a.set_xlabel('number of words')

In [None]:
from wordcloud import WordCloud
cloud = WordCloud(width=1440, height=1080).generate(" ".join([word.lemma_ for doc in docs for word in doc]))
plt.figure(figsize=(20, 15))
plt.imshow(cloud)
plt.title('Most common lemmas')
plt.axis('off')

In [None]:
from wordcloud import WordCloud
cloud = WordCloud(width=1440, height=1080).generate(" ".join([word.lemma_ for doc in docs for word in doc if word.lemma_.lower() not in stopwords]))
plt.figure(figsize=(20, 15))
plt.imshow(cloud)
plt.title('Most common lemmas without stopwords')
plt.axis('off')

In [None]:
qmarks = np.mean(df.Content.apply(lambda x: '?' in x))
math = np.mean(df.Content.apply(lambda x: '€' in x))
fullstop = np.mean(df.Content.apply(lambda x: '.' in x))
capital_first = np.mean(df.Content.apply(lambda x: x[0].isupper()))
capitals = np.mean(df.Content.apply(lambda x: max([y.isupper() for y in x.split(' ')])))
numbers = np.mean(df.Content.apply(lambda x: max([y.isdigit() for y in x])))

print('Rows with question marks: {:.2f}%'.format(qmarks * 100))
print('Rows with €: {:.2f}%'.format(math * 100))
print('Rows with full stops: {:.2f}%'.format(fullstop * 100))
print('Rows with capitalised first letters: {:.2f}%'.format(capital_first * 100))
print('Rows with capital letters: {:.2f}%'.format(capitals * 100))
print('Rows with numbers: {:.2f}%'.format(numbers * 100))

In [276]:
{label for label in labels if type(label) is str}
len([label for label in labels if type(label) is str])

190

In [283]:
print('prio distrib.: {}'.format(sum(tags)/len(tags)))

prio distrib.: 0.10005817335660268


# FEATURES

In [19]:
# bag of words
from sklearn.feature_extraction.text import CountVectorizer

CountVectorizer()
BoW = CountVectorizer(documents, stop_words=stopwords, strip_accents='unicode', ngram_range=(1,3), min_df=3)

In [22]:
X_bag = BoW.fit_transform(documents)

In [23]:
X_bag[0].shape

(1, 367992)

In [24]:
Y = np.array(np.array(tags)=='Prio-Fall', dtype=int)

In [25]:
sum(Y)/len(Y)

0.14504915987966835

# XG BOOST

In [26]:
import xgboost as xgb

# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['max_delta_step'] = 1
params['eta'] = 0.02
params['max_depth'] = 10

In [None]:
# Code for cross-validation
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.utils import class_weight
from sklearn.metrics import f1_score

# Create 2 folds
k = 3
kfold = StratifiedKFold(Y, k, shuffle=True, random_state=0)

results = np.zeros(k)

# iterate over two folds
for i, (train_ind, test_ind) in enumerate(kfold):
    X_train, X_val, Y_train, Y_val = X_bag[train_ind], X_bag[test_ind], Y[train_ind], Y[test_ind]
    
    D_train = xgb.DMatrix(X_train, label=Y_train)
    D_val = xgb.DMatrix(X_val, label=Y_val)

    weights = class_weight.compute_sample_weight('balanced', [0,1], Y_train)
    weightss = []
    for y in Y_train:
        weightss.append(weights[y])

    watchlist = [(D_train, 'train'), (D_val, 'valid')]
    bst = xgb.train(params, D_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)
    predictions = bst.predict(data=D_val).round()
    results[i] = f1_score(predictions, Y_val)
    print('f1 score: {}'.format(results[i]))
    print('accuracy: {}\%'.format(sum(predictions==Y_val)/len(Y_val)*100))

[0]	train-logloss:0.685985	valid-logloss:0.686139
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.619309	valid-logloss:0.621231
[20]	train-logloss:0.562046	valid-logloss:0.565916
[30]	train-logloss:0.513792	valid-logloss:0.519646
[40]	train-logloss:0.473825	valid-logloss:0.481173
[50]	train-logloss:0.441473	valid-logloss:0.450276
[60]	train-logloss:0.416329	valid-logloss:0.426366
[70]	train-logloss:0.396462	valid-logloss:0.407615
[80]	train-logloss:0.380974	valid-logloss:0.393184
[90]	train-logloss:0.368871	valid-logloss:0.382121
[100]	train-logloss:0.359491	valid-logloss:0.37359
[110]	train-logloss:0.352182	valid-logloss:0.367058
[120]	train-logloss:0.346345	valid-logloss:0.361976
[130]	train-logloss:0.341422	valid-logloss:0.357792
[140]	train-logloss:0.337484	valid-logloss:0.354511
[150]	train-logloss:0.334163	valid-logloss:0.351858
[160]	train-logloss:0.331348	

In [328]:
np.mean(results), np.var(results)

(0.2882856141195729, 6.498816470590017e-06)