In [1]:
import pickle
import pandas as pd

In [2]:
import numpy as np
np.set_printoptions(precision=4,suppress=True)
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import spacy

In [4]:
from tqdm import tqdm

In [5]:
nlp = spacy.load('de', disable=['parser', 'tagger'])
# FOR FastText vectors
# nlp.vocab.from_disk('./vocab/')

In [6]:
import re
GRUBER_URLINTEXT_PAT = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
NUMBER_CLEAN = re.compile(r'\d+(:?[.,]?\d+)*')
TIME_CLEAN = re.compile(r'\d+:\d+(?:\s?[Uu]hr)?')

def preprocessing(text):
    try:
        # clean-up url
        text = GRUBER_URLINTEXT_PAT.sub('url', text)
        # remove time
        text = TIME_CLEAN.sub('zeit', text)
        # clean-up numbers
        text = NUMBER_CLEAN.sub('nummer', text)
        return text
    except:
        return ''

preprocessing('Yeah , der WG gefällt deine Musik . Hol dir auch superschnelles Internet ! https://t.co/0dPArCUv0b https://t.co/HHRZaAQnte http://pbs.twimg.com/amplify_video_thumb/0123456789001234567890/img/QEQntlwB_ifYH3Gv.jpg')

with open('./stopwords-de.txt') as f:
    stopwords = f.read().split('\n')
    
def filter_toks(token):
    if token.tag_ in ['$,','$.','$(']:
        return False
    if token.pos_ in ['ADP']:
        return False
    if token.lemma_.lower() in stopwords:
        return False
    return True

In [7]:
data = {}
for file in ['docs', 'documents', 'labels', 'tags']:
    with open(file+'.pl') as f:
        data[file] = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'docs.pl'

In [None]:
labels[0], documents[0], docs[0], tags[0]

# Feature Analysis

In [None]:
from collections import Counter
# create lemma lists filter punctuation and numbers
# from spacy
lemmas = [word.lemma_ for words in docs for word in words if word.pos_ not in ['PUNCT','NUM']]
counts = Counter(lemmas)

In [None]:
lemma_df = pd.DataFrame.from_dict(counts, orient='index')
lemma_df.sort_values(by=0,ascending=False).head(50).plot(kind='bar', figsize=(20,10))

In [327]:
counts = Counter(labels)

In [None]:
lemmas = [len(words) for words in documents]
word_counts = Counter(lemmas)

lemma_df = pd.DataFrame.from_dict(word_counts, orient='index')
a = lemma_df.sort_values(by=0,ascending=False).head(50).plot(kind='bar', figsize=(20,10))
a.set_title('number of words per row')
a.set_ylabel('number of rows')
a.set_xlabel('number of words')

In [None]:
lemmas = ([len(document) for document in documents])
word_counts = Counter(lemmas)

lemma_df = pd.DataFrame.from_dict(word_counts, orient='index')
a = lemma_df.sort_values(by=0,ascending=False).head(50).plot(kind='bar', figsize=(20,10))
a.set_title('number of characters per row')
a.set_ylabel('number of rows')
a.set_xlabel('number of words')

In [None]:
from wordcloud import WordCloud
cloud = WordCloud(width=1440, height=1080).generate(" ".join([word.lemma_ for doc in docs for word in doc]))
plt.figure(figsize=(20, 15))
plt.imshow(cloud)
plt.title('Most common lemmas')
plt.axis('off')

In [None]:
from wordcloud import WordCloud
cloud = WordCloud(width=1440, height=1080).generate(" ".join([word.lemma_ for doc in docs for word in doc if word.lemma_.lower() not in stopwords]))
plt.figure(figsize=(20, 15))
plt.imshow(cloud)
plt.title('Most common lemmas without stopwords')
plt.axis('off')

In [None]:
qmarks = np.mean(df.Content.apply(lambda x: '?' in x))
math = np.mean(df.Content.apply(lambda x: '€' in x))
fullstop = np.mean(df.Content.apply(lambda x: '.' in x))
capital_first = np.mean(df.Content.apply(lambda x: x[0].isupper()))
capitals = np.mean(df.Content.apply(lambda x: max([y.isupper() for y in x.split(' ')])))
numbers = np.mean(df.Content.apply(lambda x: max([y.isdigit() for y in x])))

print('Rows with question marks: {:.2f}%'.format(qmarks * 100))
print('Rows with €: {:.2f}%'.format(math * 100))
print('Rows with full stops: {:.2f}%'.format(fullstop * 100))
print('Rows with capitalised first letters: {:.2f}%'.format(capital_first * 100))
print('Rows with capital letters: {:.2f}%'.format(capitals * 100))
print('Rows with numbers: {:.2f}%'.format(numbers * 100))

# RASA TRAINING DATA

In [308]:
import simplejson
fp = {
    "rasa_nlu_data": {
        "common_examples": [],
        "regex_features" : [],
        "entity_synonyms": []
    }
}
fb = {
    "rasa_nlu_data": {
        "common_examples": [],
        "regex_features" : [],
        "entity_synonyms": []
    }
}

for i in range(len(documents)):
    fp["rasa_nlu_data"]['common_examples'].append(dict(text=documents[i], intent=tags[i], entities=[]))

with open('train_prio.json', 'wt') as file_p:
    simplejson.dump(fp, file_p)
    
# with open('train_bucket.json', 'w') as file_b:
#     json.dump(fb, file_b)

In [276]:
{label for label in labels if type(label) is str}
len([label for label in labels if type(label) is str])

190

In [283]:
print('prio distrib.: {}'.format(sum(tags)/len(tags)))

prio distrib.: 0.10005817335660268


In [None]:
# TODO: try different sets of stopwords (e.g. with or without kein, nicht)

In [261]:
with open('./train_bucket.json') as f:
    test=json.load(f)

In [267]:
test['common_examples']

{'intent': nan, 'text': 'Ich warte noch auf eine korrekte Rechnung !'}

# FEATURES

In [None]:
# wordvectors:
w2v = [[word.vector for word in doc if word.text not in stopwords] for doc in docs]

# average w2v
X = np.array([np.mean([word.vector for word in doc if word.text not in stopwords], 0) for doc in docs])

In [316]:
# bag of words
from sklearn.feature_extraction.text import CountVectorizer

CountVectorizer()
BoW = CountVectorizer(documents, )

In [317]:
X_bag = BoW.fit_transform(documents)

In [318]:
Y = np.array(np.array(tags)=='Prio-Fall', dtype=int)

In [320]:
sum(Y)/len(Y)

0.14504915987966835

# XG BOOST

In [325]:
import xgboost as xgb

# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 10

In [326]:
# Code for cross-validation
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.utils import class_weight
from sklearn.metrics import f1_score

# Create 2 folds
k = 3
kfold = StratifiedKFold(Y, k, shuffle=True, random_state=0)

results = np.zeros(k)

# iterate over two folds
for i, 
for i, (train_ind, test_ind) in enumerate(kfold):
    X_train, X_val, Y_train, Y_val = X_bag[train_ind], X_bag[test_ind], Y[train_ind], Y[test_ind]
    gnb = GaussianNB()
    D_train = xgb.DMatrix(X_train, label=Y_train)
    D_val = xgb.DMatrix(X_val, label=Y_val)

    weights = class_weight.compute_sample_weight('balanced', [0,1], Y_train)
    weightss = []
    for y in Y_train:
        weightss.append(weights[y])

    watchlist = [(D_train, 'train'), (D_val, 'valid')]
    bst = xgb.train(params, D_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)
    predictions = bst.predict(data=D_val).round()
    results[i] = f1_score(predictions, Y_val)
    print('f1 score: {}'.format(results[i]))
    print('accuracy: {}\%'.format(sum(predictions==Y_val)/len(Y_val)*100))

[0]	train-logloss:0.681398	valid-logloss:0.681793
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.585418	valid-logloss:0.589439
[20]	train-logloss:0.516978	valid-logloss:0.524332
[30]	train-logloss:0.466842	valid-logloss:0.477235
[40]	train-logloss:0.429251	valid-logloss:0.442479
[50]	train-logloss:0.400655	valid-logloss:0.416498
[60]	train-logloss:0.378061	valid-logloss:0.396746
[70]	train-logloss:0.360315	valid-logloss:0.381628
[80]	train-logloss:0.346352	valid-logloss:0.370119
[90]	train-logloss:0.335163	valid-logloss:0.361113
[100]	train-logloss:0.326145	valid-logloss:0.354207
[110]	train-logloss:0.318762	valid-logloss:0.348827
[120]	train-logloss:0.312501	valid-logloss:0.344499
[130]	train-logloss:0.307515	valid-logloss:0.340989
[140]	train-logloss:0.303432	valid-logloss:0.338127
[150]	train-logloss:0.299804	valid-logloss:0.335769
[160]	train-logloss:0.296927

In [328]:
np.mean(results), np.var(results)

(0.2882856141195729, 6.498816470590017e-06)

In [219]:
tags

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [31]:
X_bag.shape

(22, 465)