In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import string

import spacy
import nltk
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import scipy.stats as st

[nltk_data] Downloading package stopwords to /Users/Sanna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# download german fastText Embeddings
import wget
import gzip
import os
import os.path

# downloading the .vec.gz-files (fasText, facebook)
url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz'
target_path = '/Users/Sanna/data/cc.de.300.vec.gz'
if not os.path.isfile(target_path):
    print('downloading...')
    wget.download(url, out=target_path)
    print('done')
else:
    print('file already exists')

file already exists


In [3]:
def load_vectors(path=None, limit=None):
    print('loading embeddings ...')
    f = gzip.open(path, 'rb')
    n, d = map(int, f.readline().split())
    data = {}
    counter = 0
    for line in f:
        line = line.decode()
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.array(tokens[1:], dtype=np.float32)
        if limit:
            if counter >= limit:
                break
            else:
                counter += 1
    f.close()
    print('done')
    return data

In [4]:
embs = load_vectors('/Users/Sanna/data/cc.de.300.vec.gz', limit = None)

loading embeddings ...
done


In [21]:
# exploring the embeddings to see what the preprocessing should be like?
print(embs.get('Garten').shape)
print(embs['das'][:5])
print(embs['Das'][:5])
print(embs['ging'][:5])
print(embs['gehst'][:5])
print(embs['gegangen'][:5])
print(embs[','][:5])
print(embs['corona'][:5]) # :D Tja, das ist wohl eher italienisch oder Bier...

(300,)
[-0.016   0.0168  0.11    0.0636  0.0124]
[-0.0561  0.0101  0.0784  0.0929  0.0192]
[-0.0017  0.0986  0.1736  0.0391  0.0817]
[ 0.017  -0.0775  0.012   0.0453 -0.1227]
[0.0263 0.0028 0.0664 0.0028 0.0067]
[-0.0354 -0.0088 -0.018   0.016  -0.0152]
[ 0.0032  0.0169  0.0392 -0.1217 -0.0255]


In [6]:
ROOT = Path('/Volumes/INWT/Daten_NLP/') # encrypted folder!
DATA = ROOT / '200707_aachener_zeitung_modified.csv' # contains new columns and (minimal) preprocessed texts

In [7]:
df = pd.read_csv(DATA, index_col = 0)
df = df.fillna('') # replacing Nan with emtpy string
df.head()

Unnamed: 0_level_0,pageviews,entrances,exits,bounces,timeOnPage,conversions,avgTimeOnPage,stickiness,entranceRate,bounceRate,...,titelH3,wordcount,category,city,text_preprocessed,avgTimeOnPage/wordcount,nr_tokens,mean_token_length,nr_tokens_teaser,nr_tokens_titelH1
articleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48620281,21,7,12,7,1012,,112.444444,42.857143,33.333333,33.333333,...,,769,vm,München/Stuttgart,Frische Luft und Bewegung: Diese Kombination r...,0.146222,796,5.359296,29,9
48620381,19,6,11,5,1484,,185.5,42.105263,31.578947,26.315789,...,,441,vm,Berlin/Frankfurt/Main,"Der Wecker klingelt, aufstehen! Doch gerade im...",0.420635,452,5.938053,33,8
48622639,2,2,2,2,0,,0.0,0.0,100.0,100.0,...,,390,vm,Berlin,Eltern auf der Suche nach einem guten Babyphon...,0.0,396,5.848485,30,7
48623085,32,9,20,9,974,,81.166667,37.5,28.125,28.125,...,,345,vm,Berlin,Spülmaschinentabs sollen kleine Alleskönner se...,0.235266,367,5.594005,30,7
48623259,24,2,7,2,3797,,223.352941,70.833333,8.333333,8.333333,...,,182,vm,Berlin,Make-up hat heutzutage einen Zweck: Es soll da...,1.227214,183,5.622951,22,8


In [8]:
class Preprocessor():
    def __init__(self, delete_stopwords=False, lemmatize=False, delete_punctuation=False):
        self.nlp = spacy.load("de_core_news_sm", disable=['parser', 'ner'])
        #self.nlp = spacy.load("de_core_news_md", disable=['parser', 'ner'])
        self.delete_stopwords = delete_stopwords
        self.lemmatize = lemmatize
        self.delete_punctuation = delete_punctuation
        self.stopwords = nltk.corpus.stopwords.words('german')

    def __call__(self, doc):
        rt = []
        doc = self.nlp(doc)
        
        if self.lemmatize==True:
            for token in doc:
                rt.append(token.lemma_.lower())
        else:
            for token in doc:
                rt.append(token.text)

        if self.delete_punctuation==True:
            rt = [ t for t in rt if t not in string.punctuation ]
        
        if self.delete_stopwords==True:
            if self.lemmatize == True:
                self.stopwords = [ self.nlp(s)[0].lemma_ for s in self.stopwords ]
            rt = [ t for t in rt if t not in self.stopwords ]

        return " ".join(rt)

In [9]:
s = "Das hier sind verschiedene Wörter, um alle Dinge zu testen und anzuschauen."
pr1 = Preprocessor()
print(pr1(s))
pr2 = Preprocessor(delete_stopwords=True)
print(pr2(s))
pr3 = Preprocessor(delete_punctuation=True)
print(pr3(s))
pr4 = Preprocessor(lemmatize=True, delete_punctuation=True)
print(pr4(s))
pr5 = Preprocessor(lemmatize=True, delete_stopwords=True, delete_punctuation=True)
print(pr5(s))

Das hier sind verschiedene Wörter , um alle Dinge zu testen und anzuschauen .
Das verschiedene Wörter , Dinge testen anzuschauen .
Das hier sind verschiedene Wörter um alle Dinge zu testen und anzuschauen
der hier sein verschieden wort um all ding zu testen und anschauen
verschieden wort ding testen anschauen


In [23]:
def get_averaged_embs(text, preprocessor, embs):
    vector = np.zeros(300)
    text_preprocessed = preprocessor(text)
    tokens = text_preprocessed.split() # preprocessor returns string with " " as seperator, so needs to be split up
    counter = 0
    for t in tokens:
        #print(t)
        if t in embs:
            vector += embs.get(t)
            counter +=1
    #print(counter)
    if counter !=0:
        vector = vector/counter
    return vector

In [24]:
prepr = Preprocessor(lemmatize=True, delete_stopwords=True)

In [25]:
example = get_averaged_embs("Nicht jeder geht gerne ins Schwimmbad, nur weil die Sonne scheint.",
                            preprocessor = prepr,
                            embs = embs)
example.shape

(300,)

In [26]:
#creating train, dev, test
RANDOM_SEED = 123
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)
df_dev, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED, shuffle=True)
print(df_train.shape, df_dev.shape, df_test.shape)

(712, 35) (89, 35) (90, 35)


## features: averaged embeddings of the text (or teaser, title)

In [67]:
# define text base features: text? text_preprocessed? titelH1?

feature = 'text_preprocessed'
#feature = 'titelH1'
#feature = 'teaser' # note: not all have 'teaser' but I replaced Nan wih empty string ""

X_train = np.array([ get_averaged_embs(text, preprocessor=prepr, embs=embs) for text in df_train[feature] ])
X_dev = np.array([ get_averaged_embs(text, preprocessor=prepr, embs=embs) for text in df_dev[feature] ])
X_test = np.array([ get_averaged_embs(text, preprocessor=prepr, embs=embs) for text in df_test[feature] ])

In [68]:
X_train.shape, X_dev.shape, X_test.shape

((712, 300), (89, 300), (90, 300))

In [87]:
# define the target labels
#target = 'timeOnPage'
target =  'pageviews'
#target = 'avgTimeOnPage'
#target = 'stickiness'

y_train = np.array(df_train[target])
y_dev = np.array(df_dev[target])
y_test = np.array(df_test[target])

In [88]:
y_train.shape, y_dev.shape, y_test.shape

((712,), (89,), (90,))

In [89]:
from sklearn.linear_model import Ridge

In [90]:
# choose model ???
model = Ridge()

In [91]:
model.fit(X_train, y_train)

Ridge()

In [92]:
# predict for dev set
pred_dev = model.predict(X_dev)

In [93]:
# postprocessing: replace negative values with 0 (better way? can I give that hint to the model?)
pred_dev[pred_dev < 0] = 0
pred_dev

array([66.09238913, 19.7963544 ,  5.31039472, 33.18575782, 20.23860999,
       29.9962147 , 29.44286806, 28.2507586 , 10.32347411, 50.4317532 ,
       48.53036061, 47.62864136,  9.68556234, 14.37338586, 41.17240008,
       35.72628765, 43.01535499, 63.12380705, 22.78533407, 71.35635256,
       71.19728059, 29.39557016, 46.30525339, 44.97687027, 60.93388843,
       37.92476161, 31.4556853 , 55.97063217, 42.11671319, 49.22450772,
       50.73903052, 28.81379404, 61.3329802 , 25.59293726, 69.304076  ,
       36.96100997, 32.91148499,  6.08442219, 28.47456071, 56.44512564,
       41.35323922, 39.39745109, 11.14293029, 28.1758878 ,  9.60988004,
       38.9900646 , 46.18551785, 58.12965785, 20.22825571, 22.85680856,
       48.76808666, 44.00123859, 20.12032599, 16.21154834, 66.24928451,
       34.12627147, 50.60897622, 33.79176813, 39.70805523, 20.34371296,
       29.12702502, 56.94202829, 30.21366103, 22.4359978 , 16.62886993,
       38.97272018, 14.02136826, 57.46088121, 48.23670629, 26.06

In [94]:
np.array(y_dev)

array([3047,   33,   20,   64,    1,   12,    2,   22,    3,    6,    1,
         19,    3,  262,    9,    4,   24,  100,   27,  124,    3,    8,
         14,    6,   39,   14,   16,   18,    9,    8,    8,    9,   16,
         32,   19,    8,   20,    3,   18,   11,   18,    9,    9,   24,
         14,    6,    5,  115,    4,   14,    9,    7,   16,   17,    6,
          8,   19,   39,   17,   29,   24,   13,    1,    8,   10,    3,
          7,   14,    9,    7,    7,    2,    7,    4,   12,    5,   12,
          7,  173,   15,    3,   18,   29,   21,    1,   10,    2,    9,
         18])

In [95]:
st.pearsonr(pred_dev, y_dev)

(0.20071014672268453, 0.059299224221468286)

Notizen:

Keine so gute Evaluation, woran könnte das liegen? Mitteln der Embeddings einfach nicht sinnvoll? Präprozessierung ungeeignet für die Embeddings (Kleinschreibung, Lemmatisierung etc.)?

Sowohl bei pageviews als auch timeOnPage und auch bei Text/Teaser/Titel ähnlich schlecht.

Feature-Berechnung dauert sehr lange, kann man das irgendwie schlauer machen und beschleunigen?