In [84]:
from master_thesis.src import utils, data
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

import scipy.stats as st

In [33]:
# Load the complete article table from the tab-separated file under utils.DATA.
full = pd.read_csv(utils.DATA / 'combined.tsv', sep='\t')

In [34]:
# (rows, columns) of the full table.
full.shape

(84887, 47)

In [35]:
# Number of articles per category code in the full table.
full.category.value_counts()

sp    29476
vm    20915
pl    17113
wi    13283
ku     4048
rs       52
Name: category, dtype: int64

In [36]:
# Index the full table by article id so rows can be looked up with .loc[<articleId>].
# The frame is rebound (no inplace=True) and guarded so the cell can be re-run:
# once 'articleId' is the index the column is gone and a second set_index call
# would raise KeyError.
if full.index.name != 'articleId':
    full = full.set_index('articleId')

## overview: average for each category

In [37]:
# One-hot encode the 'category' column: one indicator column per category code,
# indexed like `full`. The dtype is pinned to uint8 so the matrix holds 0/1
# regardless of the installed pandas version (newer releases default to bool).
category_one_hot = pd.get_dummies(full['category'], dtype=np.uint8)
# Save the indicator matrix (index included) under utils.OUTPUT.
category_one_hot.to_csv(utils.OUTPUT / 'meta_file_category.csv', index=True)

In [38]:
# Preview the first rows of the one-hot category matrix.
category_one_hot.head()

Unnamed: 0_level_0,ku,pl,rs,sp,vm,wi
articleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
49297055,0,0,0,0,1,0
49307861,0,0,0,1,0,0
49307973,0,0,0,0,1,0
49308023,1,0,0,0,0,0
49308055,0,1,0,0,0,0


In [39]:
# (rows, columns) of the one-hot matrix: one row per article, one column per category.
category_one_hot.shape

(84887, 6)

In [40]:
# Spot check: category of one article, looked up by its index label.
full.loc[49297055].category

'vm'

In [103]:
# Load the conditioned subset of the data via the project helper; the helper
# prints the raw and remaining shapes (see utils.get_conditioned_df for the criteria).
df = utils.get_conditioned_df()

Shape of raw df: (84887, 47)
Shape of remaining df after conditioning: (7902, 49)


In [104]:
# Number of articles per category code in the conditioned df.
df.category.value_counts()

vm    3270
pl    2650
wi    1117
sp     639
ku     226
Name: category, dtype: int64

In [105]:
# Split the conditioned df into train/dev/test frames; the seed is passed
# explicitly so the split can be reproduced.
df_train, df_dev, df_test = data.create_train_dev_test(df, random_seed = 123)

In [106]:
# Shapes of the three splits, in train/dev/test order.
tuple(split.shape for split in (df_train, df_dev, df_test))

((6321, 49), (790, 49), (791, 49))

In [107]:
# Category codes to summarise. 'rs' is listed although it does not occur in the
# conditioned df (it only appears in the full data).
categories = ['vm', 'pl', 'ku', 'wi', 'sp', 'rs']

In [108]:
# Display the category codes.
categories

['vm', 'pl', 'ku', 'wi', 'sp', 'rs']

Bedeutung der Ressortkürzel:\
pl : Politik\
wi : Wirtschaft\
ku : Kultur\
sp : Sport\
vm : Vermischtes\
rs : Redaktioneller Service, wie Kurzdienst, Hinweise, etc.

In [118]:
def category_means(frame, category_codes, digits=3):
    """Return per-category means of time-per-word, pageviews and word count.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide the columns 'category', 'avgTimeOnPagePerWordcount',
        'pageviews' and 'wordcount'.
    category_codes : iterable of str
        Category codes to summarise; every code gets an entry in the result.
    digits : int, default 3
        Number of decimals the means are rounded to.

    Returns
    -------
    dict
        Maps each code to {'mean_time', 'mean_pv', 'mean_tokens'}. A code with
        no rows in `frame` gets 0 for all three values instead of NaN.
    """
    means = {}
    for code in category_codes:
        subset = frame[frame.category == code]
        if len(subset) == 0:
            # Category absent from this frame: report zeros rather than NaN.
            means[code] = {'mean_time' : 0,
                           'mean_pv' : 0,
                           'mean_tokens' : 0}
        else:
            means[code] = {'mean_time' : np.mean(subset.avgTimeOnPagePerWordcount).round(digits),
                           'mean_pv' : np.mean(subset.pageviews).round(digits),
                           'mean_tokens' : np.mean(subset.wordcount).round(digits)}
    return means


# Same summary for the conditioned subset and for the full data set.
df_category_mean = category_means(df, categories)
full_category_mean = category_means(full, categories)

In [120]:
# Per-category means computed on the conditioned df.
df_category_mean # conditioned df

{'vm': {'mean_time': 0.554, 'mean_pv': 1144.66, 'mean_tokens': 346.074},
 'pl': {'mean_time': 0.436, 'mean_pv': 933.182, 'mean_tokens': 433.989},
 'ku': {'mean_time': 0.447, 'mean_pv': 440.301, 'mean_tokens': 386.018},
 'wi': {'mean_time': 0.462, 'mean_pv': 882.578, 'mean_tokens': 408.252},
 'sp': {'mean_time': 0.466, 'mean_pv': 443.252, 'mean_tokens': 401.736},
 'rs': {'mean_time': 0, 'mean_pv': 0, 'mean_tokens': 0}}

In [121]:
# Per-category means computed on the full, unconditioned data.
full_category_mean # full data

{'vm': {'mean_time': 0.49, 'mean_pv': 480.147, 'mean_tokens': 309.566},
 'pl': {'mean_time': 0.479, 'mean_pv': 371.178, 'mean_tokens': 396.539},
 'ku': {'mean_time': 0.446, 'mean_pv': 133.38, 'mean_tokens': 331.678},
 'wi': {'mean_time': 0.469, 'mean_pv': 213.106, 'mean_tokens': 369.06},
 'sp': {'mean_time': 0.355, 'mean_pv': 30.93, 'mean_tokens': 283.455},
 'rs': {'mean_time': 1.208, 'mean_pv': 3000.442, 'mean_tokens': 118.962}}

Die Mittelwerte bei avgTime unterscheiden sich nicht sonderlich stark (außer bei 'rs', das ist die seltenste Kategorie und kommt im df gar nicht vor)

## using category label matrix as features --> so: predicting average value of category

In [58]:
def _index_by_article_id(frame):
    """Return `frame` indexed by 'articleId'; a frame already indexed that way is returned as is."""
    if frame.index.name == 'articleId':
        return frame
    return frame.set_index('articleId')


# Index every split by article id. The frames are rebound instead of mutated
# in place, and the helper is a no-op on an already indexed frame, so this
# cell can be re-run without raising KeyError.
df_train = _index_by_article_id(df_train)
df_dev = _index_by_article_id(df_dev)
df_test = _index_by_article_id(df_test)

In [62]:
#df_dev.head()

In [63]:
# Index labels of each split as plain Python lists, in train/dev/test order.
train_ids, dev_ids, test_ids = (
    split.index.tolist() for split in (df_train, df_dev, df_test)
)

In [64]:
# Number of ids per split, in train/dev/test order.
tuple(len(ids) for ids in (train_ids, dev_ids, test_ids))

(6321, 790, 791)

In [65]:
# Feature matrices: the one-hot category rows of each split, selected by index
# label and converted to NumPy arrays (row order follows the id lists).
X_train, X_dev, X_test = (
    np.array(category_one_hot.loc[ids]) for ids in (train_ids, dev_ids, test_ids)
)

In [66]:
# Shapes of the feature matrices, in train/dev/test order.
tuple(features.shape for features in (X_train, X_dev, X_test))

((6321, 6), (790, 6), (791, 6))

In [67]:
# Show the training feature matrix (one one-hot row per article).
X_train

array([[0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       ...,
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0]], dtype=uint8)

In [70]:
# Sanity check on one example: the first dev id, its one-hot row, the first row
# of X_dev and its category label in the conditioned df should all agree.
example_id = dev_ids[0]  # taken from the split instead of a hard-coded article id
print(example_id)
print(category_one_hot.loc[example_id])
print(X_dev[0])
print(df.set_index('articleId').loc[example_id]['category'])

49662987
ku    0
pl    1
rs    0
sp    0
vm    0
wi    0
Name: 49662987, dtype: uint8
[0 1 0 0 0 0]
pl


In [71]:
# define the target label
target = 'avgTimeOnPagePerWordcount'


y_train = np.array(df_train[target])
y_dev = np.array(df_dev[target])
y_test = np.array(df_test[target])

In [72]:
# Shapes of the target vectors, in train/dev/test order.
tuple(targets.shape for targets in (y_train, y_dev, y_test))

((6321,), (790,), (791,))

In [75]:
# Sanity check: the first dev target must equal the value stored in df for the
# first dev article. The id and column come from dev_ids / target instead of
# hard-coded literals, so the check stays valid if the split or target changes.
example_id = dev_ids[0]
print(y_dev[0])
print(df.set_index('articleId').loc[example_id][target])

0.12798234280792414
0.12798234280792414


In [76]:
from sklearn.linear_model import Ridge, LinearRegression, SGDRegressor

In [77]:
# Ridge regression with scikit-learn's default hyperparameters.
model = Ridge()

In [78]:
# Fit on the training split: one-hot category features -> target values.
model.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [79]:
# Predict the target for every dev-set row from its one-hot category vector.
pred_dev = model.predict(X_dev)

In [80]:
# Post-processing: overwrite negative predictions with 0, in place.
# Open question from the author: is there a better way, e.g. telling the model
# directly that the target cannot be negative?
np.putmask(pred_dev, pred_dev < 0, 0)
pred_dev

array([0.43725592, 0.43725592, 0.55763765, 0.43725592, 0.46677399,
       0.46677399, 0.55763765, 0.46677399, 0.43725592, 0.43725592,
       0.44942092, 0.55763765, 0.55763765, 0.46677399, 0.43725592,
       0.55763765, 0.43725592, 0.46677399, 0.55763765, 0.43725592,
       0.46677399, 0.55763765, 0.55763765, 0.43725592, 0.47107188,
       0.43725592, 0.55763765, 0.44942092, 0.55763765, 0.46677399,
       0.55763765, 0.55763765, 0.55763765, 0.55763765, 0.47107188,
       0.43725592, 0.46677399, 0.43725592, 0.55763765, 0.47107188,
       0.55763765, 0.46677399, 0.43725592, 0.43725592, 0.43725592,
       0.55763765, 0.43725592, 0.55763765, 0.46677399, 0.47107188,
       0.46677399, 0.46677399, 0.55763765, 0.47107188, 0.43725592,
       0.43725592, 0.46677399, 0.46677399, 0.55763765, 0.43725592,
       0.43725592, 0.43725592, 0.55763765, 0.43725592, 0.46677399,
       0.46677399, 0.55763765, 0.43725592, 0.46677399, 0.55763765,
       0.55763765, 0.47107188, 0.55763765, 0.43725592, 0.43725

In [81]:
# Show the true dev targets for visual comparison with the predictions.
np.array(y_dev)

array([0.12798234, 0.1478802 , 1.12616382, 0.2850785 , 0.08211664,
       0.10554065, 1.59102746, 0.43003365, 0.23800202, 0.13883567,
       0.16161185, 1.22833333, 0.68404516, 0.2432484 , 0.57534326,
       1.3384727 , 0.39071567, 0.34569746, 0.14552633, 0.15971867,
       0.65778986, 0.53012864, 0.3548755 , 0.51716383, 0.46479281,
       0.43658631, 0.30921228, 0.22349374, 0.50958966, 0.08926003,
       0.62573495, 0.59721011, 0.3062435 , 0.58566149, 0.75907564,
       0.22474275, 0.25169042, 0.27184466, 0.37131519, 0.67224409,
       0.54312195, 0.26893002, 0.21942873, 0.32585245, 0.5792046 ,
       0.24510446, 0.18567156, 0.17776634, 0.74464916, 0.49167333,
       0.40843046, 0.19593033, 0.80501641, 1.34256055, 0.58144416,
       0.27951434, 0.75582673, 0.08779929, 0.78848386, 1.13761317,
       1.11053361, 0.24179584, 0.25393462, 0.18680106, 0.23014767,
       0.5579    , 0.67491582, 0.88362498, 1.01905626, 1.53988036,
       1.02280812, 0.17145672, 1.23399267, 0.28771097, 0.10128

In [82]:
# Pearson correlation (r, p-value) between dev predictions and true dev targets.
st.pearsonr(pred_dev, y_dev) # r = 0.1797, so not particularly good

(0.17966646920924587, 3.7093706311773657e-07)

In [85]:
# Mean squared error on the dev set. Arguments follow scikit-learn's
# (y_true, y_pred) order; the value is the same either way because MSE is symmetric.
mean_squared_error(y_dev, pred_dev)

0.10376933082688541

In [83]:
# Prediction for the first dev example only; the [:1] slice keeps the input 2-D.
model.predict(X_dev[:1])

array([0.43725592])

Pearson's r = 0.18

Also ist das category-Label nicht sonderlich aufschlussreich für die avgTime (auch die Mittelwerte unterscheiden sich nicht besonders stark, siehe oben).

## veraltete Betrachtung von 'subject':

In [163]:
# Load the 'subject' indicator table via the project helper (as displayed below:
# one 0/1 column per subject, one row per articleId).
df_subject = utils.get_meta_cat_file('subject')

In [164]:
# Display the subject indicator table (pandas truncates the output).
df_subject

Unnamed: 0_level_0,Arbeit,Automobilindustrie,Ernährung,Essen und Trinken,Familie,Finanzen,Freizeit,Gesundheit,Immobilien,Kosmetik,...,Recht,Sport,Steuern,Technik,Tier,Tourismus,Umwelt,Verbraucher,Verkehr,Wohnen
articleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48620281,0,0,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,0
48620381,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
48622639,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
48623085,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,0,1
48623259,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51562817,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
51564503,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
51564511,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51565043,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [165]:
subjects = utils.get_set_of_meta_cat('subject')

# NOTE(review): this cell appears to predate the rest of the notebook. It looks
# rows up in `df` by article id and reads 'avgTimeOnPage/wordcount', whereas
# other cells call df.set_index('articleId') explicitly and use the column
# 'avgTimeOnPagePerWordcount'. Confirm the index and column names before re-running.

# Set for O(1) membership tests instead of scanning the id list for every candidate.
train_id_set = set(train_ids)

subject_mean_train = {}
for s in subjects:
    # Articles tagged with subject s ...
    ids_candidates = df_subject[df_subject[s] == 1].index
    # ... restricted to those in the training split.
    ids = [ID for ID in ids_candidates if ID in train_id_set]

    # One row lookup reused for all three means. If no training article carries
    # this subject, the means are NaN.
    rows = df.loc[ids]
    mean_t = np.mean(rows['avgTimeOnPage/wordcount'])
    mean_s = np.mean(rows['stickiness'])
    mean_p = np.mean(rows['pageviews'])
    subject_mean_train[s] = {'mean_t' : mean_t,
                             'mean_s' : mean_s,
                             'mean_p' : mean_p}
print(subject_mean_train)

{'Arbeit': {'mean_t': 0.37619595447595433, 'mean_s': 38.88707093902196, 'mean_p': 91.52127659574468}, 'Automobilindustrie': {'mean_t': 0.2312182741116751, 'mean_s': 90.9090909090909, 'mean_p': 11.0}, 'Ernährung': {'mean_t': 0.4740111189699141, 'mean_s': 70.7095000088689, 'mean_p': 11.430769230769231}, 'Essen und Trinken': {'mean_t': 0.3134978229317856, 'mean_s': 82.10526315789471, 'mean_p': 95.0}, 'Familie': {'mean_t': 0.4272728128837668, 'mean_s': 58.85934927692625, 'mean_p': 22.37037037037037}, 'Finanzen': {'mean_t': 0.581271541318767, 'mean_s': 45.33625730994152, 'mean_p': 19.25}, 'Freizeit': {'mean_t': 0.14622164427105852, 'mean_s': 42.8571428571429, 'mean_p': 21.0}, 'Gesundheit': {'mean_t': 0.4032563624659454, 'mean_s': 59.7165516363176, 'mean_p': 44.837606837606835}, 'Immobilien': {'mean_t': 0.3808411340654103, 'mean_s': 65.2343099313056, 'mean_p': 11.957446808510639}, 'Kosmetik': {'mean_t': nan, 'mean_s': nan, 'mean_p': nan}, 'Medizin': {'mean_t': 0.28342828245298957, 'mean_s': 

In [168]:
# Prediction for the dev set: for each article, average the training-set mean
# time ('mean_t') over all subjects the article is tagged with.
pred_dev_mean = []
for ID in dev_ids:
    # Fetch the article's indicator row once instead of once per column.
    row = df_subject.loc[ID]
    subjects_ID = [c for c in df_subject.columns if row[c] == 1]
    mean_mean_t = np.mean([subject_mean_train[s]['mean_t'] for s in subjects_ID])
    pred_dev_mean.append(mean_mean_t)
    

In [169]:
# Show the subject-mean predictions for the dev set.
pred_dev_mean

[0.37619595447595433,
 0.4032563624659454,
 0.2520830410035817,
 0.37619595447595433,
 0.5555591437331113,
 0.4032563624659454,
 0.39876383248566477,
 0.3808411340654103,
 0.4740111189699141,
 0.39876383248566477,
 0.3808411340654103,
 0.3808411340654103,
 0.4032563624659454,
 0.39876383248566477,
 0.39876383248566477,
 0.39876383248566477,
 0.3808411340654103,
 0.4272728128837668,
 0.3808411340654103,
 0.581271541318767,
 0.5555591437331113,
 0.4272728128837668,
 0.37619595447595433,
 0.4272728128837668,
 0.37619595447595433,
 0.4032563624659454,
 0.4272728128837668,
 0.37619595447595433,
 0.4032563624659454,
 0.37619595447595433,
 0.3808411340654103,
 0.5555591437331113,
 0.3808411340654103,
 0.4032563624659454,
 0.37619595447595433,
 0.39876383248566477,
 0.39876383248566477,
 0.4032563624659454,
 0.4032563624659454,
 0.3808411340654103,
 0.4032563624659454,
 0.3808411340654103,
 0.39876383248566477,
 0.4272728128837668,
 0.4272728128837668,
 0.37619595447595433,
 0.3808411340654103

In [170]:
# Pearson correlation (r, p-value) between the subject-mean predictions and the true dev targets.
st.pearsonr(pred_dev_mean, y_dev)

(0.03285113970521218, 0.759895229280333)

Kommentar
* Sowohl das direkte Trainieren auf der Subject-Label-Matrix als auch das einfache Vorhersagen der Mittelwerte anhand des (oder der) Subject(s) funktioniert nicht besonders gut (immerhin liefern beide Varianten mehr oder weniger das gleiche Ergebnis). Bei stickiness funktioniert es einigermaßen gut (r=0.4).