In [1]:
import pandas as pd
import numpy as np
import re
import joblib as jb

In [2]:
# NOTE(review): joblib.load unpickles these files, and unpickling can execute
# arbitrary code -- only load artifacts that come from a trusted source.
# `like` is loaded here but is not referenced by any of the cells shown below.
like = jb.load('data_pkl/like.pkl.z')
# Train / test records; later cells iterate over them as (ts, tags, y) tuples.
train = jb.load('data_pkl/train.pkl.z')
test = jb.load('data_pkl/test.pkl.z')

# Per-tag lookup tables: later cells access them with `in`, `[tag]` and `.keys()`.
tag_like = jb.load('data_pkl/tag_like.pkl.z')
tag_bias_factor = jb.load('data_pkl/tag_bias_factor.pkl.z')

# More per-tag lookup tables, used the same way as the two above.
tag_pvals = jb.load('data_pkl/tag_pvals.pkl.z')
tag_bias_factor_weighted = jb.load('data_pkl/tag_bias_factor_weighted.pkl.z')

# Feature - Avg

In [3]:
# avg bias feature

def gen_features(data, tag_dict, return_y=True, default=1):
    """Build a one-column feature matrix holding each record's mean tag value.

    Parameters
    ----------
    data : iterable of (ts, tags, y)
        Records to featurize. ``ts`` is ignored, ``tags`` is an iterable of
        tag keys and ``y`` is the record's label.
    tag_dict : mapping
        Maps a tag to a numeric value.
    return_y : bool, default True
        When True the labels are returned alongside the features.
    default : number, default 1
        Value used for a tag missing from ``tag_dict``, for a record that has
        no tags at all, and as the replacement for any NaN mean.

    Returns
    -------
    feature_col : numpy.ndarray of shape (n_records, 1), dtype float
        Mean of the looked-up tag values for every record.
    Y : numpy.ndarray of shape (n_records,)
        Labels in record order; only returned when ``return_y`` is True.
    """
    feature_col = list()
    Y = list()

    for ts, tags, y in data:
        feature_row = [tag_dict.get(tag, default) for tag in tags]

        # A record without tags has nothing to average. np.mean([]) would emit
        # a RuntimeWarning and produce NaN, so fall back to the default directly.
        feature_col.append(np.mean(feature_row) if feature_row else default)
        Y.append(y)

    # Force a float array so the result dtype does not depend on the inputs.
    feature_col = np.array(feature_col, dtype=float)
    # NaN can still appear when tag_dict itself stores NaN values.
    feature_col[np.isnan(feature_col)] = default
    feature_col = feature_col.reshape(-1, 1)

    Y = np.array(Y)

    if return_y:
        return feature_col, Y

    return feature_col
            

# Models

In [8]:
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

## Tag Like

In [25]:
# One feature per record: the mean tag_like value over the record's tags.
feature_col_tag_like_tr, Y_tr = gen_features(train, tag_like)
feature_col_tag_like_ts, Y_ts = gen_features(test, tag_like)

# Baseline: score the test records with the raw feature alone, no model.
raw_roc = roc_auc_score(Y_ts, feature_col_tag_like_ts)

# Gradient-boosted classifier on the single feature; fit() returns the estimator.
mdl = LGBMClassifier(random_state=0).fit(feature_col_tag_like_tr, Y_tr)
model_roc = roc_auc_score(
    Y_ts,
    mdl.predict_proba(feature_col_tag_like_ts)[:, 1],  # second predict_proba column
)

print('ROC AUC - Feature {} - Model = {}'.format(raw_roc, model_roc))

ROC AUC - Feature 0.9488402271069403 - Model = 0.9458395913396276


## Tag Bias Factor

In [18]:
# One feature per record: the mean tag_bias_factor value over the record's tags.
feature_col_tag_bias_factor_tr, Y_tr = gen_features(train, tag_bias_factor)
feature_col_tag_bias_factor_ts, Y_ts = gen_features(test, tag_bias_factor)

# Baseline: score the test records with the raw feature alone, no model.
raw_roc = roc_auc_score(Y_ts, feature_col_tag_bias_factor_ts)

# Gradient-boosted classifier on the single feature; fit() returns the estimator.
mdl = LGBMClassifier(random_state=0).fit(feature_col_tag_bias_factor_tr, Y_tr)
model_roc = roc_auc_score(
    Y_ts,
    mdl.predict_proba(feature_col_tag_bias_factor_ts)[:, 1],  # second predict_proba column
)

print('ROC AUC - Feature {} - Model = {}'.format(raw_roc, model_roc))

ROC AUC - Feature 0.948840165329978 - Model = 0.9468058190686209


## Tag P-Values

In [19]:
# One feature per record: the mean tag_pvals value over the record's tags.
feature_col_tag_pvals_tr, Y_tr = gen_features(train, tag_pvals)
feature_col_tag_pvals_ts, Y_ts = gen_features(test, tag_pvals)

# Baseline: score the test records with the raw feature alone, no model.
raw_roc = roc_auc_score(Y_ts, feature_col_tag_pvals_ts)

# Gradient-boosted classifier on the single feature; fit() returns the estimator.
mdl = LGBMClassifier(random_state=0).fit(feature_col_tag_pvals_tr, Y_tr)
model_roc = roc_auc_score(
    Y_ts,
    mdl.predict_proba(feature_col_tag_pvals_ts)[:, 1],  # second predict_proba column
)
print('ROC AUC - Feature {} - Model = {}'.format(raw_roc, model_roc))

ROC AUC - Feature 0.5475726317925005 - Model = 0.6724005932482887


## Tag Bias Factor Weighted

In [21]:
# One feature per record: the mean tag_bias_factor_weighted value over the record's tags.
feature_col_tag_bias_factor_weighted_tr, Y_tr = gen_features(train, tag_bias_factor_weighted)
feature_col_tag_bias_factor_weighted_ts, Y_ts = gen_features(test, tag_bias_factor_weighted)

# Baseline: score the test records with the raw feature alone, no model.
raw_roc = roc_auc_score(Y_ts, feature_col_tag_bias_factor_weighted_ts)

# Gradient-boosted classifier on the single feature; fit() returns the estimator.
mdl = LGBMClassifier(random_state=0).fit(feature_col_tag_bias_factor_weighted_tr, Y_tr)
model_roc = roc_auc_score(
    Y_ts,
    mdl.predict_proba(feature_col_tag_bias_factor_weighted_ts)[:, 1],  # second predict_proba column
)
print('ROC AUC - Feature {} - Model = {}'.format(raw_roc, model_roc))

ROC AUC - Feature 0.9185109972877444 - Model = 0.9218451926156266


## Tag Likes + Tag P-values

In [22]:
# Two-column design matrices: [mean tag_like, mean tag_pvals] per record.
# Y_tr / Y_ts are the labels left over from the preceding feature cells.
train_columns = [feature_col_tag_like_tr, feature_col_tag_pvals_tr]
test_columns = [feature_col_tag_like_ts, feature_col_tag_pvals_ts]
mx_tr = np.hstack(train_columns)
mx_ts = np.hstack(test_columns)

# Baseline: the plain average of the two feature columns used as a score.
raw_roc = roc_auc_score(Y_ts, mx_ts.mean(axis=1))

# Gradient-boosted classifier on both features; fit() returns the estimator.
mdl = LGBMClassifier(random_state=0).fit(mx_tr, Y_tr)
model_roc = roc_auc_score(Y_ts, mdl.predict_proba(mx_ts)[:, 1])
print('ROC AUC - Feature {} - Model = {}'.format(raw_roc, model_roc))

ROC AUC - Feature 0.9209065946330811 - Model = 0.9440787574313567


## Tag Likes Sparse Matrix

In [26]:
from scipy.sparse import csr_matrix

In [27]:
def like_to_matrix(raw_data, tag_dict):
    """Encode records as a sparse (n_records x n_tags) matrix of tag values.

    Parameters
    ----------
    raw_data : sequence of (ts, tags, y)
        Records to encode; must support ``len()``. ``ts`` is ignored.
    tag_dict : dict
        Maps a tag to the value stored in that tag's column.

    Returns
    -------
    mx : scipy.sparse.csr_matrix
        Row ``i`` holds ``tag_dict[tag]`` in the column of every known tag of
        record ``i``. Tags absent from ``tag_dict`` are skipped.
    Y : numpy.ndarray
        Labels in record order.
    """
    # Column layout follows the sorted keys, so it depends only on tag_dict.
    ordered_tags = sorted(tag_dict.keys())
    tag_to_col = {tag: position for position, tag in enumerate(ordered_tags)}

    values, row_ids, col_ids, labels = [], [], [], []
    for row_id, (_, tags, label) in enumerate(raw_data):
        for tag in tags:
            if tag not in tag_to_col:
                continue  # tag without a column: contributes nothing
            values.append(tag_dict[tag])
            row_ids.append(row_id)
            col_ids.append(tag_to_col[tag])
        labels.append(label)

    matrix_shape = (len(raw_data), len(ordered_tags))
    mx = csr_matrix((values, (row_ids, col_ids)), shape=matrix_shape)

    return mx, np.array(labels)

In [31]:
# Sparse encoding: one column per tag_like key instead of a single averaged feature.
mx_tr, Y_tr = like_to_matrix(train, tag_like)
mx_ts, Y_ts = like_to_matrix(test, tag_like)

mdl = LGBMClassifier(random_state=0)
mdl.fit(mx_tr, Y_tr)

# Baseline score: the row mean over all tag columns. On a scipy sparse matrix
# .mean(axis=1) returns an (n, 1) np.matrix, which recent scikit-learn input
# validation rejects, so flatten it to a plain 1-D ndarray first.
raw_roc = roc_auc_score(Y_ts, np.asarray(mx_ts.mean(axis=1)).ravel())
model_roc = roc_auc_score(Y_ts, mdl.predict_proba(mx_ts)[:,1])
print('ROC AUC - Feature {} - Model = {}'.format(raw_roc, model_roc))

ROC AUC - Feature 0.9023124033396461 - Model = 0.8685419238647535


In [33]:
# Sparse encoding: one column per tag_pvals key instead of a single averaged feature.
mx_tr, Y_tr = like_to_matrix(train, tag_pvals)
mx_ts, Y_ts = like_to_matrix(test, tag_pvals)

mdl = LGBMClassifier(random_state=0)
mdl.fit(mx_tr, Y_tr)

# Baseline score: the row mean over all tag columns. On a scipy sparse matrix
# .mean(axis=1) returns an (n, 1) np.matrix, which recent scikit-learn input
# validation rejects, so flatten it to a plain 1-D ndarray first.
raw_roc = roc_auc_score(Y_ts, np.asarray(mx_ts.mean(axis=1)).ravel())
model_roc = roc_auc_score(Y_ts, mdl.predict_proba(mx_ts)[:,1])
print('ROC AUC - Feature {} - Model = {}'.format(raw_roc, model_roc))

ROC AUC - Feature 0.5606185568687417 - Model = 0.7548779501382485
