# start

In [94]:
# Run identifier; it also names the output directory created further down.
PRFX = 'mdl0331_1'
# Numeric suffixes of the input files (the training file is trn_{trntmstmp}.tsv).
trntmstmp = 1584412344
valtmstmp = 1585761578

# Epoch-second bounds of the training and validation windows.
tm_trn_min = 1580947200
tm_trn_max = 1581551999
tm_val_min = 1581552000
tm_val_max = 1582156799

SEED = 101

# Number of rows in the validation slice and in each per-target training sample.
valsz = 1_000
trnsz = 10_000
# Minimum positive rate enforced when sampling training rows for a target.
pos_rate_at_least = 0.1

train_total = 150_000_000
test_total = 14_000_000
print(f'train_total {train_total:,}, test_total {test_total:,}')


import datetime
def showtm(tm):
    """Format epoch seconds `tm` as 'YYYY-MM-DD HH:MM:SS' in UTC.

    The conversion is pinned to UTC so the printed value does not depend on
    the timezone of the machine running the notebook, and so it agrees with
    pd.to_datetime(..., unit='s'), which also interprets epoch seconds as UTC.
    """
    utc_dt = datetime.datetime.fromtimestamp(tm, tz=datetime.timezone.utc)
    return utc_dt.strftime('%Y-%m-%d %H:%M:%S')
# One line per pair: file timestamps, training window, validation window.
for tm_pair in (
    (trntmstmp, valtmstmp),
    (tm_trn_min, tm_trn_max),
    (tm_val_min, tm_val_max),
):
    print(list(map(showtm, tm_pair)))

train_total 150,000,000, test_total 14,000,000
['2020-03-17 02:32:24', '2020-04-01 17:19:38']
['2020-02-06 00:00:00', '2020-02-12 23:59:59']
['2020-02-13 00:00:00', '2020-02-19 23:59:59']


# setup

In [95]:
from tqdm import tqdm
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc, precision_recall_curve
import pickle
import xgboost as xgb
import lightgbm as lgb

# Let rendered DataFrames show up to 500 rows and 500 columns.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500


import datetime
def dtnow():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return f'{datetime.datetime.now():%Y-%m-%d %H:%M:%S}'

def iou(s1,s2):
    """Intersection-over-union (Jaccard index) of two sets.

    Returns len(s1 & s2) / len(s1 | s2). When both sets are empty the union
    is empty too; 0.0 is returned in that case instead of raising
    ZeroDivisionError.
    """
    union = s1 | s2
    if not union:
        return 0.0
    return len(s1 & s2) / len(union)

# Project root, input directory, and the per-run output directory.
HOME = '/data/git/recsys20'
p_in = '/'.join([HOME, 'input'])
p_out = '/'.join([HOME, 'output', PRFX])
# Create the output directory (and any missing parents); no error if it already exists.
Path(p_out).mkdir(parents=True, exist_ok=True)

from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(pred, gt):
    """Area under the precision-recall curve for scores `pred` against labels `gt`."""
    # precision_recall_curve also returns the thresholds, which are not needed here.
    precision, recall, _ = precision_recall_curve(gt, pred)
    return auc(recall, precision)

def calculate_ctr(gt):
    """Return the fraction of entries in `gt` that are equal to 1."""
    n_positive = sum(1 for label in gt if label == 1)
    return n_positive / float(len(gt))

def compute_rce(pred, gt):
    """Relative cross-entropy improvement of `pred` over a constant baseline, in percent.

    The baseline predicts the positive rate of `gt` (see calculate_ctr) for
    every sample. The result is (1 - CE(pred) / CE(baseline)) * 100.
    """
    model_ce = log_loss(gt, pred)
    baseline_rate = calculate_ctr(gt)
    baseline_pred = [baseline_rate] * len(gt)
    baseline_ce = log_loss(gt, baseline_pred)
    return (1.0 - model_ce / baseline_ce) * 100.0

# Column names assigned to the headerless input TSV, in file order.
cols = [
    'toks', 'hshtgs', 'twtid', 'media', 'links', 'domns', 'twttyp', 'lang', 'tm',
    'u1id', 'u1_fllwer_cnt', 'u1_fllwing_cnt', 'u1_vrfed', 'u1_create_tm',
    'u2id', 'u2_follower_cnt', 'u2_following_cnt', 'u2_vrfed', 'u2_create_tm',
    'u1_fllw_u2',
    'reply_tm', 'retwt_tm', 'retwt_cmmnt_tm', 'like_tm',
]

# Same list without the four trailing engagement-timestamp columns.
cols_val = cols[:len(cols) - 4]
cols_tgt_tmstmp = ['retwt_tm', 'reply_tm', 'like_tm', 'retwt_cmmnt_tm']
# Target indicator column names: the timestamp column names minus the '_tm' suffix.
cols_tgt = [name[:-len('_tm')] for name in cols_tgt_tmstmp]
tgts = ['Retweet', 'Reply', 'Like', 'RTwCmnt']
# Guard the positional pairing of tgts with cols_tgt used to build tgt2col.
assert cols_tgt == ['retwt', 'reply', 'like', 'retwt_cmmnt']
tgt2col = {tgt: col for tgt, col in zip(tgts, cols_tgt)}
print(tgt2col)
ntgts = len(tgts)

{'Retweet': 'retwt', 'Reply': 'reply', 'Like': 'like', 'RTwCmnt': 'retwt_cmmnt'}


# prep

In [96]:
# Read the first 5 rows, then the same file as "first 2 rows" and "3 rows after
# skipping 2", so the nrows/skiprows split can be compared against the full read.
df, dfval, dftrn = (
    pd.read_csv(f'{p_in}/trn_{trntmstmp}.tsv', sep='\x01', header=None, names=cols, **window)
    for window in (
        dict(nrows=5),
        dict(nrows=2),
        dict(skiprows=2, nrows=3),
    )
)

In [97]:
# Tweet ids of the full read, the head slice and the skipped slice, in that order.
display(df['twtid'], dfval['twtid'], dftrn['twtid'])

0    D4D1EBDE74F74C5DA529959AF979625C
1    BFB529DAB6D384EB83E899A72AB3830D
2    519078C7834E9642508F72A6C2D0F3B7
3    52AAE9E33EFAC8C478C57B31A9E31ED1
4    89C1298C55EB3D68E2784F0BFB69E6F8
Name: twtid, dtype: object

0    D4D1EBDE74F74C5DA529959AF979625C
1    BFB529DAB6D384EB83E899A72AB3830D
Name: twtid, dtype: object

0    519078C7834E9642508F72A6C2D0F3B7
1    52AAE9E33EFAC8C478C57B31A9E31ED1
2    89C1298C55EB3D68E2784F0BFB69E6F8
Name: twtid, dtype: object

In [98]:
%%time
# Validation slice: the first valsz rows of the training file.
dfval = pd.read_csv(
    f'{p_in}/trn_{trntmstmp}.tsv',
    sep='\x01',
    header=None,
    names=cols,
    nrows=valsz,
)
dfval.shape

CPU times: user 14.7 ms, sys: 0 ns, total: 14.7 ms
Wall time: 13.2 ms


(1000, 24)

In [99]:
%%time
# Training pool: rows after the first valsz, 15x trnsz of them but at most one million.
# NOTE(review): presumably oversized so rare targets have enough positives to sample — confirm.
dftrn0 = pd.read_csv(
    f'{p_in}/trn_{trntmstmp}.tsv',
    sep='\x01',
    header=None,
    names=cols,
    skiprows=valsz,
    nrows=min(int(1e6), trnsz * 15),
)
dftrn0.shape

CPU times: user 1.18 s, sys: 99.2 ms, total: 1.28 s
Wall time: 1.28 s


(150000, 24)

In [100]:
def prp_tm(df, col):
    """Add datetime-derived columns for the epoch-seconds column `col`, in place.

    Creates `<col>_dt` (parsed with unit='s') followed by `<col>_dayofweek`,
    `<col>_day` and `<col>_hour`. Nothing is returned; `df` itself is modified.
    """
    parsed = pd.to_datetime(df[col], unit='s')
    df[col + '_dt'] = parsed
    for part in ('dayofweek', 'day', 'hour'):
        df[col + '_' + part] = getattr(parsed.dt, part)

def prpdf(df,istrn=True):
    """Add engineered feature columns to a raw engagement frame, in place.

    Parameters
    ----------
    df : DataFrame holding the raw columns (tag columns, 'toks', 'tm',
        'u1_create_tm', 'u2_create_tm', and, when `istrn` is true, the
        engagement timestamp columns listed in `cols_tgt_tmstmp`).
    istrn : when true, time-left features are measured against `tm_trn_max`
        and the boolean target columns plus 'enged' are added; otherwise
        `tm_val_max` is used and no target columns are created.

    Returns
    -------
    The same `df` object that was passed in (it is modified, not copied).

    Calling the function again on a frame it has already processed yields the
    same columns: tag presence is tested against both NaN and '', and an
    already-split 'toks' list is passed through unchanged.
    """
    tags = ['hshtgs','media','links','domns',]
    for tag in tags:
        # Compare against '' as well as NaN: after the fillna below a missing
        # tag is stored as '', so notna() alone would flip every flag to True
        # on a second call.
        df[f'has_{tag}'] = df[tag].notna() & (df[tag] != '')
        df[tag] = df[tag].fillna('')
        df[f'lst_{tag}'] = df[tag].apply(lambda x: x.split('\t') if len(x) else [])
        df[f'n_{tag}'] = df[f'lst_{tag}'].apply(len)

    # Tokens arrive as one tab-separated string; lists from an earlier call are kept as they are.
    df['toks'] = df.toks.apply(lambda x: x if isinstance(x, list) else x.split('\t'))
    df['len_toks'] = df.toks.apply(len)

    prp_tm(df, 'tm')

    # Differences between account-creation times and the tweet time.
    df['tmdlta_u2u1']  = df.u2_create_tm - df.u1_create_tm
    df['tmdlta_twtu1'] = df.tm - df.u1_create_tm
    df['tmdlta_twtu2'] = df.tm - df.u2_create_tm

    # Time remaining until the end of the window the frame belongs to.
    tm_max = tm_trn_max if istrn else tm_val_max
    df['tmleft_twt'] = tm_max - df.tm
    df['tmleft_u1']  = tm_max - df.u1_create_tm
    df['tmleft_u2']  = tm_max - df.u2_create_tm

    if istrn:
        # A target is positive when its engagement timestamp is present.
        # cols_tgt and cols_tgt_tmstmp are paired by position.
        df[cols_tgt] = df[cols_tgt_tmstmp].notna()
        # True when at least one of the engagement targets is positive.
        df['enged'] = df[cols_tgt].any(axis=1)
    return df

In [101]:
%%time
# prpdf adds its columns to the frame it is given and returns that same frame.
dftrn0 = prpdf(dftrn0)

CPU times: user 1.56 s, sys: 228 ms, total: 1.79 s
Wall time: 1.78 s


In [102]:
%%time
# istrn=False: time-left features are measured against tm_val_max and no target columns are added.
dfval = prpdf(dfval,istrn=False)

CPU times: user 266 ms, sys: 4.43 ms, total: 270 ms
Wall time: 263 ms


In [103]:
# Positive rate of each target: the mean of the boolean target columns set by prpdf.
dftrn0[cols_tgt].mean()

retwt          0.112273
reply          0.027487
like           0.438047
retwt_cmmnt    0.007887
dtype: float64

In [104]:
# Draw one set of trnsz positional row indices into dftrn0 per target and store
# it in tgt2idxtrn. All draws use the global NumPy RNG seeded here, so the
# result depends on the order of the calls below.
np.random.seed(SEED)
tgt2idxtrn = {}
for tgt,col in tgt2col.items(): 
    pos_rate = dftrn0[col].mean()
    if pos_rate>=pos_rate_at_least:
        # Target is frequent enough: uniform sample without replacement.
        idxtrn = np.random.choice(range(len(dftrn0)), trnsz, replace=False)
    else:
        # Rare target: take a fixed number of positives and fill the rest with
        # negatives, so the sample's positive rate is npos_at_least / trnsz.
        npos_at_least=int(trnsz*pos_rate_at_least)
        idx_pos = np.where(dftrn0[col])[0]
        idx_neg = np.where(~dftrn0[col])[0]
        # With replace=False, np.random.choice raises ValueError when the pool
        # is smaller than the requested sample size.
        idxtrn_pos = np.random.choice(idx_pos, npos_at_least,       replace=False)
        idxtrn_neg = np.random.choice(idx_neg, trnsz-npos_at_least, replace=False)
        idxtrn = np.concatenate([idxtrn_pos,idxtrn_neg])
        # Positives and negatives are disjoint and each drawn without
        # replacement, so no index may appear twice.
        assert len(idxtrn)==len(set(idxtrn))
        # The concatenation puts all positives first; shuffle to mix them.
        np.random.shuffle(idxtrn)
    print(col)
    # dftrn is rebound on every iteration; within this cell it only feeds the
    # summary print, and after the loop it holds the last target's sample.
    dftrn=dftrn0.iloc[idxtrn]
    print(len(dftrn), dftrn[col].mean())
    tgt2idxtrn[tgt]=idxtrn    

retwt
10000 0.1163
reply
10000 0.1
like
10000 0.4345
retwt_cmmnt
10000 0.1
