# setup

In [1]:
PRFX = 'mdl0308_1'

SEED = 101

# import torch
# from transformers import *

import numpy as np
import pandas as pd
from pathlib import Path
# pd.set_option('display.max_columns', 500)
# import dask.dataframe as dd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, average_precision_score


HOME='/Users/yang.zhang/git/recsys20/'
p_in=f'{HOME}/input'
p_out=f'{HOME}/output/{PRFX}'
Path(p_out).mkdir(exist_ok=True)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# (rcss20) ➜  recsys20 git:(master) ✗ wc -l data/training.tsv
#  148,075,238 data/training.tsv
# (rcss20) ➜  recsys20 git:(master) ✗ wc -l data/val.tsv
#  15,127,684 data/val.tsv
# !head -100000 {p_in}/trn.tsv > {p_in}/trn1e5.tsv
# !head -10000 {p_in}/val.tsv > {p_in}/val1e4.tsv


In [2]:
cols_tgt   = ['did_rtwt', 'did_rply','did_like','did_cmmt',            ]
cols_subnm = ['Retweet' , 'Reply',   'Like',    'Retweet with comment',]
ntgts = len(cols_tgt)

# prep

In [3]:
cols=[
'Text tokens',
'Hashtags',
'Tweet id',
'Present media',
'Present links',
'Present domains',
'Tweet type',
'Language',
'Timestamp',
    
'Engaged User id',
'Engaged Follower count',
'Engaged Following count',
'Engaged Is verified?',
'Engaged Account creation time',
    
'Engaging User id',
'Engaging Follower count',
'Engaging Following count',
'Engaging Is verified?',
'Engaging Account creation time',
    
'Engagee follows engager?',
'Reply engagement timestamp',
'Retweet engagement timestamp',
'Retweet with comment engagement timestamp',
'Like engagement timestamp',
]
cols_val=cols[:-4]
cols_time=[
'Timestamp',
'Engaged Account creation time',
'Engaging Account creation time',
'Reply engagement timestamp',
'Retweet engagement timestamp',
'Retweet with comment engagement timestamp',
'Like engagement timestamp',
]

In [4]:
df=pd.read_csv(f'{p_in}/trn1e5.tsv',
               sep='\x01',
               header=None,
               usecols=range(1,len(cols)),
               names=cols[1:],)

In [29]:
df['did_rtwt']=df['Retweet engagement timestamp'].notna()
df['did_rply']=df['Reply engagement timestamp'].notna()
df['did_cmmt']=df['Retweet with comment engagement timestamp'].notna()
df['did_like']=df['Like engagement timestamp'].notna()

In [30]:
y=df[cols_tgt].astype(int).values

## make features

In [39]:
X=df[['Engaged Follower count',
      'Engaging Following count',
      'Engaging Is verified?', 
      'Engagee follows engager?']].astype(int).values


In [40]:
X.shape

(100000, 4)

In [79]:
def makeX(df):
    X=df[['Engaged Follower count',
          'Engaging Following count',
          'Engaging Is verified?', 
          'Engagee follows engager?']].values
    has_media=df['Present media'].notna().values[:,None]
    X = np.concatenate([X,has_media],1)
    has_media=df['Present media'].notna().values[:,None]
    X = np.concatenate([X,has_media],1)
    
    return X

In [80]:
X = makeX(df)

# model

## trnval split

In [82]:
np.random.seed(SEED)
mskval=np.random.rand(len(y))<0.2
val=np.where(mskval)[0]
trn=np.where(~mskval)[0]

# print(y.mean(0), y[trn].mean(0), y[val].mean(0))

Xtrn,Xval,ytrn,yval=X[trn],X[val],y[trn],y[val]

## train

In [83]:
clf = RandomForestClassifier( 
                             min_samples_leaf=2,
                             max_features=0.5,
                             n_jobs=-1,
                             random_state=SEED)
clf.fit(Xtrn, ytrn)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=0.5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=101, verbose=0,
                       warm_start=False)

In [84]:
def get_scrs(y,prd):
    return [average_precision_score(y[:,i],prd[:,i]) for i in range(y.shape[1])]

In [94]:
def show_scrs(scrs):
    print(list(zip(cols_subnm, [round(o,4) for o in scrs])))
    print(np.mean(scrs))


In [95]:
prdval_rnd = np.random.rand(*yval.shape)
prdval_avg = np.repeat(yval.mean(0)[None],  len(yval), 0)
show_scrs(get_scrs(yval,prdval_rnd))
show_scrs(get_scrs(yval,prdval_avg))

[('Retweet', 0.1147), ('Reply', 0.0301), ('Like', 0.4433), ('Retweet with comment', 0.0095)]
0.14941735529738157
[('Retweet', 0.1131), ('Reply', 0.028), ('Like', 0.4387), ('Retweet with comment', 0.0088)]
0.14713332664526485


In [96]:
prdval_bool = clf.predict(Xval)
prdval_probval = clf.predict_proba(Xval)
prdval = np.array([o[:,1] for o in prdval_probval]).T

In [97]:
show_scrs(get_scrs(yval,prdval))

[('Retweet', 0.1541), ('Reply', 0.0394), ('Like', 0.498), ('Retweet with comment', 0.0168)]
0.17706926438691906


# infer

In [21]:
dftst=pd.read_csv(f'{p_in}/val1e4.tsv',
                  sep='\x01',
               header=None,
               names=cols_val)

In [22]:
dftst.head()

Unnamed: 0,Text tokens,Hashtags,Tweet id,Present media,Present links,Present domains,Tweet type,Language,Timestamp,Engaged User id,Engaged Follower count,Engaged Following count,Engaged Is verified?,Engaged Account creation time,Engaging User id,Engaging Follower count,Engaging Following count,Engaging Is verified?,Engaging Account creation time,Engagee follows engager?
0,101\t47185\t10157\t100986\t10343\t55422\t119\t...,,7647B4E9DAF4C1D8973397DC2A04F3E3,Photo,,,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581703126,8A9AB92B775C62C4AB60DF6773A01571,13941,1216,False,1448292186,0000006C3074607050F1339DDCB890BB,27448,600,False,1520948869,True
1,101\t6006\t5086\t1939\t7418\t3601\t6406\t1913\...,,CCBFBA5AFE7EFC03102EA8D0F86C4208,Photo,,,TopLevel,22C448FF81263D4BAF2A176145EE9EAD,1581736431,187AC59639DA9A6F32F7CD118EDD58F7,476439,1478,False,1254447722,00000776B07587ECA9717BFC301F2D6E,102,659,False,1478011810,False
2,101\t56898\t137\t44851\t10317\t11490\t10112\t1...,,E18C2DCFC5AF20C650A0FD94598E69B7,Video,,,Retweet,ECED8A16BE2A5E8871FD55F4842F16B1,1582061925,82626B53CB2AD3B469E4AE06EAA9D930,367,702,False,1518708926,00000860E80C67D8C46CE57C64DE9444,230,189,False,1541013180,True
3,101\t13497\t10437\t94005\t11161\t73632\t11067\...,,26DC813FDF8546B757BB9141099F119E,,D58137F9D688C88435FD64FBAEA82B97,E91CDEC8DC7ABF30592FA024616FF970,TopLevel,ECED8A16BE2A5E8871FD55F4842F16B1,1582110043,7AFE06FF54898A1E9C716F539831849E,278,1229,False,1243548061,00000865A1538142CDA5936B07FE4311,65,165,False,1452599043,True
4,101\t24781\t10152\t42041\t38268\t10301\t10798\...,,30A33055566AAC9EB18734C4EAD11FE1,,AEF0CC9FA7B389B9A2ADF1331F00B65B,42DD9E2D4B2C0B0A71E909A6049EC2C2,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581860270,D240DACE38CA84965270C86D47D3BF40,24313527,121,True,1177506290,00000865A1538142CDA5936B07FE4311,64,164,False,1452599043,False


In [23]:
Xtst=makeX(dftst)

In [24]:
prdtst_bool = clf.predict(Xtst)
prdtst_prob = clf.predict_proba(Xtst)
prdtst = np.array([o[:,1] for o in prdtst_prob]).T

In [25]:
prdtst.shape

(10000, 4)

In [26]:
dfsub_ids = dftst[['Tweet id','Engaging User id',]]

col2dfsub = {}
for i,col in enumerate(cols_tgt):
    dfsub = dfsub_ids.copy()
    dfsub['scr'] = prdtst[:,i]
    col2dfsub[col]=dfsub

In [27]:
for col,colnm in zip(cols_tgt,cols_subnm):
    col2dfsub[col].to_csv(f'{p_out}/{colnm}__{PRFX}.csv', index=False, header=False)