- http://matthewrocklin.com/blog/work/2017/03/28/dask-xgboost
- https://examples.dask.org/machine-learning/xgboost.html#Learn-more

In [2]:
# Run prefix: names the output sub-directory and is embedded in the output file names.
PRFX = 'prep0316_1'

# setup

In [3]:
from pathlib import Path
import dask.dataframe as dd
import pandas as pd
import dask
print('dask.__version__', dask.__version__)
import xgboost
import datetime
def dtnow():
    """Return the current local time as a 'YYYY-MM-DD HH:MM:SS' string."""
    current = datetime.datetime.now()
    return current.strftime('%Y-%m-%d %H:%M:%S')


SEED = 101
# Project root; input data and per-run outputs live underneath it.
HOME = '/data/git/recsys20'
p_in = f'{HOME}/input'
p_out = f'{HOME}/output/{PRFX}'
# Create the output directory for this run (and any missing parents) up front.
Path(p_out).mkdir(parents=True, exist_ok=True)

# Column names for the header-less training file, in file order.
cols = [
    # tweet features
    'text_tokens', 'hashtags', 'tweet_id',
    'present_media', 'present_links', 'present_domains',
    'tweet_type', 'language', 'timestamp',
    # engaged-with user features
    'engaged_user_id', 'engaged_follower_count', 'engaged_following_count',
    'engaged_is_verified', 'engaged_account_creation_time',
    # engaging user features
    'engaging_user_id', 'engaging_follower_count', 'engaging_following_count',
    'engaging_is_verified', 'engaging_account_creation_time',
    # engagement features
    'engagee_follows_engager',
    # engagement timestamps (the four trailing target columns)
    'reply_engagement_timestamp',
    'retweet_engagement_timestamp',
    'retweet_with_comment_engagement_timestamp',
    'like_engagement_timestamp',
]

# Engagement timestamp columns, in the order the targets are reported.
cols_tgt_tmstmp = [
    'retweet_engagement_timestamp',
    'reply_engagement_timestamp',
    'like_engagement_timestamp',
    'retweet_with_comment_engagement_timestamp',
]

# The validation file uses the same layout minus the trailing target columns.
cols_val = cols[:-len(cols_tgt_tmstmp)]

# Short target names: the part of each column name before '_engagement_timestamp'.
cols_tgt = [name.partition('_engagement_timestamp')[0] for name in cols_tgt_tmstmp]
print(cols_tgt)

dask.__version__ 2.12.0
['retweet', 'reply', 'like', 'retweet_with_comment']


In [4]:
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(pred, gt):
    """Return the area under the precision-recall curve of `pred` against labels `gt`."""
    precision, recall, _ = precision_recall_curve(gt, pred)
    return auc(recall, precision)

def calculate_ctr(gt):
    """Return the fraction of entries in `gt` that are equal to 1."""
    n_positive = sum(1 for label in gt if label == 1)
    return n_positive / float(len(gt))

def compute_rce(pred, gt):
    """Return the percentage log-loss improvement of `pred` over a constant baseline.

    The baseline predicts, for every row, the fraction of positives in `gt`
    (as given by `calculate_ctr`). The result is
    ``(1 - log_loss(pred) / log_loss(baseline)) * 100``.
    """
    model_ce = log_loss(gt, pred)
    positive_rate = calculate_ctr(gt)
    baseline_pred = [positive_rate] * len(gt)
    baseline_ce = log_loss(gt, baseline_pred)
    return (1.0 - model_ce/baseline_ce)*100.0

# prepare data

In [5]:
# (rcss20) ➜  recsys20 git:(master) ✗ wc -l input/trn.tsv
#  148,075,238 data/trn.tsv
# (rcss20) ➜  recsys20 git:(master) ✗ wc -l input/val.tsv
#  15,127,684 data/val.tsv


In [6]:
# Unix timestamp used as the suffix of the training file names.
trntmstmp = 1584412344
# Shown in the machine's local time zone.
f'{datetime.datetime.fromtimestamp(trntmstmp):%Y-%m-%d %H:%M:%S}'

'2020-03-17 02:32:24'

In [7]:
# Unix timestamp used as the suffix of the validation file names.
valtmstmp = 1584405047
# Shown in the machine's local time zone.
f'{datetime.datetime.fromtimestamp(valtmstmp):%Y-%m-%d %H:%M:%S}'

'2020-03-17 00:30:47'

In [8]:
# Write the first 10,000 training rows to a small sample file for quick iteration.
!head -n 10000 {p_in}/trn_{trntmstmp}.tsv > {p_in}/trn_{trntmstmp}_1e4.tsv

In [9]:
# Write the first 1,000 validation rows to a small sample file for quick iteration.
!head -n 1000 {p_in}/val_{valtmstmp}.tsv > {p_in}/val_{valtmstmp}_1e3.tsv

In [10]:
# List the input directory to confirm the sample files written above exist.
ls $p_in

[0m[01;32mdownloads.ipynb[0m*        [01;32mtrn_1584412344_1e7.tsv[0m*  val_1584405047_1e3.tsv
[01;32mtmp.ipynb[0m*              trn_1584412344_5e7.tsv   [01;32mval_1584405047.tsv[0m*
trn_1584412344_1e3.tsv  [01;32mtrn_1584412344.tsv[0m*      [01;32mval1e4.tsv[0m*
trn_1584412344_1e4.tsv  [01;32mtrn1e5.tsv[0m*


In [11]:
# Lazily load the 10k-row training sample. The file has no header row and uses
# the \x01 control character as field separator.
dftrn = dd.read_csv(
    f'{p_in}/trn_{trntmstmp}_1e4.tsv',  # full data: f'{p_in}/trn_{trntmstmp}.tsv'
    sep='\x01',
    header=None,
    names=cols,
)

In [12]:
# Lazily load the 1k-row validation sample. Same layout as the training file,
# but named with cols_val, i.e. without the four engagement timestamp columns.
dfval = dd.read_csv(
    f'{p_in}/val_{valtmstmp}_1e3.tsv',  # full data: f'{p_in}/val_{valtmstmp}.tsv'
    sep='\x01',
    header=None,
    names=cols_val,
)

In [13]:
def _count_tab_separated(value):
    """Return the number of tab-separated items in `value`; an empty string counts as 0."""
    return len(value.split('\t')) if value else 0


def prep(df, istrn):
    """Derive model features (and, for training data, binary targets) from a raw frame.

    Parameters
    ----------
    df : dask DataFrame
        Raw data as loaded with ``dd.read_csv``. The derived feature columns
        (and the target columns when ``istrn`` is true) are assigned onto
        ``df`` itself.
    istrn : bool
        When true, one 0/1 column per name in the module-level ``cols_tgt`` is
        added: 1 where the matching column of ``cols_tgt_tmstmp`` is not null,
        otherwise 0.

    Returns
    -------
    dask DataFrame
        The numeric feature columns, followed by the one-hot encoded
        ``tweet_type`` columns, followed by the target columns when ``istrn``
        is true.
    """
    # NOTE(review): len() of the raw string counts characters, not tokens —
    # confirm this is the intended feature.
    df['len_toks'] = df.text_tokens.apply(len, meta=('len_toks', 'int64'))

    media_filled = df.present_media.fillna('')
    for media in ['Photo', 'Video', 'GIF']:
        # Bind `media` as a default argument: dask only calls the lambda when
        # the graph is computed, long after this loop has finished. A plain
        # closure would see the last loop value ('GIF') for all three columns.
        df[f'has_media_{media}'] = media_filled.apply(
            lambda x, media=media: media in x,
            meta=(f'has_media_{media}', 'bool'))

    # Item counts for the tab-separated list columns.
    for src_col, count_col in [('hashtags', 'num_hashtags'),
                               ('present_links', 'num_links'),
                               ('present_domains', 'num_domains')]:
        df[count_col] = df[src_col].fillna('').apply(
            _count_tab_separated, meta=(count_col, 'int64'))

    df['tmdelta_engng_enged'] = df.engaging_account_creation_time - df.engaged_account_creation_time
    df['tmdelta_enged_tweet'] = df.engaged_account_creation_time - df.timestamp
    df['tmdelta_engng_tweet'] = df.engaging_account_creation_time - df.timestamp

    cols_num = ['timestamp', 'engaged_follower_count', 'engaged_following_count',
                'engaged_is_verified', 'engaged_account_creation_time', 'engaging_follower_count',
                'engaging_following_count', 'engaging_is_verified',
                'engaging_account_creation_time', 'engagee_follows_engager',
                'len_toks', 'has_media_Photo', 'has_media_Video', 'has_media_GIF',
                'num_hashtags', 'num_links', 'num_domains',
                'tmdelta_engng_enged', 'tmdelta_enged_tweet', 'tmdelta_engng_tweet']
    cols_cat0 = [
        'tweet_type',
    #         'language',
    ]
    df_num = df[cols_num]
    # NOTE(review): the dummy columns come from the categories found in this
    # frame, so train and validation outputs may differ in columns if a
    # tweet_type value is absent from one of them — verify downstream.
    df_cat = dd.get_dummies(df[cols_cat0].categorize())

    parts = [df_num, df_cat]
    if istrn:
        # A non-null engagement timestamp means the engagement happened.
        df[cols_tgt] = df[cols_tgt_tmstmp].notnull().astype(int)
        parts.append(df[cols_tgt])
    return dd.concat(parts, axis=1)
        


In [14]:
# Build the feature frames; only the training frame gets target columns.
dftrn_prep = prep(dftrn, istrn=True)
dfval_prep = prep(dfval, istrn=False)

We're assuming that the indexes of each dataframes are 
 aligned. This assumption is not generally safe.
  "Concatenating dataframes with unknown divisions.\n"


In [16]:
dftrn_prep.columns

Index(['timestamp', 'engaged_follower_count', 'engaged_following_count',
       'engaged_is_verified', 'engaged_account_creation_time',
       'engaging_follower_count', 'engaging_following_count',
       'engaging_is_verified', 'engaging_account_creation_time',
       'engagee_follows_engager', 'len_toks', 'has_media_Photo',
       'has_media_Video', 'has_media_GIF', 'num_hashtags', 'num_links',
       'num_domains', 'tmdelta_engng_enged', 'tmdelta_enged_tweet',
       'tmdelta_engng_tweet', 'tweet_type_TopLevel', 'tweet_type_Retweet',
       'tweet_type_Quote', 'retweet', 'reply', 'like', 'retweet_with_comment'],
      dtype='object')

In [25]:
%%time
dftrn_prep.to_csv(f'{p_out}/dftrn_prep_{trntmstmp}__{PRFX}.csv',index=False,single_file=True)

CPU times: user 278 ms, sys: 71 ms, total: 349 ms
Wall time: 344 ms


['/data/git/recsys20/output/prep0316_1/dftrn_prep_1584412344__prep0316_1.csv']

In [26]:
%%time
dfval_prep.to_csv(f'{p_out}/dfval_prep_{valtmstmp}__{PRFX}.csv',index=False,single_file=True)

CPU times: user 104 ms, sys: 0 ns, total: 104 ms
Wall time: 101 ms


['/data/git/recsys20/output/prep0316_1/dfval_prep_1584405047__prep0316_1.csv']