# setup

In [1]:
import torch
from transformers import *

import pandas as pd
pd.set_option('display.max_columns', 500)
import dask.dataframe as dd


HOME='/Users/yang.zhang/git/recsys20/'
p_in=f'{HOME}/input'

pretrained_weights='bert-base-multilingual-cased'
bertmodel = BertModel.from_pretrained(pretrained_weights)
tokenizer = BertTokenizer.from_pretrained(pretrained_weights, do_lower_case=False)

# (rcss20) ➜  recsys20 git:(master) ✗ wc -l data/training.tsv
#  148,075,238 data/training.tsv
# (rcss20) ➜  recsys20 git:(master) ✗ wc -l data/val.tsv
#  15,127,684 data/val.tsv

# !head -500000 {p_in}/training.tsv > {p_in}/trn5e5.tsv
# !head -50000  {p_in}/val.tsv      > {p_in}/val5e4.tsv

    (rcss20) ➜  recsys20 git:(master) ✗ wc -l data/training.tsv
     148,075,238 data/training.tsv
    (rcss20) ➜  recsys20 git:(master) ✗ wc -l data/val.tsv
     15,127,684 data/val.tsv

# training.tsv

In [112]:
cols=[
'text_tokens',
'hashtags',
'tweet_id',
'present_media',
'present_links',
'present_domains',
'tweet_type',
'language',
'timestamp',

'engaged_user_id',
'engaged_follower_count',
'engaged_following_count',
'engaged_is_verified',
'engaged_account_creation_time',

'engaging_user_id',
'engaging_follower_count',
'engaging_following_count',
'engaging_is_verified',
'engaging_account_creation_time',

'engagee_follows_engager',
'reply_engagement_timestamp',
'retweet_engagement_timestamp',
'retweet_with_comment_engagement_timestamp',
'like_engagement_timestamp',
]
cols_val=cols[:-4]
cols_time=[
'timestamp',
'engaged_account_creation_time',
'engaging_account_creation_time',
'reply_engagement_timestamp',
'retweet_engagement_timestamp',
'retweet_with_comment_engagement_timestamp',
'like_engagement_timestamp',
]

In [113]:
# #https://docs.dask.org/en/latest/dataframe.html
# df=dd.read_csv(f'{p_in}/training10000.tsv',
#                sep='\x01',
#                header=None,
#                names=cols,)

df=pd.read_csv(f'{p_in}/trn5e5.tsv',
               sep='\x01',
               header=None,
#                usecols=range(1,len(cols)),names=cols[1:],
               names=cols,
               encoding='utf8'
              )

In [114]:
df['did_rtwt']=df.retweet_engagement_timestamp.notna()
df['did_rply']=df.reply_engagement_timestamp.notna()
df['did_cmmt']=df.retweet_with_comment_engagement_timestamp.notna()
df['did_like']=df.like_engagement_timestamp.notna()

In [115]:
cols_tgt=['did_rply',
'did_rtwt',
'did_cmmt',
'did_like',]

In [116]:
pd.to_datetime(df[cols_time[0]].tail(), unit='s')

499995   2020-02-09 06:47:55
499996   2020-02-07 15:15:47
499997   2020-02-11 13:59:43
499998   2020-02-08 17:52:45
499999   2020-02-12 03:14:27
Name: timestamp, dtype: datetime64[ns]

In [117]:
df['toks']=df.text_tokens.apply(lambda x: x.split('\t'))

In [118]:
" ".join(tokenizer.convert_ids_to_tokens(df.sample().toks.values[0]))

'[CLS] This is exactly what I want Justin [SEP]'

## timestamps

In [119]:
df_tm=df[cols_time].describe()
df_tm

Unnamed: 0,timestamp,engaged_account_creation_time,engaging_account_creation_time,reply_engagement_timestamp,retweet_engagement_timestamp,retweet_with_comment_engagement_timestamp,like_engagement_timestamp
count,500000.0,500000.0,500000.0,13990.0,56319.0,3838.0,219505.0
mean,1581249000.0,1397633000.0,1453507000.0,1581267000.0,1581257000.0,1581264000.0,1581261000.0
std,171459.7,115484000.0,101259200.0,171730.5,169048.4,169066.3,168697.5
min,1580947000.0,-1468800.0,-1468800.0,1580947000.0,1580947000.0,1580947000.0,1580947000.0
25%,1581106000.0,1294578000.0,1373851000.0,1581120000.0,1581116000.0,1581122000.0,1581119000.0
50%,1581252000.0,1398212000.0,1479414000.0,1581273000.0,1581257000.0,1581271000.0,1581261000.0
75%,1581395000.0,1506594000.0,1543766000.0,1581419000.0,1581402000.0,1581411000.0,1581407000.0
max,1581552000.0,1578950000.0,1578960000.0,1581552000.0,1581552000.0,1581551000.0,1581552000.0


In [120]:
pd.DataFrame(zip(df_tm.columns,pd.to_datetime(df_tm.loc['min'].values, unit='s')))

Unnamed: 0,0,1
0,timestamp,2020-02-06 00:00:00
1,engaged_account_creation_time,1969-12-15 00:00:00
2,engaging_account_creation_time,1969-12-15 00:00:00
3,reply_engagement_timestamp,2020-02-06 00:02:03
4,retweet_engagement_timestamp,2020-02-06 00:01:23
5,retweet_with_comment_engagement_timestamp,2020-02-06 00:02:25
6,like_engagement_timestamp,2020-02-06 00:01:20


In [121]:
pd.DataFrame(zip(df_tm.columns,pd.to_datetime(df_tm.loc['max'].values, unit='s')))

Unnamed: 0,0,1
0,timestamp,2020-02-12 23:59:59
1,engaged_account_creation_time,2020-01-13 21:09:49
2,engaging_account_creation_time,2020-01-13 23:55:53
3,reply_engagement_timestamp,2020-02-12 23:59:04
4,retweet_engagement_timestamp,2020-02-12 23:59:53
5,retweet_with_comment_engagement_timestamp,2020-02-12 23:49:25
6,like_engagement_timestamp,2020-02-12 23:59:58


In [122]:
sorted(df.engaged_account_creation_time)[:5]

[-1468800, 1141505414, 1141505414, 1141506095, 1145053062]

In [123]:
#prep
df.engaged_account_creation_time[df.engaged_account_creation_time<0]=pd.na

AttributeError: module 'pandas' has no attribute 'na'

## categorical

In [None]:
for col in [
    
    
'hashtags',
'tweet_id',
'present_media',
'present_links',
'present_domains',
'tweet_type',
'language',

'engaged_user_id',
'engaged_follower_count',
'engaged_following_count',
'engaged_is_verified',

'engaging_user_id',
'engaging_follower_count',
'engaging_following_count',
'engaging_is_verified',

'engagee_follows_engager',
    
]+cols_tgt:
    display(col, df[col].value_counts(dropna=False))

In [None]:
unq_twt_typs=df.tweet_type.unique()

In [None]:
for col in cols_tgt:
    for typ in unq_twt_typs:
#         display(df[col][df['Tweet type']==typ].value_counts())
        print(col,typ,df[col][df.tweet_type==typ].mean())

In [None]:
print(df.present_media.notna().mean())
for col in cols_tgt:
    print(col, df[col][df.present_media.notna()].mean(), 
          df[col][df.present_media.isna()].mean())

# val.tsv

In [43]:
dfval=dd.read_csv(f'{p_in}/val5e4.tsv',
                  sep='\x01',
               header=None,
               names=cols_val)

In [44]:
# %%time
# dfval.shape[0].compute()
# CPU times: user 2min 10s, sys: 29.3 s, total: 2min 39s
# Wall time: 1min 31s
# 15127684

In [45]:
dfval.head()

Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,timestamp,engaged_user_id,engaged_follower_count,engaged_following_count,engaged_is_verified,engaged_account_creation_time,engaging_user_id,engaging_follower_count,engaging_following_count,engaging_is_verified,engaging_account_creation_time,engagee_follows_engager
0,101\t47185\t10157\t100986\t10343\t55422\t119\t...,,7647B4E9DAF4C1D8973397DC2A04F3E3,Photo,,,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581703126,8A9AB92B775C62C4AB60DF6773A01571,13941,1216,False,1448292186,0000006C3074607050F1339DDCB890BB,27448,600,False,1520948869,True
1,101\t6006\t5086\t1939\t7418\t3601\t6406\t1913\...,,CCBFBA5AFE7EFC03102EA8D0F86C4208,Photo,,,TopLevel,22C448FF81263D4BAF2A176145EE9EAD,1581736431,187AC59639DA9A6F32F7CD118EDD58F7,476439,1478,False,1254447722,00000776B07587ECA9717BFC301F2D6E,102,659,False,1478011810,False
2,101\t56898\t137\t44851\t10317\t11490\t10112\t1...,,E18C2DCFC5AF20C650A0FD94598E69B7,Video,,,Retweet,ECED8A16BE2A5E8871FD55F4842F16B1,1582061925,82626B53CB2AD3B469E4AE06EAA9D930,367,702,False,1518708926,00000860E80C67D8C46CE57C64DE9444,230,189,False,1541013180,True
3,101\t13497\t10437\t94005\t11161\t73632\t11067\...,,26DC813FDF8546B757BB9141099F119E,,D58137F9D688C88435FD64FBAEA82B97,E91CDEC8DC7ABF30592FA024616FF970,TopLevel,ECED8A16BE2A5E8871FD55F4842F16B1,1582110043,7AFE06FF54898A1E9C716F539831849E,278,1229,False,1243548061,00000865A1538142CDA5936B07FE4311,65,165,False,1452599043,True
4,101\t24781\t10152\t42041\t38268\t10301\t10798\...,,30A33055566AAC9EB18734C4EAD11FE1,,AEF0CC9FA7B389B9A2ADF1331F00B65B,42DD9E2D4B2C0B0A71E909A6049EC2C2,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581860270,D240DACE38CA84965270C86D47D3BF40,24313527,121,True,1177506290,00000865A1538142CDA5936B07FE4311,64,164,False,1452599043,False
