# setup

In [5]:
import torch
from transformers import *

import pandas as pd
pd.set_option('display.max_columns', 500)
import dask.dataframe as dd

HOME='/Users/yang.zhang/git/recsys20/'
p_in=f'{HOME}/data'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
# !head -100000 {p_in}/training.tsv > {p_in}/trn1e5.tsv
# !head -10000 {p_in}/val.tsv > {p_in}/val1e4.tsv

    (rcss20) ➜  recsys20 git:(master) ✗ wc -l data/training.tsv
     148,075,238 data/training.tsv
    (rcss20) ➜  recsys20 git:(master) ✗ wc -l data/val.tsv
     15,127,684 data/val.tsv

# training.tsv

In [7]:
cols=[
'Text tokens',
'Hashtags',
'Tweet id',
'Present media',
'Present links',
'Present domains',
'Tweet type',
'Language',
'Timestamp',
    
'Engaged User id',
'Engaged Follower count',
'Engaged Following count',
'Engaged Is verified?',
'Engaged Account creation time',
    
'Engaging User id',
'Engaging Follower count',
'Engaging Following count',
'Engaging Is verified?',
'Engaging Account creation time',
    
'Engagee follows engager?',
'Reply engagement timestamp',
'Retweet engagement timestamp',
'Retweet with comment engagement timestamp',
'Like engagement timestamp',
]

cols_val=cols[:-4]

cols_time=[
'Timestamp',
'Engaged Account creation time',
'Engaging Account creation time',
'Reply engagement timestamp',
'Retweet engagement timestamp',
'Retweet with comment engagement timestamp',
'Like engagement timestamp',
]

In [18]:
# #https://docs.dask.org/en/latest/dataframe.html
# df=dd.read_csv(f'{p_in}/training10000.tsv',
#                sep='\x01',
#                header=None,
#                names=cols,)

df=pd.read_csv(f'{p_in}/trn1e5.tsv',
               sep='\x01',
               header=None,
               names=cols,)

In [19]:
df['did_rply']=~df['Reply engagement timestamp'].isna()
df['did_rtwt']=~df['Retweet engagement timestamp'].isna()
df['did_cmmt']=~df['Retweet with comment engagement timestamp'].isna()
df['did_like']=~df['Like engagement timestamp'].isna()

In [81]:
cols_tgt=['did_rply',
'did_rtwt',
'did_cmmt',
'did_like',]

In [20]:
pd.to_datetime(df[cols_time[0]].head(), unit='s')

0   2020-02-09 14:22:24
1   2020-02-06 19:55:10
2   2020-02-09 10:41:06
3   2020-02-09 12:08:44
4   2020-02-06 03:13:25
Name: Timestamp, dtype: datetime64[ns]

In [21]:
df['toks']=df['Text tokens'].apply(lambda x: x.split('\t'))

In [70]:
" ".join(tokenizer.convert_ids_to_tokens(df.sample().toks.values[0]))

'[CLS] [UNK] [unused132] [unused182] [UNK] [UNK] bicycle [unused126] ##ort [unused126] [unused141] bothered calgary imaginary [unused114] [unused114] blossom triumph ##cu longtime [UNK] [unused182] imaginary [unused126] responding [unused126] [unused115] [unused115] [unused183] [unused114] playwright [unused115] [unused182] lacked battalions ##ila pepper syn intimate ##worthy ##ila [SEP]'

In [83]:
for col in [
'Present media',
'Present links',
'Present domains',
'Tweet type',
'Language',
'Engaged Follower count',
'Engaged Following count',
'Engaged Is verified?',
'Engaging Follower count',
'Engaging Following count',
'Engaging Is verified?',
'Engagee follows engager?',
]+cols_tgt:
    display(col, df[col].value_counts())

'Present media'

Photo                         18397
Video                         10851
Photo\tPhoto                   4125
Photo\tPhoto\tPhoto\tPhoto     2317
Photo\tPhoto\tPhoto            1321
GIF                            1123
Video\tVideo                      1
Name: Present media, dtype: int64

'Present links'

E522B6C629B01DA6B29DE65BF5DFA55B    16
6DDBD9DAAA2AE38B8EBB54BA8E7098DD    11
270B33AB02B6FB3F60B748CF1CB9AB75    10
84868AAB78F7ECF88BBB70D231D816B0     9
F804191E9F4429E30AFCB1012FC0E660     9
                                    ..
FDDC7184687805CCAA3A71DA89ED1839     1
EE8758F49905035D59CD9FCAF9FA6C5B     1
504F2EA067B133D68D9294363B23676D     1
0BD13777A2878730CA37E1F5AC3DB208     1
98723BEA553A88580C165DA2D153953F     1
Name: Present links, Length: 12810, dtype: int64

'Present domains'

3896E26D12C903F0A00B6B1BE9A9BEA3                                      1234
E91CDEC8DC7ABF30592FA024616FF970                                       839
FECA6F2E8244F2294BD2CE957C0602A9                                       591
FCA72262B99BB78F010B25C882A0C93C                                       303
A32007FFB62B2DFC4E449BD37F985C53                                       192
                                                                      ... 
5271968067C925CEC2863D887E8A9CA5                                         1
2F76E77135D2A3E5AD81942318507A59\t9C61A6B76D9005A54B0C8A61A2601F46       1
8529331982F96BFE71A559BA33582787                                         1
3AAD753B1909E2A1E390764603B099A9                                         1
F5D4AC337C972C4FE2DCDC6827073AE4                                         1
Name: Present domains, Length: 4669, dtype: int64

'Tweet type'

TopLevel    56862
Retweet     34292
Quote        8846
Name: Tweet type, dtype: int64

'Language'

D3164C7FBCF2565DDF915B1B3AEFB1DC    41179
22C448FF81263D4BAF2A176145EE9EAD    16127
06D61DCBBE938971E1EA0C38BD9B5446     8887
ECED8A16BE2A5E8871FD55F4842F16B1     6895
B9175601E87101A984A50F8A62A1C374     5348
4DC22C3F31C5C43721E6B5815A595ED6     3288
167115458A0DBDFF7E9C0C53A83BAC9B     3161
125C57F4FA6D4E110983FB11B52EFD4E     2410
022EC308651FACB02794A8147AEE1B78     2361
FA3F382BC409C271E3D6EAF8BE4648DD     2152
9BF3403E0EB7EA8A256DA9019C0B0716     1942
2996EB2FE8162C076D070A4C8D6532CD      916
975B38F44D65EE42A547283787FF5A21      862
3E16B11B7ADE3A22DDFC4423FBCEAD5D      568
3820C29CBCA409A33BADF68852057C4A      515
FF60A88F53E63000266F8B9149E35AD9      482
717293301FE296B0B61950D041485825      439
AEF22666801F0A5846D853B9CEB2E327      258
9ECD42BC079C20F156F53CB3B99E600E      250
76B8A9C3013AE6414A3E6012413CDC3B      234
190BA7DA361BC06BC1D7E824C378064D      230
48236EC80FDDDFADE99420ABC9210DDF      173
1FFD2FE4297F5E70EBC6C3230D95CB9C      173
A0C7021AD8299ADF0C9EBE326C115F6F  

'Engaged Follower count'

71924831    132
8081340     112
23818306     99
2178700      68
120          60
           ... 
11444         1
1554347       1
495532        1
237486        1
105546        1
Name: Engaged Follower count, Length: 44542, dtype: int64

'Engaged Following count'

0         994
1         703
3         459
2         385
47        383
         ... 
20500       1
6173        1
6587        1
174155      1
43820       1
Name: Engaged Following count, Length: 11354, dtype: int64

'Engaged Is verified?'

False    75083
True     24917
Name: Engaged Is verified?, dtype: int64

'Engaging Follower count'

0        1195
1        1099
2        1034
3        1000
4         900
         ... 
2018        1
3322        1
13527       1
4003        1
6075        1
Name: Engaging Follower count, Length: 4891, dtype: int64

'Engaging Following count'

80       295
106      272
109      261
75       259
101      255
        ... 
25489      1
5087       1
3164       1
11416      1
2049       1
Name: Engaging Following count, Length: 4242, dtype: int64

'Engaging Is verified?'

False    99794
True       206
Name: Engaging Is verified?, dtype: int64

'Engagee follows engager?'

False    64044
True     35956
Name: Engagee follows engager?, dtype: int64

'did_rply'

False    97246
True      2754
Name: did_rply, dtype: int64

'did_rtwt'

False    88789
True     11211
Name: did_rtwt, dtype: int64

'did_cmmt'

False    99199
True       801
Name: did_cmmt, dtype: int64

'did_like'

False    56071
True     43929
Name: did_like, dtype: int64

## val.tsv

In [76]:
dfval=dd.read_csv(f'{p_in}/val1e4.tsv',
                  sep='\x01',
               header=None,
               names=cols_val)

In [77]:
# %%time
# dfval.shape[0].compute()
# CPU times: user 2min 10s, sys: 29.3 s, total: 2min 39s
# Wall time: 1min 31s
# 15127684

In [78]:
dfval.head()

Unnamed: 0,Text tokens,Hashtags,Tweet id,Present media,Present links,Present domains,Tweet type,Language,Timestamp,Engaged User id,Engaged Follower count,Engaged Following count,Engaged Is verified?,Engaged Account creation time,Engaging User id,Engaging Follower count,Engaging Following count,Engaging Is verified?,Engaging Account creation time,Engagee follows engager?
0,101\t47185\t10157\t100986\t10343\t55422\t119\t...,,7647B4E9DAF4C1D8973397DC2A04F3E3,Photo,,,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581703126,8A9AB92B775C62C4AB60DF6773A01571,13941,1216,False,1448292186,0000006C3074607050F1339DDCB890BB,27448,600,False,1520948869,True
1,101\t6006\t5086\t1939\t7418\t3601\t6406\t1913\...,,CCBFBA5AFE7EFC03102EA8D0F86C4208,Photo,,,TopLevel,22C448FF81263D4BAF2A176145EE9EAD,1581736431,187AC59639DA9A6F32F7CD118EDD58F7,476439,1478,False,1254447722,00000776B07587ECA9717BFC301F2D6E,102,659,False,1478011810,False
2,101\t56898\t137\t44851\t10317\t11490\t10112\t1...,,E18C2DCFC5AF20C650A0FD94598E69B7,Video,,,Retweet,ECED8A16BE2A5E8871FD55F4842F16B1,1582061925,82626B53CB2AD3B469E4AE06EAA9D930,367,702,False,1518708926,00000860E80C67D8C46CE57C64DE9444,230,189,False,1541013180,True
3,101\t13497\t10437\t94005\t11161\t73632\t11067\...,,26DC813FDF8546B757BB9141099F119E,,D58137F9D688C88435FD64FBAEA82B97,E91CDEC8DC7ABF30592FA024616FF970,TopLevel,ECED8A16BE2A5E8871FD55F4842F16B1,1582110043,7AFE06FF54898A1E9C716F539831849E,278,1229,False,1243548061,00000865A1538142CDA5936B07FE4311,65,165,False,1452599043,True
4,101\t24781\t10152\t42041\t38268\t10301\t10798\...,,30A33055566AAC9EB18734C4EAD11FE1,,AEF0CC9FA7B389B9A2ADF1331F00B65B,42DD9E2D4B2C0B0A71E909A6049EC2C2,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581860270,D240DACE38CA84965270C86D47D3BF40,24313527,121,True,1177506290,00000865A1538142CDA5936B07FE4311,64,164,False,1452599043,False
