In [2]:
import pandas as pd
import s3fs
import boto3
from io import StringIO # python3; python2: BytesIO 
from boto3.s3.transfer import TransferConfig
import torch
from transformers import *
import numpy as np
import re
from langdetect import detect
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import RegexpTokenizer

#### Divide timesamp and Language into train and validation set

In [2]:
all_features = ["Language", "tweet_timestamp", "engaging_user_id", "reply_timestamp"]

train = pd.read_csv("s3://recsys-challenge-2020/training.tsv", encoding="utf-8",
                    names = all_features, usecols= [7, 8, 14, 20], sep="\x01")
train_set_positive = train[train.reply_timestamp.notnull()]

In [3]:
first = train_set_positive[['engaging_user_id', 'Language']].drop_duplicates()
second = first.groupby(['engaging_user_id']).size().reset_index(name = 'count').\
sort_values(by = 'count', ascending = False)

In [5]:
second.to_csv('data/user_language_distribution')

In [None]:
first

In [4]:
second

Unnamed: 0,engaging_user_id,count
1075610,5926693A20E6B0C1405D7CBCB67AE153,10
1643742,884F1F0173B54377F547C5AE25B4FCD2,9
78255,067470F71433C013AAD7120AEEE2053E,9
1289556,6AE0222949209CDD6FB4C4AB001D90B2,8
2672388,DDA7175C6CA2059B5AE581E32F96471F,8
...,...,...
1047096,56C655BCCF003F2786D5F3DAA346C017,1
1047097,56C656164088429371D45985C3497FBA,1
1047098,56C65C804A94A82B8E7D4267DEA74178,1
1047099,56C66233DDE02079E661B4B330A5E094,1


In [6]:
train_sorted = train.sort_values(by=['tweet_timestamp']).reset_index()

In [7]:
n_head = 90
n_tail = 10
train_set = train_sorted.head(int(len(train_sorted)*(n_head/100)))
val_set = train_sorted.tail(int(len(train_sorted)*(n_tail/100)))

In [8]:
train_set_text = pd.read_csv('s3://recsys-challenge-2020/train_set_text.csv')
val_set_text = pd.read_csv('s3://recsys-challenge-2020/val_set_text.csv')

In [12]:
train_set_text.head()

Unnamed: 0,user_text
0,[CLS] # ENVIVO | ¡ Buenas noches! Comienza una...
1,[CLS] Celebrate Lunar New Year with the new Ti...
2,[CLS] The media tend not to feature abortion s...
3,"[CLS] porto alegre, tem novidade na agenda [UN..."
4,"[CLS] Karnataka boy, who guided ambulance duri..."


In [18]:
train_set_subset['text_ tokens'].apply(lambda x: calculate_text(x))

0    [CLS] # ENVIVO | ¡ Buenas noches! Comienza una...
1    [CLS] Celebrate Lunar New Year with the new Ti...
2    [CLS] The media tend not to feature abortion s...
Name: text_ tokens, dtype: object

In [13]:
train_set_subset = pd.read_csv('s3://recsys-challenge-2020/train_set_reply.csv', nrows = 3)

In [26]:
val_set_subset = pd.read_csv('s3://recsys-challenge-2020/val_set_reply.csv', nrows = 3)

In [14]:
train_set_subset

Unnamed: 0,index,text_ tokens,tweet_id,tweet_timestamp,engaging_user_id,reply_timestamp,reply_bool
0,19856362,101\t108\t31278\t90939\t70325\t196\t199\t71436...,66BB647424F5AD25A562F1C50D688626,1580947200,12A1AF0088C5B4FCBFA024D4A1971323,,0.0
1,70014084,101\t95026\t90141\t54922\t10287\t13567\t10169\...,1BD6E406781F48D3FB023E1212BAD6DC,1580947200,C836F432B0FC7847C2C753949C51B961,,0.0
2,45640141,101\t10117\t12518\t45415\t10472\t10114\t19072\...,7A6C40F8B6FE18A164C8A426E968867D,1580947200,77309DE08AB631D8224254D33648F287,,0.0


In [11]:
train_set.head()

Unnamed: 0,index,Language,tweet_timestamp,engaging_user_id,reply_timestamp
0,19856362,06D61DCBBE938971E1EA0C38BD9B5446,1580947200,12A1AF0088C5B4FCBFA024D4A1971323,
1,70014084,D3164C7FBCF2565DDF915B1B3AEFB1DC,1580947200,C836F432B0FC7847C2C753949C51B961,
2,45640141,D3164C7FBCF2565DDF915B1B3AEFB1DC,1580947200,77309DE08AB631D8224254D33648F287,
3,103063682,ECED8A16BE2A5E8871FD55F4842F16B1,1580947200,91363F238C79DAEE42E2A2C97A5F8E7C,
4,54351326,D3164C7FBCF2565DDF915B1B3AEFB1DC,1580947200,EFA4C5B62E097EB203F8AFDC470AEB27,


In [15]:
def calculate_text(row):
    tweet_tokens = tokenizer.decode(list(map(int, row.split('\t'))))
    return tweet_tokens

In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

In [19]:
train_lda_input = pd.concat([train_set, train_set_text], axis=1)

In [34]:
val_lda_input = pd.concat([val_set.reset_index(), val_set_text], axis=1)

In [63]:
train_lda_input.head()

pandas.core.frame.DataFrame

In [35]:
val_lda_input.head()

Unnamed: 0,level_0,index,Language,tweet_timestamp,engaging_user_id,reply_timestamp,text_ tokens,user_text
0,133267715,11908278,125C57F4FA6D4E110983FB11B52EFD4E,1581486678,A4B1828B571271073C68DA93D1465484,,101\t17713\t9405\t119152\t119002\t118823\t9531...,[CLS] SM 사옥빌딩 앞에 있는 전광판차 사진들입니다... 전광판차는 사옥 앞에...
1,133267716,54640167,125C57F4FA6D4E110983FB11B52EFD4E,1581486678,F3A4C8EB81ADF3685E8F693037808A2C,,101\t64002\t9247\t23665\t9251\t17342\t23990\t9...,[CLS] 방송 막하지 말라던 최민호는 본인이 막히기 시작하는데 • • • http...
2,133267717,102671906,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581486678,8BD011AB59AC04491687749245009C60,,101\t56898\t137\t44592\t11090\t13503\t10157\t1...,[CLS] RT @ ReporterTelly : Let's just check it...
3,133267718,54937609,975B38F44D65EE42A547283787FF5A21,1581486678,F7BFBA56274FCBBBB76BDD36D0D6495B,,101\t137\t15595\t21840\t14590\t75268\t57493\t1...,[CLS] @ Abhirupkumardu1 जय माँ [SEP]
4,133267719,38246969,125C57F4FA6D4E110983FB11B52EFD4E,1581486678,10F6E035959B9AC156CED11E0FAED566,,101\t56898\t137\t13069\t93870\t10284\t11274\t1...,[CLS] RT @ LovableBH _ 0506 : 백현이 오늘 심각하게 [UNK...


In [23]:
val_set.head()

Unnamed: 0,index,Language,tweet_timestamp,engaging_user_id,reply_timestamp
133267715,11908278,125C57F4FA6D4E110983FB11B52EFD4E,1581486678,A4B1828B571271073C68DA93D1465484,
133267716,54640167,125C57F4FA6D4E110983FB11B52EFD4E,1581486678,F3A4C8EB81ADF3685E8F693037808A2C,
133267717,102671906,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581486678,8BD011AB59AC04491687749245009C60,
133267718,54937609,975B38F44D65EE42A547283787FF5A21,1581486678,F7BFBA56274FCBBBB76BDD36D0D6495B,
133267719,38246969,125C57F4FA6D4E110983FB11B52EFD4E,1581486678,10F6E035959B9AC156CED11E0FAED566,


In [33]:
val_set.reset_index().head()

Unnamed: 0,level_0,index,Language,tweet_timestamp,engaging_user_id,reply_timestamp
0,133267715,11908278,125C57F4FA6D4E110983FB11B52EFD4E,1581486678,A4B1828B571271073C68DA93D1465484,
1,133267716,54640167,125C57F4FA6D4E110983FB11B52EFD4E,1581486678,F3A4C8EB81ADF3685E8F693037808A2C,
2,133267717,102671906,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581486678,8BD011AB59AC04491687749245009C60,
3,133267718,54937609,975B38F44D65EE42A547283787FF5A21,1581486678,F7BFBA56274FCBBBB76BDD36D0D6495B,
4,133267719,38246969,125C57F4FA6D4E110983FB11B52EFD4E,1581486678,10F6E035959B9AC156CED11E0FAED566,


In [24]:
val_set_text.head()

Unnamed: 0,text_ tokens,user_text
0,101\t17713\t9405\t119152\t119002\t118823\t9531...,[CLS] SM 사옥빌딩 앞에 있는 전광판차 사진들입니다... 전광판차는 사옥 앞에...
1,101\t64002\t9247\t23665\t9251\t17342\t23990\t9...,[CLS] 방송 막하지 말라던 최민호는 본인이 막히기 시작하는데 • • • http...
2,101\t56898\t137\t44592\t11090\t13503\t10157\t1...,[CLS] RT @ ReporterTelly : Let's just check it...
3,101\t137\t15595\t21840\t14590\t75268\t57493\t1...,[CLS] @ Abhirupkumardu1 जय माँ [SEP]
4,101\t56898\t137\t13069\t93870\t10284\t11274\t1...,[CLS] RT @ LovableBH _ 0506 : 백현이 오늘 심각하게 [UNK...


In [27]:
val_set_subset.head()

Unnamed: 0,index,text_ tokens,tweet_id,tweet_timestamp,engaging_user_id,reply_timestamp,reply_bool
0,11908278,101\t17713\t9405\t119152\t119002\t118823\t9531...,F1529D503347DA75EFFD67CC1D9DA0F9,1581486678,A4B1828B571271073C68DA93D1465484,,0.0
1,54640167,101\t64002\t9247\t23665\t9251\t17342\t23990\t9...,41D145799B12B26F5C580B46D7A0B88E,1581486678,F3A4C8EB81ADF3685E8F693037808A2C,,0.0
2,102671906,101\t56898\t137\t44592\t11090\t13503\t10157\t1...,94062C22EE1C754B558A39E206B116E4,1581486678,8BD011AB59AC04491687749245009C60,,0.0


In [29]:
len(val_set)

14807523

In [30]:
len(val_set_text)

14807523

In [25]:
train_set_text.head()

Unnamed: 0,user_text
0,[CLS] # ENVIVO | ¡ Buenas noches! Comienza una...
1,[CLS] Celebrate Lunar New Year with the new Ti...
2,[CLS] The media tend not to feature abortion s...
3,"[CLS] porto alegre, tem novidade na agenda [UN..."
4,"[CLS] Karnataka boy, who guided ambulance duri..."


In [42]:
train_lda_input.Language.value_counts()

D3164C7FBCF2565DDF915B1B3AEFB1DC    54769115
22C448FF81263D4BAF2A176145EE9EAD    21165570
06D61DCBBE938971E1EA0C38BD9B5446    12054370
ECED8A16BE2A5E8871FD55F4842F16B1     9384645
B9175601E87101A984A50F8A62A1C374     7134952
                                      ...   
4690215948DBF6872B8ED1C2BC87B17E         548
D18801336202297E6484F634CAC6592E         538
B2235C8B73239FDC5780DD132419833A         359
2E18F6F53E3CF073911AF0A93BBE5373         130
515E873C86EE1577E75FA2387B7FA59E          12
Name: Language, Length: 66, dtype: int64

In [43]:
val_lda_input.Language.value_counts()

D3164C7FBCF2565DDF915B1B3AEFB1DC    5953858
22C448FF81263D4BAF2A176145EE9EAD    2486695
06D61DCBBE938971E1EA0C38BD9B5446    1218994
ECED8A16BE2A5E8871FD55F4842F16B1     896143
B9175601E87101A984A50F8A62A1C374     749935
                                     ...   
D18801336202297E6484F634CAC6592E         76
4690215948DBF6872B8ED1C2BC87B17E         42
2E18F6F53E3CF073911AF0A93BBE5373          8
B2235C8B73239FDC5780DD132419833A          4
515E873C86EE1577E75FA2387B7FA59E          2
Name: Language, Length: 66, dtype: int64

In [44]:
import numpy as np
import lda
import lda.datasets
X = lda.datasets.load_reuters()
vocab = lda.datasets.load_reuters_vocab()
titles = lda.datasets.load_reuters_titles()

In [47]:
X

array([[1, 0, 1, ..., 0, 0, 0],
       [7, 0, 2, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0]], dtype=int32)

In [54]:
len(vocab)

4258

In [49]:
titles

('0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20',
 '1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21',
 "2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23",
 '3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25',
 '4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25',
 "5 INDIA: Mother Teresa's condition unchanged, thousands pray. CALCUTTA 1996-08-25",
 '6 INDIA: Mother Teresa shows signs of strength, blesses nuns. CALCUTTA 1996-08-26',
 "7 INDIA: Mother Teresa's condition improves, many pray. CALCUTTA, India 1996-08-25",
 '8 INDIA: Mother Teresa improves, nuns pray for "miracle". CALCUTTA 1996-08-26',
 '9 UK: Charles under fire over prospect of Queen Camilla. LONDON 1996-08-26',
 '10 UK: Britain tells Charles to forget Camilla. LONDON 1996-08-27',
 "11 COTE D'IVOIRE: FEATURE - Quiet homecoming for reprieved Ivory Coast maid. ABIDJAN 1996-08-28",


In [50]:
X.shape

(395, 4258)

In [51]:
X.sum()

84010

In [None]:
model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)

In [147]:
train_lda_input.head(100).user_text.replace('\[CLS\]','', regex=True)

0      # ENVIVO | ¡ Buenas noches! Comienza una nueva edición de # 24HorasCentral junto a Iván Núñez... [UNK] Sigue la señal en vivo # 24Play [UNK] https : / / t. co / mi3yDUaS2F https : / / t. co / Y6qU6gwdCL [SEP]                                                                                     
1      Celebrate Lunar New Year with the new Tigeress and Swift Outfits in the Item Shop now! https : / / t. co / oJIWHkCp5o [SEP]                                                                                                                                                                          
2      The media tend not to feature abortion survivors like Claire Culwell, a woman whose story has in recent years captivated the pro - life world.. https : / / t. co / COWpNipsgk [SEP]                                                                                                                 
3      porto alegre, tem novidade na agenda [UNK] 14 de março eu toco no teatro do @ sesc _ rs e 

In [165]:
def tokenize_text(row, index):
    if index % 100000 == 0:
        print(index)
    row1 = row.replace('[CLS]','')
    row2 = row1.replace('[SEP]', '')
    result = re.sub(r"http\S+", "", row2)
    tokenizer = RegexpTokenizer(r'\w+')
    token_list = tokenizer.tokenize(result)
#     token_list = wordpunct_tokenize(result)
    feature =  ' '.join([w.lower() for w in token_list])
#     print(row)
#     print(feature)
    return feature

In [None]:
# subset = train_lda_input.head(101)
# subset_subset = subset.apply(lambda x: tokenize_text(x.user_text, x.name), axis = 1)
train_lda_input['cleaned_text'] = train_lda_input.apply(lambda x: tokenize_text(x.user_text, x.name), axis = 1)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000
6900000
7000000
7100000
7200000
7300000
7400000
7500000
7600000
7700000
7800000
7900000
8000000
8100000
8200000
8300000
8400000
8500000
8600000
8700000
8800000
8900000
9000000
9100000
9200000
9300000
9400000
9500000
9600000
9700000
9800000
9900000
10000000
10100000
10200000
10300000
10400000
10500000
10600000
10700000
10800000
10900000
11000000
11100000
11200000
11300000
11400000
11500000
11600000
11700000
11800000
11900000
12000000
12100000
12200000
12300000

In [None]:
train_lda_input

In [92]:
subset_subset

0       cls envivo buenas noches comienza una nueva edición de 24horascentral junto iván núñez unk sigue la señal en vivo 24play unk co mi3yduas2f co y6qu6gwdcl sep                
1       cls celebrate lunar new year with the new tigeress and swift outfits in the item shop now co ojiwhkcp5o sep                                                                 
2       cls the media tend not to feature abortion survivors like claire culwell woman whose story has in recent years captivated the pro life world co cowpnipsgk sep              
3       cls porto alegre tem novidade na agenda unk 14 de março eu toco no teatro do sesc rs já tô muito ansiosa pra rever todo mundo ingressos aqui co ihrnvhrb1p co hyjf3917d1 sep
4       cls karnataka boy who guided ambulance during floods to be awarded on republicday co qotzvndlme co hrgzbclncv sep                                                           
                                                              ...                              

In [80]:
train_lda_input.Language.unique()

array(['06D61DCBBE938971E1EA0C38BD9B5446',
       'D3164C7FBCF2565DDF915B1B3AEFB1DC',
       'ECED8A16BE2A5E8871FD55F4842F16B1',
       '9BF3403E0EB7EA8A256DA9019C0B0716',
       '167115458A0DBDFF7E9C0C53A83BAC9B',
       'B9175601E87101A984A50F8A62A1C374',
       '22C448FF81263D4BAF2A176145EE9EAD',
       '975B38F44D65EE42A547283787FF5A21',
       '125C57F4FA6D4E110983FB11B52EFD4E',
       '022EC308651FACB02794A8147AEE1B78',
       '4DC22C3F31C5C43721E6B5815A595ED6',
       '2996EB2FE8162C076D070A4C8D6532CD',
       'FA3F382BC409C271E3D6EAF8BE4648DD',
       '1FFD2FE4297F5E70EBC6C3230D95CB9C',
       '89616CFF8EC8637092F885C7EFF43D74',
       '190BA7DA361BC06BC1D7E824C378064D',
       '4249CE88433AEA3F8DCEECF008B3CB95',
       '6431A618DCF7F4CB7F62A95A39BAB77A',
       '717293301FE296B0B61950D041485825',
       '76B8A9C3013AE6414A3E6012413CDC3B',
       '9ECD42BC079C20F156F53CB3B99E600E',
       'FF60A88F53E63000266F8B9149E35AD9',
       '3820C29CBCA409A33BADF68852057C4A',
       '3E1

In [86]:
subset = train_lda_input[train_lda_input.Language == '06D61DCBBE938971E1EA0C38BD9B5446'].head(10)

In [99]:
def language_detect(str1, str2):
    return str1,detect(str2)

In [72]:
pd.set_option('display.max_colwidth', -1)

In [114]:
language_code_name_pairs = pd.DataFrame()

In [115]:
language_code_name_pairs['code', 'name'] = group.apply(lambda x : language_detect(x.Language, x.user_text), axis = 1)

In [117]:
language_code_name_pairs

Unnamed: 0,"(code, name)"
0,"(022EC308651FACB02794A8147AEE1B78, th)"
1,"(0331BF70E606D62D92C96CE9AD71A7CF, fi)"
2,"(06BEAB41D66CCFF329D1ED8BA120A6C2, he)"
3,"(06D61DCBBE938971E1EA0C38BD9B5446, es)"
4,"(125C57F4FA6D4E110983FB11B52EFD4E, ko)"
...,...
61,"(F4FD40A716F1572C9A28E9CAA58BE3A5, sl)"
62,"(F73266A79468BB89C4325FDEDB0B533C, pa)"
63,"(FA3F382BC409C271E3D6EAF8BE4648DD, fr)"
64,"(FF60A88F53E63000266F8B9149E35AD9, en)"


In [109]:
group = train_lda_input.groupby('Language').first().reset_index()

In [110]:
group

Unnamed: 0,Language,index,tweet_timestamp,engaging_user_id,reply_timestamp,user_text
0,022EC308651FACB02794A8147AEE1B78,34048091,1580947200,D6E09F8CD4FAC1912DA101B032C0C094,1.580961e+09,[CLS] เก๊ง เก๊ง เก๊ง เก๊ง เก๊ง เก๊ง เก๊ง ( ๐๗. ๐๐ ) [SEP]
1,0331BF70E606D62D92C96CE9AD71A7CF,80920386,1580947206,5F0C0A5415807487403B49823D03A5B8,1.580948e+09,[CLS] RT @ SuomiSOS : Vainojen uhrien muistopäivä lähestyy : juutalaisten holokaustin myytti 6 miljoonasta uhrista [UNK] https : / / t. co / kZYDJMNnE2 [SEP]
2,06BEAB41D66CCFF329D1ED8BA120A6C2,101774136,1580947249,7F6876EC2E5EC0BA1A3D1F54F7242894,1.580972e+09,[CLS] RT @ ebendavid5 : יש לו גנים של ברווז. [UNK]. https : / / t. co / VSND0mOUCG [SEP]
3,06D61DCBBE938971E1EA0C38BD9B5446,19856362,1580947200,12A1AF0088C5B4FCBFA024D4A1971323,1.580952e+09,[CLS] # ENVIVO | ¡ Buenas noches! Comienza una nueva edición de # 24HorasCentral junto a Iván Núñez... [UNK] Sigue la señal en vivo # 24Play [UNK] https : / / t. co / mi3yDUaS2F https : / / t. co / Y6qU6gwdCL [SEP]
4,125C57F4FA6D4E110983FB11B52EFD4E,33433623,1580947200,CE5D706649EEA7D04F2D9BFFACEBF02E,1.580970e+09,[CLS] [UNK] 전하는 새해 복이 여러분 모두에게 [UNK]. [UNK]. 조철강 빼고 모두 새해 복 많이 받으세요 ~ ( [UNK] ). [UNK]. 매주 [ 토일 ] 밤 9시 방송. # tvN # 토일드라마 # 사랑의불시착. # 현빈 # 손예진 # 서지혜 # 김정현 https : / / t. co / RE3oGzC0Hu [SEP]
...,...,...,...,...,...,...
61,F4FD40A716F1572C9A28E9CAA58BE3A5,143141987,1580947210,BBB1EF7D99D4971ACE64CC46113C2438,1.580949e+09,[CLS] parcerita dime kiubo [SEP]
62,F73266A79468BB89C4325FDEDB0B533C,110463272,1580948140,F7A0DBE44182EBC4FBE6A5ABF737A0ED,1.581017e+09,[CLS] ਵਾਹਿਗੁਰੂ ਤੇਰਾ ਸ਼ੁਕਰ ਹੈ [UNK] https : / / t. co / JLQM0c6eqc [SEP]
63,FA3F382BC409C271E3D6EAF8BE4648DD,41360732,1580947200,3BF9546AB515D665B0500BA5197AAB6D,1.580947e+09,[CLS] Plus que 2 heures avant le retour de # cjdltv sur @ iciartv! [UNK] [SEP]
64,FF60A88F53E63000266F8B9149E35AD9,72475886,1580947201,EA4CE40574D2295485331E36383058FF,1.580953e+09,"[CLS] hrrngh.. no boobie?? no girl,..... [SEP]"
