In [1]:
import pandas as pd
import numpy as np
import plotly
import glob

import tqdm

In [2]:
def read_csv_from_blob(path, suffix='*.csv', sep=',', header=0):
    """Read every file matching ``path + suffix`` and stack them into one DataFrame.

    Parameters
    ----------
    path : str
        Directory prefix. It is concatenated directly with ``suffix``, so it
        must already end with a path separator (e.g. ``'./data/'``).
    suffix : str
        Glob pattern appended to ``path`` to select the files.
    sep : str
        Field delimiter forwarded to ``pandas.read_csv``.
    header : int or None
        Header row forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Rows of all matched files, stacked in sorted file-name order. Each
        file keeps its own row index, so index labels repeat across files.
        An empty DataFrame is returned when no file matches.
    """
    # glob returns files in filesystem-dependent order; sorting makes the row
    # order of the result reproducible between runs and machines.
    list_of_files = sorted(glob.glob(path + suffix))
    print(f'Total {len(list_of_files)} files to be read.')

    # pd.concat raises on an empty list, so keep the "nothing matched" result explicit.
    if not list_of_files:
        return pd.DataFrame()

    # Read everything first and concatenate once: concatenating inside the
    # loop re-copies all previously read rows for every new file.
    frames = [
        pd.read_csv(file,
                    encoding='utf-8',
                    engine='python',
                    sep=sep,
                    quotechar='"',
                    header=header,
                    quoting=3)  # 3 == csv.QUOTE_NONE
        for file in tqdm.tqdm(list_of_files)
    ]
    return pd.concat(frames)

In [3]:
def searchdf(df, **criteria):
    """Filter ``df`` down to the rows matching every ``column=value`` pair.

    Each keyword names a column and the value its cells must equal. The
    filters are applied one after another, so a returned row satisfies all
    of them. With no criteria, ``df`` itself is returned.
    """
    remaining = df
    for column in criteria:
        wanted = criteria[column]
        row_matches = remaining[column] == wanted
        remaining = remaining[row_matches]
    return remaining

In [4]:
def remove_duplicated(df):
    """Return ``df`` without fully duplicated rows and print a before/after report.

    Two lines are printed: the original row count, then
    ``<rows> - <duplicates> = <rows kept> | <check>``, where ``<check>`` is
    whether the duplicate count and the deduplicated length agree.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to deduplicate; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df.drop_duplicates()`` (first occurrence of each row is kept).
    """
    total = len(df)
    print(total)

    # Scan for duplicates once each instead of recomputing them for every
    # field of the report and again for the return value.
    n_duplicates = df.duplicated().sum()
    deduplicated = df.drop_duplicates()

    kept = len(deduplicated)
    print(f'    {total} - {n_duplicates} = {kept} |    {total - n_duplicates == kept}')
    return deduplicated

In [5]:
## Load every CSV file under ./world_cup_data/emotion-output/ into one frame.
df_emot_out = read_csv_from_blob('./world_cup_data/emotion-output/', suffix='*.csv')

Total 77 files to be read.


100%|███████████████████████████████████████████████| 77/77 [00:19<00:00,  4.02it/s]


In [6]:
## Load every CSV file under ./world_cup_data/bothunter-data/ into one frame.
df_bothunter = read_csv_from_blob('./world_cup_data/bothunter-data/', suffix='*.csv')

Total 77 files to be read.


100%|███████████████████████████████████████████████| 77/77 [00:11<00:00,  6.85it/s]


In [7]:
## Load every tab-separated file under ./world_cup_data/location-output/.
## header=None means the files are read without a header row, so the columns get integer labels.
df_location_out = read_csv_from_blob('./world_cup_data/location-output/', suffix='*.tsv', sep='\t', header=None)

Total 170 files to be read.


100%|█████████████████████████████████████████████| 170/170 [00:17<00:00,  9.51it/s]


In [8]:
## Give the two header-less location columns meaningful names.
df_location_out = df_location_out.rename(
    {0: 'status_id', 1: 'location'},
    axis='columns',
)

In [9]:
## Drop fully duplicated rows (remove_duplicated prints the before/after counts), then display the result.
df_emot_out = remove_duplicated(df_emot_out)
df_emot_out

1501056
    1501056 - 30642 = 1470414 |    True


Unnamed: 0,userid,tweetid,text,anger,disgust,fear,joy,neutral,sadness,surprise
0,1486740205654757379,1.486740e+18,#WorldCup2022 Why didn´t Mexico play this way ...,0.027185,0.001112,0.557336,0.005496,0.007620,0.314508,0.086744
1,18908644,1.890864e+07,Arjantin atak üstüne atak yapıyor ama aradığı ...,0.112710,0.031122,0.138428,0.064882,0.439563,0.065871,0.147424
2,1200599170412875776,1.200599e+18,C’est pas Messi qui le rate c’est un arrêt exc...,0.041187,0.001330,0.940288,0.008016,0.002105,0.004824,0.002250
3,1183565334,1.183565e+09,مشاهدة مباراة السعودية والمكسيك بث مباشر اليوم...,0.180309,0.026082,0.291065,0.032152,0.407928,0.049636,0.012828
4,1598001705710686208,1.598002e+18,🔴Live Streaming 🏴🇵🇱Poland vs Argentina🇦🇷🏴👉 ht...,0.023232,0.002056,0.043861,0.208482,0.464637,0.060417,0.197315
...,...,...,...,...,...,...,...,...,...,...
19726,1034068444600311812,1.034068e+18,RT @q_slavic: #Polska #WorldcupQatar2022. 26.1...,0.066131,0.001013,0.907271,0.008250,0.002006,0.012841,0.002488
19727,825341312006287361,8.253413e+17,RT @SuperExet: Vote for your favourite teams f...,0.100215,0.001554,0.560896,0.161231,0.104611,0.014567,0.056926
19728,99849659,9.984966e+07,Suiza vendiste cara esa clasificación de Brasi...,0.243749,0.003676,0.658908,0.035601,0.028908,0.010284,0.018874
19729,1487871301171744771,1.487871e+18,Ganhamos mas lembrando que Tite escalou mal e ...,0.351239,0.000996,0.486202,0.122127,0.006981,0.025267,0.007188


In [10]:
## Drop fully duplicated rows (remove_duplicated prints the before/after counts), then display the result.
df_bothunter = remove_duplicated(df_bothunter)
df_bothunter

1909390
    1909390 - 237 = 1909153 |    True


Unnamed: 0,status_id,user_id,screen_name,prediction,probability
0,1598040909949636635,1486740205654757379,CQuillwitch,False,0.287861
1,1598040910805467136,18908644,Alperist,True,0.693283
2,1598040911921172480,1200599170412875776,EkweFranck,False,0.480079
3,1598040915209523200,1183565334,lilo6y,False,0.204190
4,1598040917654777856,1598001705710686208,DonalRakib,True,0.878721
...,...,...,...,...,...
19995,1597289274776641536,1034068444600311812,gritgut,True,0.951746
19996,1597289275393216512,825341312006287361,Muma1970,False,0.258678
19997,1597289276114628608,99849659,Piperayo,False,0.098239
19998,1597289276806684674,1487871301171744771,mariliazuck,False,0.248117


In [11]:
## Drop fully duplicated rows (remove_duplicated prints the before/after counts), then display the result.
df_location_out = remove_duplicated(df_location_out)
df_location_out

4138780
    4138780 - 772 = 4138008 |    True


Unnamed: 0,status_id,location
0,1597135045692882944,SA
1,1597135046074589186,DE
2,1597135046808588288,JP
3,1597135047035084800,DE
4,1597135047701966848,FR
...,...,...
19995,1597888190236721152,SA
19996,1597888208091897858,ID
19997,1597888208020606977,PL
19998,1597888211908694016,GB


In [12]:
## Are there any bothunter rows whose status_id has no location entry?
df_bothunter[~df_bothunter['status_id'].isin(df_location_out['status_id'])]

Unnamed: 0,status_id,user_id,screen_name,prediction,probability


In [13]:
## Each tweet has a user_id, and a single user can send multiple tweets.
## Each tweet should have been passed through bothunter,
## so each user_id will have as many bothunter passes as tweets they sent.
## Each bothunter pass is identified by a status_id,
## so a single user_id can map to multiple tweets, each with its own unique status_id.


In [14]:
## All emotion rows recorded for one example userid.
searchdf(df_emot_out, userid=1486740205654757379)

Unnamed: 0,userid,tweetid,text,anger,disgust,fear,joy,neutral,sadness,surprise
0,1486740205654757379,1.48674e+18,#WorldCup2022 Why didn´t Mexico play this way ...,0.027185,0.001112,0.557336,0.005496,0.00762,0.314508,0.086744
14106,1486740205654757379,1.48674e+18,After that heart-stopping breath-taking 90 min...,0.008794,0.000632,0.893822,0.031918,0.00918,0.004004,0.05165
18288,1486740205654757379,1.48674e+18,#Worlds2022 #WorldcupQatar2022 Wait how could...,0.103646,0.000885,0.373961,0.012789,0.02786,0.068543,0.412316


In [15]:
## All bothunter rows recorded for the same example user_id.
searchdf(df_bothunter, user_id=1486740205654757379)

Unnamed: 0,status_id,user_id,screen_name,prediction,probability
0,1598040909949636635,1486740205654757379,CQuillwitch,False,0.287861
14325,1598063782559092737,1486740205654757379,CQuillwitch,False,0.287861
18469,1598039182584459265,1486740205654757379,CQuillwitch,False,0.28293


In [16]:
## All location rows recorded for one example status_id.
searchdf(df_location_out, status_id=1598040909949636635)

Unnamed: 0,status_id,location
0,1598040909949636635,mexico city-09-mx
0,1598040909949636635,MX


In [17]:
## Rows whose status_id already appeared earlier in df_bothunter
## (the first occurrence of each status_id is not listed).
df_bothunter[df_bothunter['status_id'].duplicated()]

Unnamed: 0,status_id,user_id,screen_name,prediction,probability
4,1596584749316771840,1266567858554679296,Maria16184112,True,0.952359
22,1596584759786082305,1574706723218427904,Q_Worldcup_2022,True,0.732655
34,1596584763682263040,1530649349629558784,FestivalMillion,True,0.634912
35,1596584767142522881,838180123723591680,edpelicer,False,0.137376
40,1596584768707010561,914302319486840832,es_Steph_,False,0.259629
...,...,...,...,...,...
738,1596585161629782016,69568167,happyy08,False,0.181187
743,1596585162422489088,3362741,sopitas,False,0.316162
757,1596585173055066112,1584080024906317827,AlmondCookiesx,True,0.611645
766,1596585179371671552,1518574182342311936,DinzedNFT,True,0.553963


In [18]:
## Inspect every bothunter row recorded for one of the repeated status_ids.
searchdf(df_bothunter, status_id=1596584749316771840)

Unnamed: 0,status_id,user_id,screen_name,prediction,probability
183,1596584749316771840,1266567858554679296,Maria16184112,True,0.951178
4,1596584749316771840,1266567858554679296,Maria16184112,True,0.952359


In [21]:
## Collapse repeated bothunter rows for the same (status_id, user_id, screen_name)
## into one row by averaging the remaining columns.
## NOTE(review): `prediction` looks boolean in the displayed frame, so its mean would be
## the share of True passes rather than a True/False label -- confirm that is intended.
t = df_bothunter.groupby(['status_id', 'user_id', 'screen_name']).mean().reset_index()

In [None]:
## Rows of df_emot_out that repeat an earlier row in every column.
df_emot_out[df_emot_out.duplicated()]

In [None]:
## Show all emotion rows for a single userid.
searchdf(df_emot_out, userid=1597465625579769856)

In [None]:
## Check how many distinct locations each bothunter status_id has.
## Restrict the location rows to status_ids present in df_bothunter,
## then count the distinct location values per status_id.
(
    df_location_out[df_location_out['status_id'].isin(df_bothunter['status_id'])]
    .groupby('status_id')['location']
    .nunique()
)

In [None]:
## Count the rows whose userid is missing.
df_emot_out['userid'].isna().sum()