In [1]:
import pandas as pd
import numpy as np
import plotly
import glob

import tqdm

In [2]:
import plotly
import plotly.graph_objs as go 
import plotly.offline as offline
import plotly.express as px

In [3]:
def read_csv_from_blob(path, suffix='*.csv', sep=',', header=0):
    """Read every file matching ``path + suffix`` and stack them into one DataFrame.

    Parameters
    ----------
    path : str
        Prefix prepended verbatim to ``suffix`` to build the glob pattern, so a
        directory path must end with a path separator.
    suffix : str, default '*.csv'
        Glob pattern appended to ``path``.
    sep : str, default ','
        Field delimiter forwarded to ``pandas.read_csv``.
    header : int or None, default 0
        Header row forwarded to ``pandas.read_csv``; use ``None`` for files
        without a header row.

    Returns
    -------
    pandas.DataFrame
        Rows of all matched files in sorted file-name order. Each file keeps
        its own row index, so index labels repeat across files. An empty
        DataFrame is returned when no file matches.
    """
    # Sort so the row order does not depend on the file system's listing order.
    list_of_files = sorted(glob.glob(path + suffix))
    print(f'Total {len(list_of_files)} files to be read.')

    # pd.concat raises ValueError on an empty list, so handle "no files" explicitly.
    if not list_of_files:
        return pd.DataFrame()

    # Parse every file first and concatenate once: concatenating inside the
    # loop re-copies all previously read rows on every iteration.
    frames = [
        pd.read_csv(file,
                    encoding='utf-8',
                    engine='python',
                    sep=sep,
                    quotechar='"',
                    header=header,
                    quoting=3)  # 3 == csv.QUOTE_NONE: quotes are kept as literal characters
        for file in tqdm.tqdm(list_of_files)
    ]
    return pd.concat(frames)

In [4]:
def searchdf(df, **criteria):
    """Return the rows of ``df`` where every given column equals its given value.

    Each keyword argument is a ``column=value`` pair; the filters are applied
    one after another, so a row must satisfy all of them. With no keyword
    arguments ``df`` itself is returned unchanged.
    """
    matched = df
    for column, wanted in criteria.items():
        keep = matched[column] == wanted
        matched = matched[keep]
    return matched

In [5]:
def remove_duplicated(df):
    """Drop fully duplicated rows from ``df`` and print a row-count sanity check.

    Prints the original row count, then a line of the form
    ``<rows> - <duplicates> = <rows after dedup> |    <check>`` where ``<check>``
    is True when the subtraction matches the de-duplicated length.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df.drop_duplicates()`` (first occurrence of each row is kept).
    """
    n_rows = len(df)
    print(n_rows)

    # Compute each expensive pass over the frame exactly once and reuse it for
    # both the report and the return value.
    n_duplicates = df.duplicated().sum()
    deduped = df.drop_duplicates()
    n_unique = len(deduped)

    print(f'    {n_rows} - {n_duplicates} = {n_unique} |    {n_rows - n_duplicates == n_unique}')
    return deduped

## Read Files

In [6]:
# Load every emotion-output CSV shard in the folder into a single frame.
df_emot_out = read_csv_from_blob('./world_cup_data/emotion-output/', suffix='*.csv')

Total 77 files to be read.


100%|█████████████████████████████████████████| 77/77 [00:19<00:00,  4.01it/s]


In [7]:
# Load every bothunter CSV shard in the folder into a single frame.
df_bothunter = read_csv_from_blob('./world_cup_data/bothunter-data/', suffix='*.csv')

Total 77 files to be read.


100%|█████████████████████████████████████████| 77/77 [00:11<00:00,  6.77it/s]


In [8]:
# Load the tab-separated morals shards; the first row of each file is used as the header.
# NOTE(review): unlike the other inputs, this folder is not under ./world_cup_data/ — confirm the path is intentional.
df_morals = read_csv_from_blob('./morals-netmapper-output/', suffix='*.tsv', sep='\t', header=0)

Total 76 files to be read.


100%|█████████████████████████████████████████| 76/76 [01:06<00:00,  1.14it/s]


In [9]:
# The location shards are read without a header row (header=None), so the two
# positional columns are given explicit names afterwards.
df_location_out = read_csv_from_blob('./world_cup_data/location-output/', suffix='*.tsv', sep='\t', header=None)
df_location_out = df_location_out.rename(columns={0: 'status_id', 1:'location'})

Total 170 files to be read.


100%|███████████████████████████████████████| 170/170 [00:18<00:00,  9.29it/s]


In [10]:
# city.txt is read as a header-less, tab-separated table; its five positional
# columns are then named so it can be joined on 'location' later.
df_latlon = pd.read_csv('./city.txt', sep='\t', header=None)
df_latlon = df_latlon.rename(columns={0:'location', 1:'region', 2: 'lat', 3: 'lon', 4: 'code'})
df_latlon

Unnamed: 0,location,region,lat,lon,code
0,bissau-11-gw,africa/bissau,11.86357,-15.59767,388028
1,chimaltenango-03-gt,america/guatemala,14.66861,-90.81667,194701
2,quetzaltenango-13-gt,america/guatemala,14.83333,-91.51667,283584
3,escuintla-06-gt,america/guatemala,14.30500,-90.78500,239247
4,guatemala city-07-gt,america/guatemala,14.64072,-90.51327,2653501
...,...,...,...,...,...
3704,khartoum-29-sd,africa/khartoum,15.55177,32.53241,3174647
3705,atbara-53-sd,africa/khartoum,17.70217,33.98638,234266
3706,kosti-41-sd,africa/khartoum,13.16290,32.66347,480349
3707,port sudan-36-sd,africa/khartoum,19.61745,37.21644,489725


## Clean Data

### Remove Duplicates

In [11]:
# Drop rows that are identical in every column; remove_duplicated also prints
# a before/after row-count check.
df_emot_out = remove_duplicated(df_emot_out)
df_emot_out

1501056
    1501056 - 30642 = 1470414 |    True


Unnamed: 0,userid,tweetid,text,anger,disgust,fear,joy,neutral,sadness,surprise
0,1486740205654757379,1.486740e+18,#WorldCup2022 Why didn´t Mexico play this way ...,0.027185,0.001112,0.557336,0.005496,0.007620,0.314508,0.086744
1,18908644,1.890864e+07,Arjantin atak üstüne atak yapıyor ama aradığı ...,0.112710,0.031122,0.138428,0.064882,0.439563,0.065871,0.147424
2,1200599170412875776,1.200599e+18,C’est pas Messi qui le rate c’est un arrêt exc...,0.041187,0.001330,0.940288,0.008016,0.002105,0.004824,0.002250
3,1183565334,1.183565e+09,مشاهدة مباراة السعودية والمكسيك بث مباشر اليوم...,0.180309,0.026082,0.291065,0.032152,0.407928,0.049636,0.012828
4,1598001705710686208,1.598002e+18,🔴Live Streaming 🏴🇵🇱Poland vs Argentina🇦🇷🏴👉 ht...,0.023232,0.002056,0.043861,0.208482,0.464637,0.060417,0.197315
...,...,...,...,...,...,...,...,...,...,...
19726,1034068444600311812,1.034068e+18,RT @q_slavic: #Polska #WorldcupQatar2022. 26.1...,0.066131,0.001013,0.907271,0.008250,0.002006,0.012841,0.002488
19727,825341312006287361,8.253413e+17,RT @SuperExet: Vote for your favourite teams f...,0.100215,0.001554,0.560896,0.161231,0.104611,0.014567,0.056926
19728,99849659,9.984966e+07,Suiza vendiste cara esa clasificación de Brasi...,0.243749,0.003676,0.658908,0.035601,0.028908,0.010284,0.018874
19729,1487871301171744771,1.487871e+18,Ganhamos mas lembrando que Tite escalou mal e ...,0.351239,0.000996,0.486202,0.122127,0.006981,0.025267,0.007188


In [12]:
# Rows whose userid already appeared earlier in the frame
# (duplicated() is already a boolean mask, so it is used directly).
df_emot_out[df_emot_out['userid'].duplicated()]

Unnamed: 0,userid,tweetid,text,anger,disgust,fear,joy,neutral,sadness,surprise
17,1598001705710686208,1.598002e+18,🔴Live Streaming 🏴🇵🇱Poland vs Argentina🇦🇷🏴👉 ht...,0.028865,0.002020,0.057799,0.219091,0.432225,0.066835,0.193164
30,1598001705710686208,1.598002e+18,🔴Live Streaming 🏴🇵🇱Poland vs Argentina🇦🇷🏴👉 ht...,0.022482,0.002011,0.040523,0.221060,0.477265,0.057408,0.179251
35,1598001705710686208,1.598002e+18,🔴Live Streaming 🏴🇵🇱Poland vs Argentina🇦🇷🏴👉 ht...,0.023643,0.002337,0.046009,0.222982,0.452679,0.058840,0.193510
42,1131397009431310343,1.131397e+18,RT @99jj88: 🔥↪️❤️,0.006397,0.002703,0.032503,0.203560,0.716024,0.022365,0.016447
47,1598001705710686208,1.598002e+18,🔴Live Streaming 🏴🇵🇱Poland vs Argentina🇦🇷🏴👉 ht...,0.022493,0.002028,0.042642,0.193485,0.472587,0.057484,0.209281
...,...,...,...,...,...,...,...,...,...,...
19724,1595371749016850432,1.595372e+18,RT @Balalee64: اداء مشرف من فريقنا الاخضر السع...,0.077918,0.002407,0.895646,0.007260,0.004893,0.011241,0.000634
19725,3044740105,3.044740e+09,#CopaDoMundo2022#WorldCup2022#WorldcupQatar202...,0.126320,0.000944,0.775281,0.064307,0.003874,0.020606,0.008668
19726,1034068444600311812,1.034068e+18,RT @q_slavic: #Polska #WorldcupQatar2022. 26.1...,0.066131,0.001013,0.907271,0.008250,0.002006,0.012841,0.002488
19729,1487871301171744771,1.487871e+18,Ganhamos mas lembrando que Tite escalou mal e ...,0.351239,0.000996,0.486202,0.122127,0.006981,0.025267,0.007188


<h4>Remarks</h4>

`df_emot_out` holds the tweets sent by each `userid`.

Each `userid` can have multiple tweets (possibly even identical ones), so the `userid` column is not unique.

In [13]:
# Drop rows that are identical in every column; remove_duplicated also prints
# a before/after row-count check.
df_bothunter = remove_duplicated(df_bothunter)
df_bothunter

1909390
    1909390 - 237 = 1909153 |    True


Unnamed: 0,status_id,user_id,screen_name,prediction,probability
0,1598040909949636635,1486740205654757379,CQuillwitch,False,0.287861
1,1598040910805467136,18908644,Alperist,True,0.693283
2,1598040911921172480,1200599170412875776,EkweFranck,False,0.480079
3,1598040915209523200,1183565334,lilo6y,False,0.204190
4,1598040917654777856,1598001705710686208,DonalRakib,True,0.878721
...,...,...,...,...,...
19995,1597289274776641536,1034068444600311812,gritgut,True,0.951746
19996,1597289275393216512,825341312006287361,Muma1970,False,0.258678
19997,1597289276114628608,99849659,Piperayo,False,0.098239
19998,1597289276806684674,1487871301171744771,mariliazuck,False,0.248117


<h4>Each tweet that a user ("user_id") passed to bothunter is identified by a "status_id" column in df_bothunter.</h4>

In [14]:
# Rows whose status_id already appeared earlier in the frame
# (duplicated() is already a boolean mask, so it is used directly).
df_bothunter[df_bothunter['status_id'].duplicated()]

Unnamed: 0,status_id,user_id,screen_name,prediction,probability
4,1596584749316771840,1266567858554679296,Maria16184112,True,0.952359
22,1596584759786082305,1574706723218427904,Q_Worldcup_2022,True,0.732655
34,1596584763682263040,1530649349629558784,FestivalMillion,True,0.634912
35,1596584767142522881,838180123723591680,edpelicer,False,0.137376
40,1596584768707010561,914302319486840832,es_Steph_,False,0.259629
...,...,...,...,...,...
738,1596585161629782016,69568167,happyy08,False,0.181187
743,1596585162422489088,3362741,sopitas,False,0.316162
757,1596585173055066112,1584080024906317827,AlmondCookiesx,True,0.611645
766,1596585179371671552,1518574182342311936,DinzedNFT,True,0.553963


In [15]:
## Collapse rows that share (status_id, user_id, screen_name) by averaging the
## remaining columns, then print how many status_ids are still duplicated.
## Note: the boolean `prediction` is averaged as well, so it becomes a float.
df_bothunter = df_bothunter.groupby(['status_id', 'user_id', 'screen_name']).mean().reset_index()
print(df_bothunter['status_id'].duplicated().sum())
df_bothunter

0


Unnamed: 0,status_id,user_id,screen_name,prediction,probability
0,1594177624171237376,149538421,raidou_survivor,0.0,0.259450
1,1594178011376455681,1376067487297601537,Nathandrake558,1.0,0.635951
2,1594178020062863360,814034558492545024,SanjayS02169973,1.0,0.788399
3,1594178104750010370,1558118121587752960,Right130813311,1.0,0.715715
4,1594178164456312832,1159168941870256129,Tatastouu,1.0,0.832714
...,...,...,...,...,...
1909036,1600875526201217024,3463820669,theressantv91,1.0,0.778049
1909037,1600875544073056256,1599400388297474049,LenaHar04634640,1.0,0.552477
1909038,1600875577380327425,1401931355554926592,SANTOSH83819519,1.0,0.842683
1909039,1600875583814406145,892232820407418880,Wata__Nao,0.0,0.229234


In [16]:
# Drop rows that are identical in every column; remove_duplicated also prints
# a before/after row-count check.
df_location_out = remove_duplicated(df_location_out)
df_location_out

4138780
    4138780 - 772 = 4138008 |    True


Unnamed: 0,status_id,location
0,1597135045692882944,SA
1,1597135046074589186,DE
2,1597135046808588288,JP
3,1597135047035084800,DE
4,1597135047701966848,FR
...,...,...
19995,1597888190236721152,SA
19996,1597888208091897858,ID
19997,1597888208020606977,PL
19998,1597888211908694016,GB


In [17]:
## Each status_id can have multiple locations (esp. if they are bots).
## Count the rows whose status_id already appeared earlier in the frame.
df_location_out['status_id'].duplicated().sum()

2069015

In [18]:
## Check how many unique locations exist for each status_id
## (fully duplicated rows were dropped above, so each row is a distinct pair).
## Typically each status_id has 2 locations - region and country.
df_location_out['status_id'].value_counts()

1596584897413795841    4
1596585081744674817    4
1596585125000744965    4
1596585005869764608    4
1596584863364251649    4
                      ..
1604034208074469376    2
1604034202819002368    2
1604034189795348480    2
1604034189820583936    2
1599746458290298887    2
Name: status_id, Length: 2068993, dtype: int64

In [19]:
# Inspect the rows of one status_id that has four location entries.
searchdf(df_location_out, status_id=1596584897413795841)

Unnamed: 0,status_id,location
117,1596584897413795841,AR
275,1596584897413795841,US
275,1596584897413795841,greenville-sc045-us
117,1596584897413795841,buenos aires-07-ar


## Location Coordinates

In [20]:
# Show the coordinate lookup table again before matching locations against it.
df_latlon

Unnamed: 0,location,region,lat,lon,code
0,bissau-11-gw,africa/bissau,11.86357,-15.59767,388028
1,chimaltenango-03-gt,america/guatemala,14.66861,-90.81667,194701
2,quetzaltenango-13-gt,america/guatemala,14.83333,-91.51667,283584
3,escuintla-06-gt,america/guatemala,14.30500,-90.78500,239247
4,guatemala city-07-gt,america/guatemala,14.64072,-90.51327,2653501
...,...,...,...,...,...
3704,khartoum-29-sd,africa/khartoum,15.55177,32.53241,3174647
3705,atbara-53-sd,africa/khartoum,17.70217,33.98638,234266
3706,kosti-41-sd,africa/khartoum,13.16290,32.66347,480349
3707,port sudan-36-sd,africa/khartoum,19.61745,37.21644,489725


In [21]:
## Locations not existing in city.txt
df_location_out.loc[~df_location_out['location'].isin(df_latlon['location']), 'location'].unique()

array(['SA', 'DE', 'JP', 'FR', 'GH', 'IT', 'MA', 'ID', 'PL', 'US', 'BE',
       'NG', 'ES', 'RS', 'PK', 'PH', 'IN', 'TR', 'MY', 'JO', 'GB', 'TH',
       'CA', 'IR', 'PT', 'AR', 'BR', 'CR', 'GR', 'QA', 'MX', 'LK', 'SG',
       'IL', 'CG', 'KR', 'VN', 'KE', 'LU', 'YE', 'ZA', 'FI', 'NL', 'BD',
       'CM', 'CH', 'CN', 'ZM', 'CU', 'IS', 'AU', 'KH', 'IE', 'UA', 'AT',
       'EG', 'DK', 'AE', 'KZ', 'CD', 'NP', 'TN', 'AL', 'RU', 'SN', 'SE',
       'CY', 'HK', 'LB', 'AZ', 'SI', 'RO', 'BG', 'KW', 'OM', 'RW', 'HU',
       'BY', 'BJ', 'UG', 'TW', 'AF', 'TZ', 'IQ', 'ZW', 'EC', 'HR', 'VE',
       'BA', 'JM', 'BO', 'CL', 'BW', 'SY', 'PE', 'DZ', 'CW', 'UY', 'DO',
       'CI', 'CO', 'ET', 'GI', 'MK', 'BH', 'LV', 'GA', 'AD', 'PG', 'DJ',
       'GE', 'MV', 'SO', 'ML', 'GQ', 'KG', 'LA', 'SD', 'PY', 'GT', 'TG',
       'LY', 'MN', 'MD', 'MT', 'PA', 'TJ', 'TL', 'NZ', 'LT', 'MW', 'UZ',
       'NO', 'RE', 'SV', 'AO', 'GN', 'EE', 'CZ', 'XK', 'IM', 'HT', 'NI',
       'MP', 'SZ', 'ME', 'TT', 'MM', 'BB', 'SK', 'H

In [22]:
# Keep only the locations known to city.txt and attach their coordinates.
df_location_latlon = (
    df_location_out[df_location_out['location'].isin(df_latlon['location'])]
    .merge(df_latlon, on='location')
)
df_location_latlon

Unnamed: 0,status_id,location,region,lat,lon,code
0,1604519793281085440,tokyo-40-jp,asia/tokyo,35.68950,139.69171,13126393
1,1604519820334366722,tokyo-40-jp,asia/tokyo,35.68950,139.69171,13126393
2,1604519820523106304,tokyo-40-jp,asia/tokyo,35.68950,139.69171,13126393
3,1604519821130928129,tokyo-40-jp,asia/tokyo,35.68950,139.69171,13126393
4,1604519824541245441,tokyo-40-jp,asia/tokyo,35.68950,139.69171,13126393
...,...,...,...,...,...,...
2069003,1594273027298148352,troitsk-13-ru,asia/yekaterinburg,54.09790,61.57730,121538
2069004,1599412673816711169,smolensk-69-ru,europe/moscow,54.78180,32.04010,320991
2069005,1599435234814365699,jhalawar-24-in,asia/kolkata,24.59676,76.16503,176133
2069006,1599445216674516992,biysk-04-ru,asia/omsk,52.53639,85.20722,215430


In [23]:
# Preview the raw morals frame as loaded.
df_morals

Unnamed: 0,twitter_id,Author,Date,concept count,# thesaurus replacements,reading difficulty,named entity,abusive,exclusive,poweranger,...,# job search,# management frame,# health frame,# political frame,# legal frame,# home-life,# cultural frame,# security and defense frame,# capacity frame,# crime and punishment frame
0,1597317398516072450,1496198690188500994,Mon Nov 28 19:51:59 +0000 2022,20.0,16.0,0.098485,2.0,,,,...,,,1.0,,,,,,,
1,1597317406577561600,1479077399446929409,Mon Nov 28 19:52:01 +0000 2022,14.0,10.0,0.190476,1.0,,,,...,,,,,,,,,,
2,1597317407810277377,1307950338993987589,Mon Nov 28 19:52:01 +0000 2022,26.0,8.0,0.060606,3.0,,,,...,,,,,,,,,,
3,1597317409513144320,3878477773,Mon Nov 28 19:52:01 +0000 2022,10.0,7.0,0.260870,6.0,,,,...,,,,,,,,,,
4,1597317410184638464,1589883002770915328,Mon Nov 28 19:52:01 +0000 2022,20.0,11.0,0.094340,4.0,,,,...,,1.0,,1.0,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1596945369971445760,1579998556026724352,Sun Nov 27 19:13:40 +0000 2022,22.0,,0.384058,,,,,...,,,,,,,,,,
19996,1596945370189553664,1425018242,Sun Nov 27 19:13:40 +0000 2022,22.0,14.0,0.079710,6.0,,,,...,,,,,,,,,,
19997,1596945370936119297,1593899630152286208,Sun Nov 27 19:13:40 +0000 2022,35.0,14.0,0.103896,6.0,,,,...,,,,,,,,,,
19998,1596945371762425857,1033589901936287744,Sun Nov 27 19:13:41 +0000 2022,23.0,10.0,0.069444,3.0,,,,...,,,,,,,,,,


In [24]:
# Look up the `Author` value of the first morals row in bothunter's `user_id`
# column to check whether the two columns share the same id space.
searchdf(df_bothunter, user_id=1496198690188500994)

Unnamed: 0,status_id,user_id,screen_name,prediction,probability
789199,1597317398516072450,1496198690188500994,dawiddyrda87,0.0,0.498821
1320740,1598714733875576832,1496198690188500994,dawiddyrda87,0.0,0.496803
1607086,1599740313894912000,1496198690188500994,dawiddyrda87,1.0,0.595322


So the `Author` column in `df_morals` is the same as the `user_id` column in `df_bothunter`.

In [25]:
# List every column name in the morals frame.
list(df_morals.columns)

['twitter_id',
 'Author',
 'Date',
 'concept count',
 '# thesaurus replacements',
 'reading difficulty',
 'named entity',
 'abusive',
 'exclusive',
 'poweranger',
 'powerencourage',
 'powerfear',
 'powerforbidden',
 'powergreed',
 'powerlust',
 'powersafety',
 'absolutist',
 'equivocal',
 'connective',
 'positive',
 'negative',
 '1st person',
 '2nd person',
 '3rd person',
 'pronoun#',
 'numbers',
 'multi-punctuation',
 '# all caps',
 'avg sentence length',
 '# sentences',
 'avg word length',
 'inclusive',
 '# exclamation points',
 '# question marks',
 'is in all caps',
 '# happy emots/emojis',
 '# sad emots/emojis',
 '# angry emots/emojis',
 '# embarrased emots/emojis',
 '# symbol concepts',
 '# identities',
 '# positive emoticons',
 '# positive emoji',
 '# neutral emoticons',
 '# neutral emoji',
 '# negative emoticons',
 '# negative emoji',
 'Avg positive affect mean',
 'Avg negative affect mean',
 '# family',
 '# political',
 '# gender',
 '# religion',
 '# race/nationality',
 '# job'

In [26]:
## Keep the id/date columns plus the moral-value columns, i.e. those whose
## names start with '# mv'.
df_morals = df_morals[['Author', 'twitter_id', 'Date', *df_morals.filter(regex='^# mv', axis=1).columns]]
df_morals

Unnamed: 0,Author,twitter_id,Date,# mv_care_virtue,# mv_care_vice_harm,# mv_fairness_virtue,# mv_fairness_vice_cheating,# mv_loyalty_virtue,# mv_loyalty_vice_betrayal,# mv_authority_virtue,# mv_authority_vice_subversion,# mv_sanctity_virtue,# mv_sanctity_vice_degradation,# mv_liberty_virtue,# mv_liberty_vice_oppression
0,1496198690188500994,1597317398516072450,Mon Nov 28 19:51:59 +0000 2022,,,,,,,,,,,,
1,1479077399446929409,1597317406577561600,Mon Nov 28 19:52:01 +0000 2022,,,,,1.0,,,,,,,
2,1307950338993987589,1597317407810277377,Mon Nov 28 19:52:01 +0000 2022,,,,,,,,,,,,
3,3878477773,1597317409513144320,Mon Nov 28 19:52:01 +0000 2022,,,,,,,,,,,,
4,1589883002770915328,1597317410184638464,Mon Nov 28 19:52:01 +0000 2022,,,,,1.0,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1579998556026724352,1596945369971445760,Sun Nov 27 19:13:40 +0000 2022,,,,,,,,,,,,
19996,1425018242,1596945370189553664,Sun Nov 27 19:13:40 +0000 2022,,,,,,,,,,,,
19997,1593899630152286208,1596945370936119297,Sun Nov 27 19:13:40 +0000 2022,,,,,,,,,,,,
19998,1033589901936287744,1596945371762425857,Sun Nov 27 19:13:41 +0000 2022,,,,,,,,,,,,


In [27]:
# Replace every NaN with 0 before summarising
# (presumably a missing count means "not observed" — TODO confirm).
df_morals = df_morals.fillna(value=0)
df_morals.describe()

Unnamed: 0,Author,twitter_id,# mv_care_virtue,# mv_care_vice_harm,# mv_fairness_virtue,# mv_fairness_vice_cheating,# mv_loyalty_virtue,# mv_loyalty_vice_betrayal,# mv_authority_virtue,# mv_authority_vice_subversion,# mv_sanctity_virtue,# mv_sanctity_vice_degradation,# mv_liberty_virtue,# mv_liberty_vice_oppression
count,1520000.0,1520000.0,1520000.0,1520000.0,1520000.0,1520000.0,1520000.0,1520000.0,1520000.0,1520000.0,1520000.0,1520000.0,1520000.0,1520000.0
mean,9.575819e+17,1.598382e+18,0.0631,0.05743684,0.01563487,0.01122829,0.1771704,0.001576974,0.02687697,0.009836842,0.02537434,0.01117763,0.02749079,0.01796908
std,6.626935e+17,1201532000000000.0,0.2598431,0.3019908,0.1337404,0.1117486,0.4305408,0.04082393,0.1777547,0.107155,0.2023259,0.1212438,0.1717658,0.1417889
min,1984.0,1.596585e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2662943000.0,1.597289e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.287821e+18,1.598348e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.532693e+18,1.599449e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.600871e+18,1.600876e+18,7.0,7.0,6.0,20.0,10.0,3.0,5.0,8.0,13.0,10.0,7.0,5.0


In [28]:
# Average the moral-value columns over rows that share the same
# (Author, twitter_id, Date), then restore those keys as ordinary columns.
df_morals_grp = df_morals.groupby(['Author', 'twitter_id', 'Date']).mean().reset_index()
df_morals_grp

Unnamed: 0,Author,twitter_id,Date,# mv_care_virtue,# mv_care_vice_harm,# mv_fairness_virtue,# mv_fairness_vice_cheating,# mv_loyalty_virtue,# mv_loyalty_vice_betrayal,# mv_authority_virtue,# mv_authority_vice_subversion,# mv_sanctity_virtue,# mv_sanctity_vice_degradation,# mv_liberty_virtue,# mv_liberty_vice_oppression
0,1984,1596918541261111296,Sun Nov 27 17:27:04 +0000 2022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1984,1596919256826142722,Sun Nov 27 17:29:54 +0000 2022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1984,1596919785304256513,Sun Nov 27 17:32:00 +0000 2022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2360,1599158886006927361,Sat Dec 03 21:49:24 +0000 2022,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,2638,1600186089754755072,Tue Dec 06 17:51:08 +0000 2022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1519834,1600870000306290694,1600871536726249472,Thu Dec 08 15:14:51 +0000 2022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1519835,1600870006556078080,1600870723895296001,Thu Dec 08 15:11:37 +0000 2022,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1519836,1600870378334822401,1600874716461342721,Thu Dec 08 15:27:29 +0000 2022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1519837,1600870668521902082,1600874052658221056,Thu Dec 08 15:24:51 +0000 2022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save Cleaned Data

In [None]:
# df_bothunter.to_csv('./02_cleaned/bothunter.csv')

In [None]:
# df_emot_out.to_csv('./02_cleaned/emotion.csv')

In [None]:
# df_location_latlon.to_csv('./02_cleaned/location.csv')

In [None]:
# Write the grouped morals frame to the cleaned-data folder.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default; pass index=False if that column is not wanted — confirm with consumers.
df_morals_grp.to_csv('./02_cleaned/morals.csv')

In [None]:
# Count rows per location and attach each location's coordinates from df_latlon.
# NOTE(review): the name of the count column depends on the pandas version
# (0 before pandas 2.0, 'count' from 2.0 on) — code reading it by name must allow for both.
df_location_freq = pd.DataFrame(df_location_latlon.value_counts('location')).merge(df_latlon, on='location')
df_location_freq

In [None]:
# Plot one marker per location, sized by how many rows mention it.
# The count column produced by value_counts() is named 0 on pandas < 2.0 and
# 'count' on pandas >= 2.0, so resolve its name instead of hard-coding 0.
location_count_col = 'count' if 'count' in df_location_freq.columns else 0
px.scatter_geo(df_location_freq,
               lat=df_location_freq['lat'],
               lon=df_location_freq['lon'],
               hover_name=df_location_freq['region'],
#                projection='natural earth',
               size=df_location_freq[location_count_col]
              )

In [None]:
# Map only the first ten rows of the location-frequency table.
px.scatter_mapbox(df_location_freq.iloc[:10],
                  lat=df_location_freq['lat'].iloc[:10],
                  lon=df_location_freq['lon'].iloc[:10],
                  hover_name=df_location_freq['region'].iloc[:10],
                  mapbox_style="carto-positron")

## Merge Data

In [None]:
# Preview the de-duplicated emotion frame before merging.
df_emot_out

In [None]:
# Look up the bothunter rows for a userid that appears in df_emot_out.
searchdf(df_bothunter, user_id=18908644)

In [None]:
# FIXME: DataFrame.merge() requires a `right` frame, so this call raises TypeError as written.
# NOTE(review): the intended join partner and key column(s) are not specified yet — confirm before completing.
df_emot_out.merge()