# Create a Joint Tweetplomacy 23 Dataset
This step-by-step tutorial provides a gentle introduction to creating a joint Tweetplomacy dataset from the separate per-language files. The tutorial covers both the private and public parts of the data, but it can easily be adapted to use only the public part.

The file name suffix “_v2-0-0” indicates that we are using the dataset version that contains not only original tweets but also retweets (DOI: ). The initial version of the dataset (DOI: ) contained only the original tweets and included some minor extraction errors.

## Target Audience

This tutorial is aimed at a basic level. You should have basic knowledge of pandas and of Python programming.

## Duration

About half a work-day.

## Use Cases

The created file will be used in subsequent analyses.

## Environment Setup

Run the cells below:

In [1]:
import pandas as pd
from pandas.io.json import json_normalize

pd.options.display.float_format = '{:.4f}'.format

import itertools
from urllib.parse import urlparse

## Definition of Some Helper Functions 

In [2]:
def check_topic(matchingKeywords, topic):
    """Tell whether a tweet's matching keywords overlap with a topic.

    Parameters
    ----------
    matchingKeywords : iterable of hashable
        Keywords attached to one tweet.
    topic : iterable of hashable
        Keywords that make up the topic.

    Returns
    -------
    bool
        True when at least one keyword occurs in both collections,
        False otherwise.
    """
    shared = set(matchingKeywords).intersection(topic)
    return len(shared) > 0

In [3]:
def expand_urls(row):
    """Extract the network location (host) of every resolved URL of one tweet.

    Parameters
    ----------
    row : list of dict
        One entry of the ``urls`` column; each dict is expected to carry a
        ``'resolved'`` key holding the expanded URL.

    Returns
    -------
    list
        ``urlparse(url).netloc`` for each resolved URL, in input order. An
        empty list is returned when the entry cannot be processed, e.g. an
        empty list, a missing value, or records without a ``'resolved'`` key.
    """
    try:
        resolved = json_normalize(row)['resolved'].tolist()
        return [urlparse(url).netloc for url in resolved]
    except Exception:
        # Deliberately best effort: a malformed entry simply yields no hosts.
        # ``Exception`` (instead of a bare ``except``) lets KeyboardInterrupt
        # and SystemExit still abort a long-running ``apply`` over the frame.
        return []

In [4]:
def prepare_data(df, lang):
    """Add the derived columns used by the joint dataset to one language frame.

    The frame is modified in place and also returned, so callers may either
    ignore the return value or rebind it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame read from one per-language file. It must provide the columns
        ``userName`` and ``urls`` and, unless ``lang`` is ``'ES'``,
        ``sentiments``.
    lang : str
        Language code stored in the new ``language`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns ``language``,
        ``sentimentCompound``, ``sentimentNegative``, ``sentimentPositive``,
        ``sentimentNeutral`` (skipped for ``'ES'``), ``hashedUserName`` and
        ``plds`` added.
    """
    df['language'] = lang

    # NOTE(review): 'ES' is skipped presumably because the Spanish files carry
    # no ``sentiments`` column (the ES outputs in this notebook show none).
    if lang != 'ES':
        sentiments = json_normalize(df['sentiments'])
        # Columns are renamed by position; this assumes the flattened records
        # come out as software, version, compound, negative, positive, neutral
        # in exactly that order -- TODO confirm against the data files.
        sentiments.columns = ['software', 'version', 'sentimentCompound', 'sentimentNegative', 'sentimentPositive', 'sentimentNeutral']
        for column in ('sentimentCompound', 'sentimentNegative', 'sentimentPositive', 'sentimentNeutral'):
            # ``.values`` assigns row by row instead of aligning on index
            # labels, so a frame without a default 0..n-1 index is not
            # silently filled with NaN or shifted values.
            df[column] = sentiments[column].values

    df['hashedUserName'] = json_normalize(df.userName)['hashed'].values

    # One list of hosts per tweet, derived from its resolved URLs.
    df['plds'] = df['urls'].apply(expand_urls)

    return df

## German Tweets by Private Users

In [5]:
language = "DE"

In [6]:
# Load the private-user tweets for the language selected above. The file is
# read as JSON Lines (one record per line); convert_dates=False switches off
# pandas' automatic date parsing.
# NOTE(review): retweetId is displayed as a float column in the output below;
# ids of this magnitude exceed float64's exact-integer range (2**53), so
# verify they survive loading before matching on them.
df_private_de = pd.read_json(
    './tweetplomacy-private-23-' + language + '_v2-0-0.jsonl',
    orient='records',
    lines=True,
    convert_dates=False,
)

In [7]:
df_private_de = prepare_data(df_private_de, language)

In [8]:
df_private_de

Unnamed: 0,tweetId,timeStamp,userName,followers,followees,retweets,favorites,replies,matchingKeywords,matchingUserMentions,...,retweetId,retweetTimeStamp,retweetUserName,language,sentimentCompound,sentimentNegative,sentimentPositive,sentimentNeutral,hashedUserName,plds
0,1000057711093043200,Fri May 25 16:55:13 +0000 2018,{'hashed': 'fa8c8b30177759fd037cf4b05c03b85e'},685,400,1,1,1,[Impf],[@realDonaldTrump],...,1000059835524829184.0000,Fri May 25 17:03:39 +0000 2018,{'hashed': 'b5cb2630ebb3230c501a35465dc6cb20'},DE,0.7783,0.0820,0.2880,0.6300,fa8c8b30177759fd037cf4b05c03b85e,[twitter.com]
1,1000786044223213568,Sun May 27 17:09:21 +0000 2018,{'hashed': 'c10c09d649ad32390893080f2064c71d'},53517,752,6,18,1,[Impf],[@HeikoMaas],...,1000796925728362496.0000,Sun May 27 17:52:35 +0000 2018,{'hashed': '6f60a286cc7c18cc9ba35c674f64360e'},DE,0.1260,0.1230,0.1340,0.7430,c10c09d649ad32390893080f2064c71d,[twitter.com]
2,1000777316392493057,Sun May 27 16:34:40 +0000 2018,{'hashed': 'a018d5706d8d5204b9bd8fa2bf46385c'},1915,210,6,9,1,[Klima],[@sebastiankurz],...,1000798964160122880.0000,Sun May 27 18:00:41 +0000 2018,{'hashed': 'bf1ff5cdf6e68656e831076ff3d6d148'},DE,0.2732,0.0000,0.0580,0.9420,a018d5706d8d5204b9bd8fa2bf46385c,[]
3,1001065147295813632,Mon May 28 11:38:24 +0000 2018,{'hashed': 'e0f116b965e2adb2e44deadd216f421b'},656,890,0,0,0,[Energie],[@UN],...,,,,DE,0.4215,0.0000,0.1410,0.8590,e0f116b965e2adb2e44deadd216f421b,[]
4,1001421667330154497,Tue May 29 11:15:05 +0000 2018,{'hashed': 'c9307f56114eaeb377133c3604bfe962'},11,54,0,0,0,[Klima],[@HeikoMaas],...,,,,DE,-0.6390,0.0950,0.0000,0.9050,c9307f56114eaeb377133c3604bfe962,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32683,998559643558400000,Mon May 21 13:42:26 +0000 2018,{'hashed': 'ed0460c81a06e6e2c0c6fe25153c46c3'},709,203,3,6,2,"[Gas, Ukraine]",[@poroshenko],...,998563442096254976.0000,Mon May 21 13:57:31 +0000 2018,{'hashed': '2be665bf4b49c076d5240676cc6eb96c'},DE,0.3400,0.0000,0.1030,0.8970,ed0460c81a06e6e2c0c6fe25153c46c3,[twitter.com]
32684,998559643558400000,Mon May 21 13:42:26 +0000 2018,{'hashed': 'ed0460c81a06e6e2c0c6fe25153c46c3'},709,203,4,7,2,"[Gas, Ukraine]",[@poroshenko],...,998564578740133888.0000,Mon May 21 14:02:02 +0000 2018,{'hashed': 'a6b03f17334f8c710321000950c21b70'},DE,0.3400,0.0000,0.1030,0.8970,ed0460c81a06e6e2c0c6fe25153c46c3,[twitter.com]
32685,999245402380689408,Wed May 23 11:07:23 +0000 2018,{'hashed': 'cff6c67f13822fc86e6bc06d01d82eb7'},463,1447,0,0,0,[Impf],[@RegSprecher],...,,,,DE,0.7160,0.0560,0.1790,0.7650,cff6c67f13822fc86e6bc06d01d82eb7,[]
32686,999685330340114432,Thu May 24 16:15:30 +0000 2018,{'hashed': 'f7c524a16f5225d59ef7afb7b758b850'},4,22,0,0,0,[Ukraine],"[@HeikoMaas, @RegSprecher]",...,,,,DE,0.0000,0.0000,0.0000,1.0000,f7c524a16f5225d59ef7afb7b758b850,[de.sputniknews.com]


## English Tweets by Private Users

In [9]:
language = "EN"

In [10]:
# Load the private-user tweets for the language selected above. The file is
# read as JSON Lines (one record per line); convert_dates=False switches off
# pandas' automatic date parsing.
df_private_en = pd.read_json(
    './tweetplomacy-private-23-' + language + '_v2-0-0.jsonl',
    orient='records',
    lines=True,
    convert_dates=False,
)

In [11]:
df_private_en = prepare_data(df_private_en, language)

In [12]:
df_private_en

Unnamed: 0,tweetId,timeStamp,userName,followers,followees,retweets,favorites,replies,matchingKeywords,matchingUserMentions,...,retweetId,retweetTimeStamp,retweetUserName,language,sentimentCompound,sentimentNegative,sentimentPositive,sentimentNeutral,hashedUserName,plds
0,1000000003803353088,Fri May 25 13:05:54 +0000 2018,{'hashed': 'b51e2b351e6f30c3f43c6423ea35e7e1'},50,658,0,0,0,[gas],[@realDonaldTrump],...,,,,EN,-0.2732,0.1490,0.0000,0.8510,b51e2b351e6f30c3f43c6423ea35e7e1,[]
1,999742426188312578,Thu May 24 20:02:23 +0000 2018,{'hashed': '8649c0fb550acb231be50b5213067c7f'},427150,398,577,1031,96,[gas],[@realDonaldTrump],...,1000001551501537280.0000,Fri May 25 13:12:03 +0000 2018,{'hashed': 'e85a44a127966982834409aed2e3607d'},EN,0.2263,0.0000,0.0490,0.9510,8649c0fb550acb231be50b5213067c7f,[]
2,999998494994829314,Fri May 25 12:59:54 +0000 2018,{'hashed': '18b3f035cce9b106cb9ee2e1ac57e51f'},8980,1784,1,1,0,[energy],[@realDonaldTrump],...,1000001824127152128.0000,Fri May 25 13:13:08 +0000 2018,{'hashed': 'eaacaf630208dcf73605e55d7344778e'},EN,0.8205,0.0540,0.2080,0.7380,18b3f035cce9b106cb9ee2e1ac57e51f,[]
3,999962476278988800,Fri May 25 10:36:47 +0000 2018,{'hashed': 'ac8dc1fda2723a05cb64485644f0adb0'},9755633,2,195,595,34,[gas],[@narendramodi],...,1000004122580418560.0000,Fri May 25 13:22:16 +0000 2018,{'hashed': 'a8f72085e75d193f759e0b214ee708f4'},EN,0.5994,0.0000,0.1090,0.8910,ac8dc1fda2723a05cb64485644f0adb0,"[www.facebook.com, twitter.com]"
4,1000004143547793409,Fri May 25 13:22:21 +0000 2018,{'hashed': 'c4f6d055a97c3f8b378877ac99b5170b'},46,404,0,0,0,[gas],[@realDonaldTrump],...,,,,EN,0.0000,0.0000,0.0000,1.0000,c4f6d055a97c3f8b378877ac99b5170b,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852236,999993297111343104,Fri May 25 12:39:15 +0000 2018,{'hashed': '10526a6c2c9adc456496f52714274ba0'},7,2,0,0,0,[gas],[@realDonaldTrump],...,,,,EN,-0.9476,0.3370,0.0510,0.6120,10526a6c2c9adc456496f52714274ba0,[]
1852237,999994643478532097,Fri May 25 12:44:36 +0000 2018,{'hashed': '6ede857f423384b79d03a574471e457b'},3342,4141,0,0,0,[\black\b],[@realDonaldTrump],...,,,,EN,0.6124,0.0830,0.2640,0.6520,6ede857f423384b79d03a574471e457b,[]
1852238,999435971530842115,Wed May 23 23:44:38 +0000 2018,{'hashed': '124ad59c6bbd23e6747db108b7ae2382'},502,2723,2,0,0,"[gas, \boil\b]",[@cafreeland],...,999994706372321280.0000,Fri May 25 12:44:51 +0000 2018,{'hashed': 'c1f8729b9096db607228b3b833de3cc9'},EN,0.3400,0.0000,0.0700,0.9300,124ad59c6bbd23e6747db108b7ae2382,[]
1852239,999742426188312578,Thu May 24 20:02:23 +0000 2018,{'hashed': '8649c0fb550acb231be50b5213067c7f'},427158,398,562,1002,95,[gas],[@realDonaldTrump],...,999995125794209792.0000,Fri May 25 12:46:31 +0000 2018,{'hashed': 'afae73337c0526c016ef4733ce23932f'},EN,0.2263,0.0000,0.0490,0.9510,8649c0fb550acb231be50b5213067c7f,[]


## French Tweets by Private Users

In [13]:
language = "FR"

In [14]:
# Load the private-user tweets for the language selected above. The file is
# read as JSON Lines (one record per line); convert_dates=False switches off
# pandas' automatic date parsing.
df_private_fr = pd.read_json(
    './tweetplomacy-private-23-' + language + '_v2-0-0.jsonl',
    orient='records',
    lines=True,
    convert_dates=False,
)

In [15]:
df_private_fr = prepare_data(df_private_fr, language)

In [16]:
df_private_fr

Unnamed: 0,tweetId,timeStamp,userName,followers,followees,retweets,favorites,replies,matchingKeywords,matchingUserMentions,...,retweetId,retweetTimeStamp,retweetUserName,language,sentimentCompound,sentimentNegative,sentimentPositive,sentimentNeutral,hashedUserName,plds
0,1000038171974004736,Fri May 25 15:37:34 +0000 2018,{'hashed': '58ab89c335ce542acfc27e7ee5c39b08'},1109,1867,0,0,0,[climat],[@Elisabeth_Borne],...,,,,FR,0.0000,0.0000,0.0000,1.0000,58ab89c335ce542acfc27e7ee5c39b08,[]
1,1000056979656728576,Fri May 25 16:52:18 +0000 2018,{'hashed': '0f713e98d9703afe73b4b1333a8b90dc'},1082,505,3,3,0,[Ukraine],[@EmmanuelMacron],...,1000057432180166656.0000,Fri May 25 16:54:06 +0000 2018,{'hashed': 'daa2577e2a4a7f67b641c526f7b951fb'},FR,0.3822,0.0000,0.0690,0.9310,0f713e98d9703afe73b4b1333a8b90dc,[]
2,1000025214355460097,Fri May 25 14:46:05 +0000 2018,{'hashed': '777f2b3cff0fd8c38d71c6031ef03438'},27891,8455,10,31,5,[climat],[@Elisabeth_Borne],...,1000066810673278976.0000,Fri May 25 17:31:22 +0000 2018,{'hashed': '2f7128a524d5881b55aa564271d00177'},FR,-0.6230,0.1830,0.0420,0.7760,777f2b3cff0fd8c38d71c6031ef03438,[www.europe1.fr]
3,999889121802829829,Fri May 25 05:45:18 +0000 2018,{'hashed': '814b942ebf7558655c5bfbec2649180d'},4128,1957,76,55,8,[manque],"[@EmmanuelMacron, @realDonaldTrump]",...,1000070195447296000.0000,Fri May 25 17:44:49 +0000 2018,{'hashed': '2b33b385c5313af72a68b0c0cc0ca998'},FR,0.2120,0.1400,0.0760,0.7840,814b942ebf7558655c5bfbec2649180d,[twitter.com]
4,995826986131120128,Mon May 14 00:43:49 +0000 2018,{'hashed': '5b77ac6550a6a0df5c0c1a26a60cb6e0'},2415,2584,2,3,0,[gaz],[@UN],...,1000099899545923584.0000,Fri May 25 19:42:51 +0000 2018,{'hashed': 'd6c414ef30f6671f2607a2b7caa105c9'},FR,0.8433,0.0300,0.1840,0.7860,5b77ac6550a6a0df5c0c1a26a60cb6e0,[twitter.com]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68774,999758535771742208,Thu May 24 21:06:24 +0000 2018,{'hashed': '6f46ea044bbf8b7a7f91bb747c71767e'},2412,35,1,0,0,"[Ukraine, climat]",[@EmmanuelMacron],...,999759049431375872.0000,Thu May 24 21:08:26 +0000 2018,{'hashed': '290033594fd72b39c942b52c31fa21ba'},FR,-0.4215,0.0940,0.0310,0.8740,6f46ea044bbf8b7a7f91bb747c71767e,[www.rtbf.be]
68775,999894807605473281,Fri May 25 06:07:53 +0000 2018,{'hashed': 'fe64f3a2808713fadad834ee7a6edaf1'},527,808,1,0,1,"[gaz, manque]",[@dreynders],...,999901173418082304.0000,Fri May 25 06:33:11 +0000 2018,{'hashed': '330c91df08c13fbf7c89fb07e8d8b3a1'},FR,-0.7778,0.0780,0.0370,0.8850,fe64f3a2808713fadad834ee7a6edaf1,[twitter.com]
68776,999895697322229761,Fri May 25 06:11:26 +0000 2018,{'hashed': 'fe64f3a2808713fadad834ee7a6edaf1'},527,808,3,1,0,"[gaz, manque]",[@dreynders],...,999905803900403712.0000,Fri May 25 06:51:35 +0000 2018,{'hashed': '16fd04be2a69a0364fd372d4376a8992'},FR,-0.8426,0.0950,0.0450,0.8590,fe64f3a2808713fadad834ee7a6edaf1,[twitter.com]
68777,999889121802829829,Fri May 25 05:45:18 +0000 2018,{'hashed': '814b942ebf7558655c5bfbec2649180d'},4124,1962,21,13,0,[manque],"[@EmmanuelMacron, @realDonaldTrump]",...,999916398733279232.0000,Fri May 25 07:33:41 +0000 2018,{'hashed': '9ba33f318b801fb94007e1824d47d648'},FR,0.2120,0.1400,0.0760,0.7840,814b942ebf7558655c5bfbec2649180d,[twitter.com]


## Spanish Tweets by Private Users

In [17]:
language = "ES"

In [18]:
# Load the private-user tweets for the language selected above. The file is
# read as JSON Lines (one record per line); convert_dates=False switches off
# pandas' automatic date parsing.
df_private_es = pd.read_json(
    './tweetplomacy-private-23-' + language + '_v2-0-0.jsonl',
    orient='records',
    lines=True,
    convert_dates=False,
)

In [19]:
df_private_es = prepare_data(df_private_es, language)

## Create Private Part Dataframe

In [20]:
# Total number of rows across the four private per-language frames.
df_private_len = sum(
    len(frame)
    for frame in (df_private_de, df_private_en, df_private_fr, df_private_es)
)

In [21]:
# Sum of the per-language numbers of distinct tweetIds. Comparing it with the
# row total shows how many rows repeat a tweetId (the frames above list the
# same tweetId once per retweetId).
sum(
    len(frame.tweetId.unique())
    for frame in (df_private_de, df_private_en, df_private_fr, df_private_es)
)

1942425

In [22]:
df_private_len

3016984

In [23]:
# Stack the four per-language frames into one private dataset.
# A single pd.concat replaces the chained DataFrame.append calls: append is
# deprecated (and removed in pandas 2.0) and each chained call copied the
# accumulated data again. The frames do not share the same columns (the ES
# frame has no sentiment columns), so sort=True is passed explicitly: it
# keeps the alphabetical column order append produced and avoids the
# "sort" FutureWarning. Original row labels are kept, as before.
df_private = pd.concat(
    [df_private_de, df_private_en, df_private_fr, df_private_es],
    sort=True,
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [24]:
df_private

Unnamed: 0,entities,favorites,followees,followers,hashedUserName,hashtags,language,matchingKeywords,matchingUserMentions,matchingUserName,...,retweets,sentimentCompound,sentimentNegative,sentimentNeutral,sentimentPositive,sentiments,timeStamp,tweetId,urls,userName
0,"{'software': 'entity-fishing', 'version': '0.0...",1,400,685,fa8c8b30177759fd037cf4b05c03b85e,[],DE,[Impf],[@realDonaldTrump],,...,1,0.7783,0.0820,0.6300,0.2880,"{'software': 'vaderSentiment', 'version': '3.3...",Fri May 25 16:55:13 +0000 2018,1000057711093043200,"[{'short': 'https://t.co/PZqrqME1yx', 'resolve...",{'hashed': 'fa8c8b30177759fd037cf4b05c03b85e'}
1,"{'software': 'entity-fishing', 'version': '0.0...",18,752,53517,c10c09d649ad32390893080f2064c71d,[Russland-Politik],DE,[Impf],[@HeikoMaas],,...,6,0.1260,0.1230,0.7430,0.1340,"{'software': 'vaderSentiment', 'version': '3.3...",Sun May 27 17:09:21 +0000 2018,1000786044223213568,"[{'short': 'https://t.co/35qiJ6R7a2', 'resolve...",{'hashed': 'c10c09d649ad32390893080f2064c71d'}
2,"{'software': 'entity-fishing', 'version': '0.0...",9,210,1915,a018d5706d8d5204b9bd8fa2bf46385c,"[Klimastrategie, Regierungsklausur]",DE,[Klima],[@sebastiankurz],,...,6,0.2732,0.0000,0.9420,0.0580,"{'software': 'vaderSentiment', 'version': '3.3...",Sun May 27 16:34:40 +0000 2018,1000777316392493057,[],{'hashed': 'a018d5706d8d5204b9bd8fa2bf46385c'}
3,"{'software': 'entity-fishing', 'version': '0.0...",0,890,656,e0f116b965e2adb2e44deadd216f421b,[],DE,[Energie],[@UN],,...,0,0.4215,0.0000,0.8590,0.1410,"{'software': 'vaderSentiment', 'version': '3.3...",Mon May 28 11:38:24 +0000 2018,1001065147295813632,[],{'hashed': 'e0f116b965e2adb2e44deadd216f421b'}
4,"{'software': 'entity-fishing', 'version': '0.0...",0,54,11,c9307f56114eaeb377133c3604bfe962,[],DE,[Klima],[@HeikoMaas],,...,0,-0.6390,0.0950,0.9050,0.0000,"{'software': 'vaderSentiment', 'version': '3.3...",Tue May 29 11:15:05 +0000 2018,1001421667330154497,[],{'hashed': 'c9307f56114eaeb377133c3604bfe962'}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063271,"{'software': 'entity-fishing', 'version': '0.0...",133,1624,8401,27c6a4a1b0f00224690766c56f71034d,[ElDebate],ES,[gas],[@petrogustavo],,...,59,,,,,,Fri May 25 02:56:17 +0000 2018,999846586674016257,[],{'hashed': '27c6a4a1b0f00224690766c56f71034d'}
1063272,"{'software': 'entity-fishing', 'version': '0.0...",4509,1759,36527,9ecc98199bbd5833682f073f0d3c014d,[],ES,[gas],"[@IvanDuque, @petrogustavo]",,...,2147,,,,,,Fri May 25 01:30:19 +0000 2018,999824952609181697,[],{'hashed': '9ecc98199bbd5833682f073f0d3c014d'}
1063273,"{'software': 'entity-fishing', 'version': '0.0...",88,69,781625,8c300459233cd3c8d0649444ada0525b,"[ElDebate, VargasLlerasPresidente]",ES,[gas],[@petrogustavo],,...,65,,,,,,Fri May 25 01:20:51 +0000 2018,999822572333293568,[],{'hashed': '8c300459233cd3c8d0649444ada0525b'}
1063274,"{'software': 'entity-fishing', 'version': '0.0...",3,413,18250,ebfa86a7ce6c4d2fa46c45414dcdd3d1,[DuqueSeGanóMiVoto],ES,[falta],[@IvanDuque],,...,8,,,,,,Fri May 25 12:54:30 +0000 2018,999997132634509313,"[{'short': 'https://t.co/UYKkKVHa5t', 'resolve...",{'hashed': 'ebfa86a7ce6c4d2fa46c45414dcdd3d1'}


In [25]:
# Flag every row of the private part as not written by a public figure.
# NOTE(review): presumably the public part receives True before both parts
# are joined -- confirm in the later cells.
df_private['public_person']=False

In [26]:
df_private

Unnamed: 0,entities,favorites,followees,followers,hashedUserName,hashtags,language,matchingKeywords,matchingUserMentions,matchingUserName,...,sentimentCompound,sentimentNegative,sentimentNeutral,sentimentPositive,sentiments,timeStamp,tweetId,urls,userName,public_person
0,"{'software': 'entity-fishing', 'version': '0.0...",1,400,685,fa8c8b30177759fd037cf4b05c03b85e,[],DE,[Impf],[@realDonaldTrump],,...,0.7783,0.0820,0.6300,0.2880,"{'software': 'vaderSentiment', 'version': '3.3...",Fri May 25 16:55:13 +0000 2018,1000057711093043200,"[{'short': 'https://t.co/PZqrqME1yx', 'resolve...",{'hashed': 'fa8c8b30177759fd037cf4b05c03b85e'},False
1,"{'software': 'entity-fishing', 'version': '0.0...",18,752,53517,c10c09d649ad32390893080f2064c71d,[Russland-Politik],DE,[Impf],[@HeikoMaas],,...,0.1260,0.1230,0.7430,0.1340,"{'software': 'vaderSentiment', 'version': '3.3...",Sun May 27 17:09:21 +0000 2018,1000786044223213568,"[{'short': 'https://t.co/35qiJ6R7a2', 'resolve...",{'hashed': 'c10c09d649ad32390893080f2064c71d'},False
2,"{'software': 'entity-fishing', 'version': '0.0...",9,210,1915,a018d5706d8d5204b9bd8fa2bf46385c,"[Klimastrategie, Regierungsklausur]",DE,[Klima],[@sebastiankurz],,...,0.2732,0.0000,0.9420,0.0580,"{'software': 'vaderSentiment', 'version': '3.3...",Sun May 27 16:34:40 +0000 2018,1000777316392493057,[],{'hashed': 'a018d5706d8d5204b9bd8fa2bf46385c'},False
3,"{'software': 'entity-fishing', 'version': '0.0...",0,890,656,e0f116b965e2adb2e44deadd216f421b,[],DE,[Energie],[@UN],,...,0.4215,0.0000,0.8590,0.1410,"{'software': 'vaderSentiment', 'version': '3.3...",Mon May 28 11:38:24 +0000 2018,1001065147295813632,[],{'hashed': 'e0f116b965e2adb2e44deadd216f421b'},False
4,"{'software': 'entity-fishing', 'version': '0.0...",0,54,11,c9307f56114eaeb377133c3604bfe962,[],DE,[Klima],[@HeikoMaas],,...,-0.6390,0.0950,0.9050,0.0000,"{'software': 'vaderSentiment', 'version': '3.3...",Tue May 29 11:15:05 +0000 2018,1001421667330154497,[],{'hashed': 'c9307f56114eaeb377133c3604bfe962'},False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063271,"{'software': 'entity-fishing', 'version': '0.0...",133,1624,8401,27c6a4a1b0f00224690766c56f71034d,[ElDebate],ES,[gas],[@petrogustavo],,...,,,,,,Fri May 25 02:56:17 +0000 2018,999846586674016257,[],{'hashed': '27c6a4a1b0f00224690766c56f71034d'},False
1063272,"{'software': 'entity-fishing', 'version': '0.0...",4509,1759,36527,9ecc98199bbd5833682f073f0d3c014d,[],ES,[gas],"[@IvanDuque, @petrogustavo]",,...,,,,,,Fri May 25 01:30:19 +0000 2018,999824952609181697,[],{'hashed': '9ecc98199bbd5833682f073f0d3c014d'},False
1063273,"{'software': 'entity-fishing', 'version': '0.0...",88,69,781625,8c300459233cd3c8d0649444ada0525b,"[ElDebate, VargasLlerasPresidente]",ES,[gas],[@petrogustavo],,...,,,,,,Fri May 25 01:20:51 +0000 2018,999822572333293568,[],{'hashed': '8c300459233cd3c8d0649444ada0525b'},False
1063274,"{'software': 'entity-fishing', 'version': '0.0...",3,413,18250,ebfa86a7ce6c4d2fa46c45414dcdd3d1,[DuqueSeGanóMiVoto],ES,[falta],[@IvanDuque],,...,,,,,,Fri May 25 12:54:30 +0000 2018,999997132634509313,"[{'short': 'https://t.co/UYKkKVHa5t', 'resolve...",{'hashed': 'ebfa86a7ce6c4d2fa46c45414dcdd3d1'},False


In [27]:
df_private.columns

Index(['entities', 'favorites', 'followees', 'followers', 'hashedUserName',
       'hashtags', 'language', 'matchingKeywords', 'matchingUserMentions',
       'matchingUserName', 'mentions', 'plds', 'replies', 'retweetId',
       'retweetTimeStamp', 'retweetUserName', 'retweets', 'sentimentCompound',
       'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',
       'sentiments', 'timeStamp', 'tweetId', 'urls', 'userName',
       'public_person'],
      dtype='object')

## German Tweets by Public Figures (Users)

In [28]:
language='DE'

In [29]:
# Load the public-figure tweets for the language selected above. The file is
# read as JSON Lines (one record per line); convert_dates=False switches off
# pandas' automatic date parsing.
df_public_de = pd.read_json(
    './tweetplomacy-public-23-' + language + '_v2-0-0.jsonl',
    orient='records',
    lines=True,
    convert_dates=False,
)

In [30]:
df_public_de = prepare_data(df_public_de, language)

In [31]:
df_public_de

Unnamed: 0,tweetId,timeStamp,userName,userBio,followers,followees,retweets,favorites,replies,matchingKeywords,...,retweetId,retweetTimeStamp,retweetUserName,language,sentimentCompound,sentimentNegative,sentimentPositive,sentimentNeutral,hashedUserName,plds
0,1004066836592422913,Tue Jun 05 18:26:03 +0000 2018,"{'userName': 'sebastiankurz', 'hashed': '6cf04...",Chancellor of Austria / Bundeskanzler der Repu...,300091,1070,33,129,12,[Ukraine],...,1004562903243153408.0000,Thu Jun 07 03:17:14 +0000 2018,{'hashed': '55a6d1aa52a26e4882bbb40f79863af6'},DE,0.8316,0.0000,0.2240,0.7760,6cf0434f3045cb06c5c1a64243bb778b,[]
1,1004764004701605897,Thu Jun 07 16:36:20 +0000 2018,"{'userName': 'AuswaertigesAmt', 'hashed': 'dd4...",Aktuelle Nachrichten aus dem Auswärtigen Amt -...,647817,1022,9,19,3,[Ukraine],...,1004781833291255808.0000,Thu Jun 07 17:47:11 +0000 2018,{'hashed': '6c355edcd1d1d2ec8635ed65cbd56070'},DE,0.4019,0.0000,0.1190,0.8810,dd4c8ed8f1015bd4826be9de24b355a2,[twitter.com]
2,1004966895211237376,Fri Jun 08 06:02:33 +0000 2018,"{'userName': 'RegSprecher', 'hashed': 'de562ec...",Sprecher der Bundesregierung und Chef des Bund...,912028,122,4,7,1,[Klima],...,1004967414474436608.0000,Fri Jun 08 06:04:37 +0000 2018,{'hashed': 'd779c4912d2a5c96c054850aaa51b919'},DE,0.4215,0.0000,0.0980,0.9020,de562ec84c7998e0b09a4f03841143ed,[twitter.com]
3,1004966895211237376,Fri Jun 08 06:02:33 +0000 2018,"{'userName': 'RegSprecher', 'hashed': 'de562ec...",Sprecher der Bundesregierung und Chef des Bund...,912081,122,30,99,62,[Klima],...,1005188114556571648.0000,Fri Jun 08 20:41:36 +0000 2018,{'hashed': '7c053f3495e5a8f79b7c13e0d90f6d5f'},DE,0.4215,0.0000,0.0980,0.9020,de562ec84c7998e0b09a4f03841143ed,[twitter.com]
4,1006241207759097860,Mon Jun 11 18:26:13 +0000 2018,"{'userName': 'AuswaertigesAmt', 'hashed': 'dd4...",Aktuelle Nachrichten aus dem Auswärtigen Amt -...,648129,1022,14,29,11,[Ukraine],...,1006251169323343872.0000,Mon Jun 11 19:05:48 +0000 2018,{'hashed': 'b298ca69dfb1ec2f23e092e9f7e4c881'},DE,-0.3612,0.0820,0.0000,0.9180,dd4c8ed8f1015bd4826be9de24b355a2,[twitter.com]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4565,996676207046545408,Wed May 16 08:58:19 +0000 2018,"{'userName': 'ABaerbock', 'hashed': 'd7ab03945...",Bundesvorsitzende von Bündnis 90/Die Grünen * ...,15517,1250,21,45,3,[Klima],...,996758431766900736.0000,Wed May 16 14:25:03 +0000 2018,{'hashed': 'a2b39f09e62f726e66b5c85a82dec371'},DE,0.5859,0.0590,0.1450,0.7950,d7ab039457f21333653b19d95644e96d,[www.spiegel.de]
4566,997817003971874816,Sat May 19 12:31:26 +0000 2018,"{'userName': 'HeikoMaas', 'hashed': '3584d2d73...",Bundesaußenminister & Saarländer. MdB für den ...,282653,2936,24,113,34,[Knapp],...,997839773640388608.0000,Sat May 19 14:01:55 +0000 2018,{'hashed': 'f5d59ca91fec031efe18ed5230bfe810'},DE,0.9325,0.0330,0.3440,0.6220,3584d2d73a504767f27ba07decb7dd70,[twitter.com]
4567,998463684765716480,Mon May 21 07:21:07 +0000 2018,"{'userName': 'MFA_Austria', 'hashed': '6d40a0b...","Bundesministerium für Europa, Integration und ...",26567,927,0,0,0,[Energie],...,,,,DE,0.6908,0.0000,0.1550,0.8450,6d40a0bf9480f91f9d31bb0a8fc9860b,[twitter.com]
4568,998463670555435008,Mon May 21 07:21:04 +0000 2018,"{'userName': 'MFA_Austria', 'hashed': '6d40a0b...","Bundesministerium für Europa, Integration und ...",26589,930,4,2,1,[Energie],...,998977394739417088.0000,Tue May 22 17:22:25 +0000 2018,{'hashed': '30f13329073fe64a4b8fadad8c9aa14b'},DE,0.4574,0.0000,0.0880,0.9120,6d40a0bf9480f91f9d31bb0a8fc9860b,"[www.entwicklung.at, twitter.com]"


## English Tweets by Public Figures (Users)

In [32]:
language='EN'

In [33]:
# Load the public-figure tweets for the language selected above. The file is
# read as JSON Lines (one record per line); convert_dates=False switches off
# pandas' automatic date parsing.
df_public_en = pd.read_json(
    './tweetplomacy-public-23-' + language + '_v2-0-0.jsonl',
    orient='records',
    lines=True,
    convert_dates=False,
)

In [34]:
df_public_en = prepare_data(df_public_en, language)

In [35]:
df_public_en

Unnamed: 0,tweetId,timeStamp,userName,userBio,followers,followees,retweets,favorites,replies,matchingKeywords,...,retweetId,retweetTimeStamp,retweetUserName,language,sentimentCompound,sentimentNegative,sentimentPositive,sentimentNeutral,hashedUserName,plds
0,999676007639601152,Thu May 24 15:38:27 +0000 2018,"{'userName': 'narendramodi', 'hashed': 'c45c37...",Prime Minister of India,42695193,1939,2606,14899,479,[energy],...,1000001551476244480.0000,Fri May 25 13:12:03 +0000 2018,{'hashed': 'daa861ed664a61110332e49f058c413d'},EN,0.7717,0.0000,0.2280,0.7720,c45c37cfa8e3bd7e2f111f098986603d,[twitter.com]
1,999676007639601152,Thu May 24 15:38:27 +0000 2018,"{'userName': 'narendramodi', 'hashed': 'c45c37...",Prime Minister of India,42693874,1939,2612,14915,480,[energy],...,1000002621036474368.0000,Fri May 25 13:16:18 +0000 2018,{'hashed': 'fa52a61a7d6a47511984236f7586916e'},EN,0.7717,0.0000,0.2280,0.7720,c45c37cfa8e3bd7e2f111f098986603d,[twitter.com]
2,999675638586970112,Thu May 24 15:36:59 +0000 2018,"{'userName': 'narendramodi', 'hashed': 'c45c37...",Prime Minister of India,42694472,1939,1652,9338,372,[energy],...,1000011089340452864.0000,Fri May 25 13:49:57 +0000 2018,{'hashed': 'a168e1eb46c35f47a0eb322ff69d0ab4'},EN,0.2732,0.0000,0.0750,0.9250,c45c37cfa8e3bd7e2f111f098986603d,[twitter.com]
3,999675638586970112,Thu May 24 15:36:59 +0000 2018,"{'userName': 'narendramodi', 'hashed': 'c45c37...",Prime Minister of India,42696697,1939,1659,9425,376,[energy],...,1000021881271992320.0000,Fri May 25 14:32:50 +0000 2018,{'hashed': '49e289575a8b9374371e1644d0ab743c'},EN,0.2732,0.0000,0.0750,0.9250,c45c37cfa8e3bd7e2f111f098986603d,[twitter.com]
4,999676007639601152,Thu May 24 15:38:27 +0000 2018,"{'userName': 'narendramodi', 'hashed': 'c45c37...",Prime Minister of India,42696299,1939,2654,15120,486,[energy],...,1000022762075840512.0000,Fri May 25 14:36:20 +0000 2018,{'hashed': '26b984876e21e5a392e8d00a6b9b302b'},EN,0.7717,0.0000,0.2280,0.7720,c45c37cfa8e3bd7e2f111f098986603d,[twitter.com]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626435,999676007639601152,Thu May 24 15:38:27 +0000 2018,"{'userName': 'narendramodi', 'hashed': 'c45c37...",Prime Minister of India,42691368,1939,2514,14415,458,[energy],...,999967439218724864.0000,Fri May 25 10:56:30 +0000 2018,{'hashed': 'b8942d26f74cb550f7cf2c0e5c2641e8'},EN,0.7717,0.0000,0.2280,0.7720,c45c37cfa8e3bd7e2f111f098986603d,[twitter.com]
626436,999676007639601152,Thu May 24 15:38:27 +0000 2018,"{'userName': 'narendramodi', 'hashed': 'c45c37...",Prime Minister of India,42691817,1939,2532,14511,463,[energy],...,999972757583663104.0000,Fri May 25 11:17:38 +0000 2018,{'hashed': 'c75a76d801357ae73941cbecbaf95d5c'},EN,0.7717,0.0000,0.2280,0.7720,c45c37cfa8e3bd7e2f111f098986603d,[twitter.com]
626437,999923263454134272,Fri May 25 08:00:58 +0000 2018,"{'userName': 'NOIweala', 'hashed': '962f359729...",Dr. Ngozi Okonjo-Iweala is Senior Advisor @Laz...,832686,6,14,23,1,[vaccin],...,999975081236402176.0000,Fri May 25 11:26:52 +0000 2018,{'hashed': '0619c69675d760ad3c0eca7538871784'},EN,0.6597,0.0000,0.1620,0.8380,962f35972905addd2476c14e7d3b16c5,[www.facebook.com]
626438,999929192451133440,Fri May 25 08:24:31 +0000 2018,"{'userName': 'WHO', 'hashed': '9ed1e9f759b27c2...",Official Twitter account of the World Health O...,4528647,1656,43,78,3,[energy],...,999980986820628480.0000,Fri May 25 11:50:20 +0000 2018,{'hashed': '1df9b0425cb8d45a82f22cecf4d82b51'},EN,0.2732,0.0000,0.0480,0.9520,9ed1e9f759b27c20c0ee8a56113c84e3,[twitter.com]


## French Tweets by Public Figures (Users)

In [36]:
language='FR'

In [37]:
# Load the public-figure tweets for the language selected above. The file is
# read as JSON Lines (one record per line); convert_dates=False switches off
# pandas' automatic date parsing.
df_public_fr = pd.read_json(
    './tweetplomacy-public-23-' + language + '_v2-0-0.jsonl',
    orient='records',
    lines=True,
    convert_dates=False,
)

In [38]:
df_public_fr= prepare_data(df_public_fr, language)

In [39]:
df_public_fr

Unnamed: 0,tweetId,timeStamp,userName,userBio,followers,followees,retweets,favorites,replies,matchingKeywords,...,retweetId,retweetTimeStamp,retweetUserName,language,sentimentCompound,sentimentNegative,sentimentPositive,sentimentNeutral,hashedUserName,plds
0,999739837673549825,Thu May 24 19:52:06 +0000 2018,"{'userName': 'EmmanuelMacron', 'hashed': '4e51...",Président de la République française.,3002466,679,106,315,31,[Ukraine],...,1000131268720451584.0000,Fri May 25 21:47:30 +0000 2018,{'hashed': '14108ae52d7d97b1acf6d4bc1742eba9'},FR,0.4588,0.0000,0.1600,0.8400,4e51f43c9d8cb261960330e9a90e62b9,[]
1,1001817140595445760,Wed May 30 13:26:33 +0000 2018,"{'userName': 'JustinTrudeau', 'hashed': 'ab510...",Account run by the 23rd Prime Minister of Cana...,4160789,965,33,180,18,[climat],...,1001851109512753152.0000,Wed May 30 15:41:32 +0000 2018,{'hashed': 'f9d9ab3a77fa1743e9a6918e8f87a996'},FR,-0.2714,0.0570,0.0320,0.9110,ab510c86ea6b15db1a731375ff759085,[pm.gc.ca]
2,1001855879979896835,Wed May 30 16:00:30 +0000 2018,"{'userName': 'EmmanuelMacron', 'hashed': '4e51...",Président de la République française.,3012246,681,2,12,2,[climat],...,1001856151066210304.0000,Wed May 30 16:01:34 +0000 2018,{'hashed': 'e739f219c1f8b5c229c653039bad8b0f'},FR,0.1759,0.0000,0.0510,0.9490,4e51f43c9d8cb261960330e9a90e62b9,[]
3,1001817140595445760,Wed May 30 13:26:33 +0000 2018,"{'userName': 'JustinTrudeau', 'hashed': 'ab510...",Account run by the 23rd Prime Minister of Cana...,4160962,965,42,229,21,[climat],...,1001929031305089024.0000,Wed May 30 20:51:10 +0000 2018,{'hashed': 'fef4a9b2e456fcf52ba07cc23f4ed364'},FR,-0.2714,0.0570,0.0320,0.9110,ab510c86ea6b15db1a731375ff759085,[pm.gc.ca]
4,972825828433301504,Sun Mar 11 13:25:26 +0000 2018,"{'userName': 'EmmanuelMacron', 'hashed': '4e51...",Président de la République française.,3017403,681,349,1095,102,[climat],...,1003496451106844672.0000,Mon Jun 04 04:39:32 +0000 2018,{'hashed': '2d9d55dbfaf4b432054d8c1a1f29a67e'},FR,0.8934,0.0000,0.3140,0.6860,4e51f43c9d8cb261960330e9a90e62b9,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13172,997437634417971200,Fri May 18 11:23:58 +0000 2018,"{'userName': 'dreynders', 'hashed': '5ed8d4a17...",deputy prime minister and foreign affairs mini...,138163,4521,11,12,3,[Ukraine],...,997492703393976320.0000,Fri May 18 15:02:47 +0000 2018,{'hashed': '75d14c57d51bce65a537e6e48cfd87f3'},FR,0.0000,0.0000,0.0000,1.0000,5ed8d4a170ee4b34261580be889289e9,[twitter.com]
13173,998904222170124288,Tue May 22 12:31:39 +0000 2018,"{'userName': 'EmmanuelMacron', 'hashed': '4e51...",Président de la République française.,3021158,679,60,197,26,[énergie],...,998911678350872576.0000,Tue May 22 13:01:17 +0000 2018,{'hashed': 'f73ed8cd909eb4b20ce2e6c5c1e05df3'},FR,0.4404,0.0000,0.1000,0.9000,4e51f43c9d8cb261960330e9a90e62b9,[]
13174,999684451583000576,Thu May 24 16:12:01 +0000 2018,"{'userName': 'EmmanuelMacron', 'hashed': '4e51...",Président de la République française.,3022408,679,122,330,57,[Ukraine],...,999708004726595584.0000,Thu May 24 17:45:36 +0000 2018,{'hashed': '6f1c70738e00064de6a38973f46c9f8b'},FR,-0.0258,0.0460,0.0000,0.9540,4e51f43c9d8cb261960330e9a90e62b9,[twitter.com]
13175,999684451583000576,Thu May 24 16:12:01 +0000 2018,"{'userName': 'EmmanuelMacron', 'hashed': '4e51...",Président de la République française.,3022472,679,148,385,66,[Ukraine],...,999718335309860864.0000,Thu May 24 18:26:39 +0000 2018,{'hashed': 'e37fcfb164e580475c012527cf4ce9bd'},FR,-0.0258,0.0460,0.0000,0.9540,4e51f43c9d8cb261960330e9a90e62b9,[twitter.com]


## Spanish Tweets by Public Figures (Users)

In [40]:
language='ES'

In [41]:
# Load the public-figure tweets for the language selected above. The file is
# read as JSON Lines (one record per line); convert_dates=False switches off
# pandas' automatic date parsing.
df_public_es = pd.read_json(
    './tweetplomacy-public-23-' + language + '_v2-0-0.jsonl',
    orient='records',
    lines=True,
    convert_dates=False,
)

In [42]:
df_public_es = prepare_data(df_public_es, language)

In [43]:
df_public_es

Unnamed: 0,tweetId,timeStamp,userName,userBio,followers,followees,retweets,favorites,replies,matchingKeywords,...,entities,hashtags,mentions,urls,retweetId,retweetTimeStamp,retweetUserName,language,hashedUserName,plds
0,999748072300269569,Thu May 24 20:24:49 +0000 2018,"{'userName': 'NicolasMaduro', 'hashed': '4270b...",Presidente de la República Bolivariana de Vene...,3379972,113,2536,1780,471,[falta],...,"{'software': 'entity-fishing', 'version': '0.0...",[],[],"[{'short': 'https://t.co/203jwJgeVP', 'resolve...",1000000767133126656.0000,Fri May 25 13:08:56 +0000 2018,{'hashed': 'd1faa4bd19e21ed4124cfcf1c82cf12c'},ES,4270bdb375288497d1cebb089c9fbc9b,[twitter.com]
1,1000005381798711297,Fri May 25 13:27:16 +0000 2018,"{'userName': 'petrogustavo', 'hashed': '4a96d2...",Perfil Oficial del dirigente político progresi...,3150286,2057,9,21,1,[gas],...,"{'software': 'entity-fishing', 'version': '0.0...",[],[],"[{'short': 'https://t.co/oiJ8p0lXda', 'resolve...",1000005611554295808.0000,Fri May 25 13:28:11 +0000 2018,{'hashed': 'e48e9c32fc5e718475a1c9b8a464036b'},ES,4a96d2bb33292ba669f888422fd5ee13,[www.las2orillas.co]
2,1000005381798711297,Fri May 25 13:27:16 +0000 2018,"{'userName': 'petrogustavo', 'hashed': '4a96d2...",Perfil Oficial del dirigente político progresi...,3150298,2057,138,229,13,[gas],...,"{'software': 'entity-fishing', 'version': '0.0...",[],[],"[{'short': 'https://t.co/oiJ8p0lXda', 'resolve...",1000007767464259584.0000,Fri May 25 13:36:45 +0000 2018,{'hashed': '9c2d26307b70784aa3ee8958da5d0b93'},ES,4a96d2bb33292ba669f888422fd5ee13,[www.las2orillas.co]
3,1000005381798711297,Fri May 25 13:27:16 +0000 2018,"{'userName': 'petrogustavo', 'hashed': '4a96d2...",Perfil Oficial del dirigente político progresi...,3150300,2057,178,289,15,[gas],...,"{'software': 'entity-fishing', 'version': '0.0...",[],[],"[{'short': 'https://t.co/oiJ8p0lXda', 'resolve...",1000008732133249024.0000,Fri May 25 13:40:35 +0000 2018,{'hashed': '6ae79735acb9f1c5442eb48b407de707'},ES,4a96d2bb33292ba669f888422fd5ee13,[www.las2orillas.co]
4,1000007791803817984,Fri May 25 13:36:51 +0000 2018,"{'userName': 'petrogustavo', 'hashed': '4a96d2...",Perfil Oficial del dirigente político progresi...,3150302,2057,69,149,16,[gas],...,"{'software': 'entity-fishing', 'version': '0.0...",[],[],"[{'short': 'https://t.co/tQ8RaGMZVg', 'resolve...",1000009155753795584.0000,Fri May 25 13:42:16 +0000 2018,{'hashed': '3ff8cc0629f09e9465237a59a208a277'},ES,4a96d2bb33292ba669f888422fd5ee13,[twitter.com]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175435,999748072300269569,Thu May 24 20:24:49 +0000 2018,"{'userName': 'NicolasMaduro', 'hashed': '4270b...",Presidente de la República Bolivariana de Vene...,3379926,113,2269,1596,443,[falta],...,"{'software': 'entity-fishing', 'version': '0.0...",[],[],"[{'short': 'https://t.co/203jwJgeVP', 'resolve...",999969855142092800.0000,Fri May 25 11:06:06 +0000 2018,{'hashed': '2d03bc97482b1ce06d823f291847eeaf'},ES,4270bdb375288497d1cebb089c9fbc9b,[twitter.com]
175436,999748072300269569,Thu May 24 20:24:49 +0000 2018,"{'userName': 'NicolasMaduro', 'hashed': '4270b...",Presidente de la República Bolivariana de Vene...,3379930,113,2281,1603,444,[falta],...,"{'software': 'entity-fishing', 'version': '0.0...",[],[],"[{'short': 'https://t.co/203jwJgeVP', 'resolve...",999971549615722496.0000,Fri May 25 11:12:50 +0000 2018,{'hashed': '07d14cf16d036c7566416f86c0bf14fa'},ES,4270bdb375288497d1cebb089c9fbc9b,[twitter.com]
175437,999748072300269569,Thu May 24 20:24:49 +0000 2018,"{'userName': 'NicolasMaduro', 'hashed': '4270b...",Presidente de la República Bolivariana de Vene...,3379931,113,2288,1609,446,[falta],...,"{'software': 'entity-fishing', 'version': '0.0...",[],[],"[{'short': 'https://t.co/203jwJgeVP', 'resolve...",999972598216871936.0000,Fri May 25 11:17:00 +0000 2018,{'hashed': '41f2317e855345ace12f72309e3f4773'},ES,4270bdb375288497d1cebb089c9fbc9b,[twitter.com]
175438,999748072300269569,Thu May 24 20:24:49 +0000 2018,"{'userName': 'NicolasMaduro', 'hashed': '4270b...",Presidente de la República Bolivariana de Vene...,3379947,113,2381,1678,452,[falta],...,"{'software': 'entity-fishing', 'version': '0.0...",[],[],"[{'short': 'https://t.co/203jwJgeVP', 'resolve...",999983721515188224.0000,Fri May 25 12:01:12 +0000 2018,{'hashed': '341693648614b4a62e64f847d01d12ae'},ES,4270bdb375288497d1cebb089c9fbc9b,[twitter.com]


## Create the Public Part of the Dataset

In [44]:

# Sanity check: add up the number of distinct tweetIds found in each of the
# four per-language public frames.
sum(
    len(frame['tweetId'].unique())
    for frame in (df_public_de, df_public_en, df_public_fr, df_public_es)
)

105810

In [45]:
# Total number of rows across the four per-language public frames; used later
# to verify that nothing is lost when the frames are combined.
df_public_len = sum(map(len, (df_public_de, df_public_en, df_public_fr, df_public_es)))

In [46]:
# Show the combined row count of the four public frames.
df_public_len

819627

In [47]:
# Stack the four per-language public frames into a single frame.
# DataFrame.append is deprecated and was removed in pandas 2.0, and every
# chained call copies the accumulated rows again; pd.concat builds the same
# result (union of columns, original row labels kept) in one pass.
df_public = pd.concat([df_public_de, df_public_en, df_public_fr, df_public_es])


In [48]:
# Mark every row of the combined public frame as originating from a public figure.
df_public['public_person']=True

In [49]:
# Row count after combining; it should equal df_public_len computed above.
len(df_public)

819627

## Create a Dataframe Containing All Public and Private Tweets

In [50]:
# Combine the public and private parts into one frame.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in
# pandas 2.0. Columns present in only one part are filled with NaN in the
# other, and the original row labels of both parts are kept (no ignore_index),
# so labels in the result are not guaranteed to be unique.
df_all = pd.concat([df_public, df_private])

In [51]:
# Preview the joint frame (the notebook truncates rows and columns).
df_all

Unnamed: 0,entities,favorites,followees,followers,hashedUserName,hashtags,language,matchingKeywords,matchingUserMentions,matchingUserName,...,sentimentCompound,sentimentNegative,sentimentNeutral,sentimentPositive,sentiments,timeStamp,tweetId,urls,userBio,userName
0,"{'software': 'entity-fishing', 'version': '0.0...",129,1070,300091,6cf0434f3045cb06c5c1a64243bb778b,"[Russland, Syrien, Ukraine]",DE,[Ukraine],[],sebastiankurz,...,0.8316,0.0000,0.7760,0.2240,"{'software': 'vaderSentiment', 'version': '3.3...",Tue Jun 05 18:26:03 +0000 2018,1004066836592422913,[],Chancellor of Austria / Bundeskanzler der Repu...,"{'userName': 'sebastiankurz', 'hashed': '6cf04..."
1,"{'software': 'entity-fishing', 'version': '0.0...",19,1022,647817,dd4c8ed8f1015bd4826be9de24b355a2,[Ukraine:],DE,[Ukraine],[],AuswaertigesAmt,...,0.4019,0.0000,0.8810,0.1190,"{'software': 'vaderSentiment', 'version': '3.3...",Thu Jun 07 16:36:20 +0000 2018,1004764004701605897,"[{'short': 'https://t.co/c98ToQ62Ez', 'resolve...",Aktuelle Nachrichten aus dem Auswärtigen Amt -...,"{'userName': 'AuswaertigesAmt', 'hashed': 'dd4..."
2,"{'software': 'entity-fishing', 'version': '0.0...",7,122,912028,de562ec84c7998e0b09a4f03841143ed,"[Merkel, G7-Gipfel, G7Charlevoix]",DE,[Klima],[],RegSprecher,...,0.4215,0.0000,0.9020,0.0980,"{'software': 'vaderSentiment', 'version': '3.3...",Fri Jun 08 06:02:33 +0000 2018,1004966895211237376,"[{'short': 'https://t.co/A56faXkzRS', 'resolve...",Sprecher der Bundesregierung und Chef des Bund...,"{'userName': 'RegSprecher', 'hashed': 'de562ec..."
3,"{'software': 'entity-fishing', 'version': '0.0...",99,122,912081,de562ec84c7998e0b09a4f03841143ed,"[Merkel, G7-Gipfel, G7Charlevoix]",DE,[Klima],[],RegSprecher,...,0.4215,0.0000,0.9020,0.0980,"{'software': 'vaderSentiment', 'version': '3.3...",Fri Jun 08 06:02:33 +0000 2018,1004966895211237376,"[{'short': 'https://t.co/A56faXkzRS', 'resolve...",Sprecher der Bundesregierung und Chef des Bund...,"{'userName': 'RegSprecher', 'hashed': 'de562ec..."
4,"{'software': 'entity-fishing', 'version': '0.0...",29,1022,648129,dd4c8ed8f1015bd4826be9de24b355a2,[Ukraine],DE,[Ukraine],[@HeikoMaas],AuswaertigesAmt,...,-0.3612,0.0820,0.9180,0.0000,"{'software': 'vaderSentiment', 'version': '3.3...",Mon Jun 11 18:26:13 +0000 2018,1006241207759097860,"[{'short': 'https://t.co/2S0RWW6U5y', 'resolve...",Aktuelle Nachrichten aus dem Auswärtigen Amt -...,"{'userName': 'AuswaertigesAmt', 'hashed': 'dd4..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063271,"{'software': 'entity-fishing', 'version': '0.0...",133,1624,8401,27c6a4a1b0f00224690766c56f71034d,[ElDebate],ES,[gas],[@petrogustavo],,...,,,,,,Fri May 25 02:56:17 +0000 2018,999846586674016257,[],,{'hashed': '27c6a4a1b0f00224690766c56f71034d'}
1063272,"{'software': 'entity-fishing', 'version': '0.0...",4509,1759,36527,9ecc98199bbd5833682f073f0d3c014d,[],ES,[gas],"[@IvanDuque, @petrogustavo]",,...,,,,,,Fri May 25 01:30:19 +0000 2018,999824952609181697,[],,{'hashed': '9ecc98199bbd5833682f073f0d3c014d'}
1063273,"{'software': 'entity-fishing', 'version': '0.0...",88,69,781625,8c300459233cd3c8d0649444ada0525b,"[ElDebate, VargasLlerasPresidente]",ES,[gas],[@petrogustavo],,...,,,,,,Fri May 25 01:20:51 +0000 2018,999822572333293568,[],,{'hashed': '8c300459233cd3c8d0649444ada0525b'}
1063274,"{'software': 'entity-fishing', 'version': '0.0...",3,413,18250,ebfa86a7ce6c4d2fa46c45414dcdd3d1,[DuqueSeGanóMiVoto],ES,[falta],[@IvanDuque],,...,,,,,,Fri May 25 12:54:30 +0000 2018,999997132634509313,"[{'short': 'https://t.co/UYKkKVHa5t', 'resolve...",,{'hashed': 'ebfa86a7ce6c4d2fa46c45414dcdd3d1'}


In [52]:
# Expected number of rows in the joint frame: public rows plus private rows.
# df_private_len is presumably computed in the private-part section earlier
# in the notebook — verify there.
df_public_len + df_private_len

3836611

In [53]:
# List the columns of the joint frame before the topic columns are added.
df_all.columns


Index(['entities', 'favorites', 'followees', 'followers', 'hashedUserName',
       'hashtags', 'language', 'matchingKeywords', 'matchingUserMentions',
       'matchingUserName', 'mentions', 'plds', 'public_person', 'replies',
       'retweetId', 'retweetTimeStamp', 'retweetUserName', 'retweets',
       'sentimentCompound', 'sentimentNegative', 'sentimentNeutral',
       'sentimentPositive', 'sentiments', 'timeStamp', 'tweetId', 'urls',
       'userBio', 'userName'],
      dtype='object')

## Check the Matching Keywords

In [54]:
# Flatten the per-tweet keyword lists of all German rows into one list
# (duplicates are kept; the next cell reduces it to the distinct values).
keywords_de = [
    keyword
    for tweet_keywords in df_all.loc[df_all['language'] == 'DE', 'matchingKeywords']
    for keyword in tweet_keywords
]


In [55]:
# Distinct German keywords that actually occur in the data.
set(keywords_de)

{'CO2',
 'Corona',
 'Covid',
 'Energie',
 'Erderwärmung',
 'Erdgas',
 'Erdöl',
 'Gas',
 'IPCC',
 'Impf',
 'Klima',
 'Knapp',
 'Kohlenstoff',
 'Mangel',
 'Pandemie',
 'Sars.*Cov',
 'Treibhaus',
 'Ukraine',
 '\\bÖl'}

In [56]:
# German keyword groups, one set per topic. Entries are spelled exactly as they
# occur in the matchingKeywords column (see the distinct values listed above).

# COVID-19: Covid*, Corona*, Pandemie, Sars*Cov*, Impf*
topic_covid_19_de = {'Covid', 'Corona', 'Pandemie', 'Sars.*Cov', 'Impf'}

# Energy security: Energie*, Erdgas* OR Gas*, Erdöl* OR Öl*, Knapp* OR Mangel*, Ukraine
topic_energy_security_de = {
    'Energie', 'Erdgas', 'Gas', 'Erdöl', '\\bÖl',
    'Knapp', 'Mangel', 'Ukraine',
}

# Climate change: Klima*, Erderwärmung, Treibhaus*, Kohlenstoff* OR CO2, IPCC
topic_climate_change_de = {
    'Klima', 'Erderwärmung', 'Treibhaus', 'Kohlenstoff', 'CO2', 'IPCC',
}

In [57]:
# Flatten the per-tweet keyword lists of all English rows into one list
# (duplicates are kept; the next cell reduces it to the distinct values).
keywords_en = [
    keyword
    for tweet_keywords in df_all.loc[df_all['language'] == 'EN', 'matchingKeywords']
    for keyword in tweet_keywords
]


In [58]:
# Distinct English keywords that actually occur in the data.
set(keywords_en)

{'CO2',
 'IPCC',
 'Ukraine',
 '\\black\\b',
 '\\boil\\b',
 'carbon',
 'climate',
 'corona',
 'covid',
 'energy',
 'gas',
 'global warming',
 'greenhouse',
 'natural.*gas',
 'pandemic',
 'petroleum',
 'sars.*cov',
 'shortage',
 'vaccin'}

In [59]:
# English keyword groups, one set per topic. Entries are spelled exactly as they
# occur in the matchingKeywords column (see the distinct values listed above).

# COVID-19: covid*, corona*, pandemic, sars*cov*, vaccin*
topic_covid_19_en = {'covid', 'corona', 'pandemic', 'sars.*cov', 'vaccin'}

# Energy security: energy*, (natural*)gas, oil OR petroleum, shortage OR lack, Ukraine
topic_energy_security_en = {
    'energy', 'natural.*gas', 'gas', '\\boil\\b', 'petroleum',
    'shortage', '\\black\\b', 'Ukraine',
}

# Climate change: climate*, global warming, greenhouse*, carbon* OR CO2, IPCC
topic_climate_change_en = {
    'climate', 'global warming', 'greenhouse', 'carbon', 'CO2', 'IPCC',
}

In [60]:
# Flatten the per-tweet keyword lists of all French rows into one list
# (duplicates are kept; the next cell reduces it to the distinct values).
keywords_fr = [
    keyword
    for tweet_keywords in df_all.loc[df_all['language'] == 'FR', 'matchingKeywords']
    for keyword in tweet_keywords
]


In [61]:
# Distinct French keywords that actually occur in the data.
set(keywords_fr)

{'CO2',
 'IPCC',
 'Ukraine',
 'carbone',
 'carence',
 'climat',
 'corona',
 'covid',
 'gas.*oil',
 'gaz',
 'gaz naturel',
 'manque',
 'pandémie',
 'pétrole',
 'réchauffement global',
 'sars.*cov',
 'serre',
 'vaccin',
 'énergie'}

In [62]:
# French keyword groups, one set per topic. Entries are spelled exactly as they
# occur in the matchingKeywords column (see the distinct values listed above).

# COVID-19: covid*, corona*, pandémie, sars*cov*, vaccin*
topic_covid_19_fr = {'covid', 'corona', 'pandémie', 'sars.*cov', 'vaccin'}

# Energy security: énergie, gaz (naturel), gas*oil OR pétrole, manque* OR carence, Ukraine
topic_energy_security_fr = {
    'énergie', 'gaz naturel', 'gaz', 'gas.*oil', 'pétrole',
    'manque', 'carence', 'Ukraine',
}

# Climate change: climat*, réchauffement global, serre, carbone OR CO2, IPCC
topic_climate_change_fr = {
    'climat', 'réchauffement global', 'serre', 'carbone', 'CO2', 'IPCC',
}

In [63]:
# Flatten the per-tweet keyword lists of all Spanish rows into one list, then
# display the distinct keywords that actually occur in the data.
keywords_es = [
    keyword
    for tweet_keywords in df_all.loc[df_all['language'] == 'ES', 'matchingKeywords']
    for keyword in tweet_keywords
]
set(keywords_es)

{'CO2',
 'IPCC',
 'Ucrania',
 'calentamiento global',
 'carbono',
 'carencia',
 'clima',
 'corona',
 'covid',
 'energía',
 'falta',
 'gas',
 'gas natural',
 'invernadero',
 'pandemia',
 'petróleo',
 'sars.*cov',
 'vacuna'}

In [64]:
# Spanish keyword groups, one set per topic. Entries are spelled exactly as they
# occur in the matchingKeywords column (see the distinct values listed above).

# COVID-19: covid*, corona*, pandemia, sars*cov*, vacuna*
topic_covid_19_es = {'covid', 'corona', 'pandemia', 'sars.*cov', 'vacuna'}

# Energy security: energía, gas (natural), petróleo, carencia OR falta, Ucrania
topic_energy_security_es = {
    'energía', 'gas', 'gas natural', 'petróleo',
    'carencia', 'falta', 'Ucrania',
}

# Climate change: clima*, calentamiento global, invernadero, carbono OR CO2, IPCC
topic_climate_change_es = {
    'clima', 'calentamiento global', 'invernadero', 'carbono', 'CO2', 'IPCC',
}

## Assign Topics to Tweets Based on Matching Keywords

In [65]:
# Create the three topic columns with a placeholder value; the per-language
# cells below overwrite it with True/False for the rows of each language.
# NOTE(review): the placeholder is the *string* 'NaN', not a missing value, so
# any row left unassigned would not be caught by isna() — confirm this is intended.
for topic_column in ('topic_covid_19', 'topic_energy_security', 'topic_climate_change'):
    df_all[topic_column] = 'NaN'

In [66]:
# Flag the Spanish rows per topic: check_topic returns True when at least one
# of a tweet's matching keywords belongs to the given topic's keyword set.
es_rows = df_all['language'] == 'ES'
es_matching_keywords = df_all.loc[es_rows, 'matchingKeywords']
for topic_column, topic_keywords in (
    ('topic_covid_19', topic_covid_19_es),
    ('topic_energy_security', topic_energy_security_es),
    ('topic_climate_change', topic_climate_change_es),
):
    df_all.loc[es_rows, topic_column] = es_matching_keywords.apply(check_topic, args=(topic_keywords,))


In [67]:
# Flag the German rows per topic: check_topic returns True when at least one
# of a tweet's matching keywords belongs to the given topic's keyword set.
de_rows = df_all['language'] == 'DE'
de_matching_keywords = df_all.loc[de_rows, 'matchingKeywords']
for topic_column, topic_keywords in (
    ('topic_covid_19', topic_covid_19_de),
    ('topic_energy_security', topic_energy_security_de),
    ('topic_climate_change', topic_climate_change_de),
):
    df_all.loc[de_rows, topic_column] = de_matching_keywords.apply(check_topic, args=(topic_keywords,))


In [68]:
# Flag the English rows per topic: check_topic returns True when at least one
# of a tweet's matching keywords belongs to the given topic's keyword set.
en_rows = df_all['language'] == 'EN'
en_matching_keywords = df_all.loc[en_rows, 'matchingKeywords']
for topic_column, topic_keywords in (
    ('topic_covid_19', topic_covid_19_en),
    ('topic_energy_security', topic_energy_security_en),
    ('topic_climate_change', topic_climate_change_en),
):
    df_all.loc[en_rows, topic_column] = en_matching_keywords.apply(check_topic, args=(topic_keywords,))


In [69]:
# Flag the French rows per topic: check_topic returns True when at least one
# of a tweet's matching keywords belongs to the given topic's keyword set.
fr_rows = df_all['language'] == 'FR'
fr_matching_keywords = df_all.loc[fr_rows, 'matchingKeywords']
for topic_column, topic_keywords in (
    ('topic_covid_19', topic_covid_19_fr),
    ('topic_energy_security', topic_energy_security_fr),
    ('topic_climate_change', topic_climate_change_fr),
):
    df_all.loc[fr_rows, topic_column] = fr_matching_keywords.apply(check_topic, args=(topic_keywords,))


In [70]:
# List the columns again to confirm that the three topic_* columns were added.
df_all.columns


Index(['entities', 'favorites', 'followees', 'followers', 'hashedUserName',
       'hashtags', 'language', 'matchingKeywords', 'matchingUserMentions',
       'matchingUserName', 'mentions', 'plds', 'public_person', 'replies',
       'retweetId', 'retweetTimeStamp', 'retweetUserName', 'retweets',
       'sentimentCompound', 'sentimentNegative', 'sentimentNeutral',
       'sentimentPositive', 'sentiments', 'timeStamp', 'tweetId', 'urls',
       'userBio', 'userName', 'topic_covid_19', 'topic_energy_security',
       'topic_climate_change'],
      dtype='object')

## Pickle the Dataframe for Subsequent Analyses


In [71]:
# Persist the joint frame, including the topic columns, for the follow-up analyses.
# NOTE(review): the preview of df_all shows row labels restarting at 0 and ending
# far below the total row count, so labels look non-unique — consider resetting
# the index if later notebooks rely on label-based lookups.
df_all.to_pickle("joint_tweetplomacy_23.pkl")