In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import pearsonr

# Data Structure

In [2]:
data_dir = 'data'

if os.path.isdir(data_dir):
    for dirname, _, filenames in os.walk(data_dir):
        for filename in filenames:
            print(os.path.join(dirname, filename))
else:
    print('Please checkout to include-data branch.')

data/countries-iso-codes/wikipedia-iso-country-codes.csv
data/big-five-personality-test/test.csv
data/big-five-personality-test/submission.csv
data/big-five-personality-test/train.csv
data/data-final/6.csv
data/data-final/5.csv
data/data-final/4.csv
data/data-final/1.csv
data/data-final/3.csv
data/data-final/2.csv


# Handle COVID-19 Data

In [3]:
train = pd.read_csv('data/big-five-personality-test/train.csv')
test = pd.read_csv('data/big-five-personality-test/test.csv')

# Join the training and test sets
covid19 = pd.concat([train, test])

# Sort by date
covid19.sort_values('Date')
# Filter to the columns we need
covid19 = covid19.loc[:, ['Country/Region', 'Date', 'ConfirmedCases']]

print(f'Total: {len(covid19)}')
covid19.head()

Total: 30104


Unnamed: 0,Country/Region,Date,ConfirmedCases
0,Afghanistan,2020-01-22,0.0
1,Afghanistan,2020-01-23,0.0
2,Afghanistan,2020-01-24,0.0
3,Afghanistan,2020-01-25,0.0
4,Afghanistan,2020-01-26,0.0


# Filter least 50 confirmed cases, and had at least 14 days

In [4]:
covid19 = covid19[covid19.ConfirmedCases > 50]
covid19_numdays = covid19.loc[:, ['Country/Region', 'Date']]\
    .drop_duplicates()\
    .groupby('Country/Region')\
    .count()\
    .rename_axis('country')\
    .reset_index()

covid19_mindays = covid19_numdays[covid19_numdays.Date >= 14]
covid19 = covid19[covid19['Country/Region'].isin(covid19_mindays.country)]

print(f'Total: {len(covid19)}')
covid19.head()

Total: 2484


Unnamed: 0,Country/Region,Date,ConfirmedCases
678,Australia,2020-03-10,55.0
679,Australia,2020-03-11,65.0
680,Australia,2020-03-12,65.0
681,Australia,2020-03-13,92.0
682,Australia,2020-03-14,112.0


# Compute growth over 14 days

In [5]:
covid19_collapse_province = covid19\
    .groupby(['Country/Region', 'Date'])\
    .sum()\
    .reset_index()

print(f'Total: {len(covid19_collapse_province)}')
covid19_collapse_province.head()

Total: 779


Unnamed: 0,Country/Region,Date,ConfirmedCases
0,Australia,2020-03-10,55.0
1,Australia,2020-03-11,65.0
2,Australia,2020-03-12,65.0
3,Australia,2020-03-13,92.0
4,Australia,2020-03-14,112.0


In [6]:
covid19 = covid19_collapse_province\
    .groupby('Country/Region')\
    .head(14)\
    .groupby('Country/Region')\
    .tail(1)

print(f'Total: {len(covid19)}')
covid19.head()

Total: 36


Unnamed: 0,Country/Region,Date,ConfirmedCases
13,Australia,2020-03-23,1617.0
28,Austria,2020-03-19,2013.0
47,Bahrain,2020-03-17,228.0
68,Belgium,2020-03-19,1795.0
87,China,2020-02-04,22353.0


# Country Abbreviations

In [7]:
country_isos = pd.read_csv('data/countries-iso-codes/wikipedia-iso-country-codes.csv')
country_isos = country_isos.rename(columns={"English short name lower case": "Country/Region", 
                                            "Alpha-2 code": "country_abbr"})
country_isos = country_isos.loc[:, ['Country/Region', 'country_abbr']]

print(f'Total: {len(country_isos)}')
country_isos.head()

Total: 246


Unnamed: 0,Country/Region,country_abbr
0,Afghanistan,AF
1,Åland Islands,AX
2,Albania,AL
3,Algeria,DZ
4,American Samoa,AS


In [8]:
covid19 = covid19.merge(country_isos, left_on='Country/Region', right_on='Country/Region')
covid19 = covid19.dropna()

print(f'Total: {len(covid19)}')
covid19.head()

Total: 34


Unnamed: 0,Country/Region,Date,ConfirmedCases,country_abbr
0,Australia,2020-03-23,1617.0,AU
1,Austria,2020-03-19,2013.0,AT
2,Bahrain,2020-03-17,228.0,BH
3,Belgium,2020-03-19,1795.0,BE
4,China,2020-02-04,22353.0,CN


# Handle Big Five Personality Data

In [9]:
data_final_dir = 'data/data-final'

csv_reading = list()

if os.path.isdir(data_final_dir):
    for dirname, _, filenames in os.walk(data_final_dir):
        for filename in filenames:
            print(f'Reading: {os.path.join(dirname, filename)}')
            csv_reading.append(pd.read_csv(os.path.join(dirname, filename), sep='\t'))
            
else:
    print('Please checkout to include-data branch.')

big5 = pd.concat(csv_reading)  

print(f'Total: {len(big5)}')

Reading: data/data-final/6.csv
Reading: data/data-final/5.csv
Reading: data/data-final/4.csv
Reading: data/data-final/1.csv
Reading: data/data-final/3.csv
Reading: data/data-final/2.csv
Total: 1015341


# Reverse value

In [10]:
positively_keyed = ['EXT1', 'EXT3', 'EXT5', 'EXT7', 'EXT9',
                    'EST1', 'EST3', 'EST5', 'EST6', 'EST7', 'EST8', 'EST9', 'EST10',
                    'AGR2', 'AGR4', 'AGR6', 'AGR8', 'AGR9', 'AGR10',
                    'CSN1', 'CSN3', 'CSN5', 'CSN7', 'CSN9', 'CSN10', 
                    'OPN1', 'OPN3', 'OPN5', 'OPN7', 'OPN8', 'OPN9', 'OPN10']

negatively_keyed = ['EXT2', 'EXT4', 'EXT6', 'EXT8', 'EXT10',
                    'EST2', 'EST4',
                    'AGR1', 'AGR3', 'AGR5', 'AGR7', 
                    'CSN2', 'CSN4', 'CSN6', 'CSN8', 
                    'OPN2', 'OPN4', 'OPN6']

In [11]:
big5.loc[:, negatively_keyed] = 6 - big5.loc[:, negatively_keyed]

# Country-Level Big 5 Aggregates

In [12]:
big5_country_count = big5.country\
    .value_counts()\
    .rename_axis('country')\
    .reset_index(name='counts')

print(f'Total: {len(big5_country_count)}')
big5_country_count.head()

Total: 223


Unnamed: 0,country,counts
0,US,546403
1,GB,66596
2,CA,61849
3,AU,50030
4,PH,19847


# Use least 1000 observations.

In [13]:
big5 = big5[big5.country.isin(big5_country_count[big5_country_count.counts > 1000].country.values)]

big5 = big5.loc[:,['country'] + positively_keyed + negatively_keyed]

# Factor aggregation

In [14]:
EXT = ['EXT' + str(i) for i in range(1,11)]
EST = ['EST' + str(i) for i in range(1,11)]
AGR = ['AGR' + str(i) for i in range(1,11)]
CSN = ['CSN' + str(i) for i in range(1,11)]
OPN = ['OPN' + str(i) for i in range(1,11)]

In [15]:
big5['EXT'] = big5.loc[:, EXT].mean(axis=1)
big5['EST'] = big5.loc[:, EST].mean(axis=1)
big5['AGR'] = big5.loc[:, AGR].mean(axis=1)
big5['CSN'] = big5.loc[:, CSN].mean(axis=1)
big5['OPN'] = big5.loc[:, OPN].mean(axis=1)
big5 = big5.loc[:, ['country', 'EXT', 'EST', 'AGR', 'CSN', 'OPN']]

In [16]:
big5 = big5.dropna()
big5 = big5[big5.country != 'NONE']

In [17]:
print(f'Total: {len(big5)}')
big5.head()

Total: 975151


Unnamed: 0,country,EXT,EST,AGR,CSN,OPN
0,US,2.5,2.8,4.7,4.2,3.3
1,US,2.6,3.6,4.3,4.4,3.7
2,US,3.1,4.0,4.5,2.9,3.8
3,US,2.8,1.6,3.4,4.3,3.6
4,US,2.8,3.0,4.2,3.8,3.9


# Country-level averages

In [18]:
big5_cavgs = big5.groupby('country')\
                    .mean()\
                    .rename_axis('country')\
                    .reset_index()

In [19]:
print(f'Total: {len(big5_cavgs)}')
big5_cavgs.head()

Total: 57


Unnamed: 0,country,EXT,EST,AGR,CSN,OPN
0,AE,3.031275,3.125686,3.763399,3.392745,3.720131
1,AR,2.84863,3.230656,3.625175,3.157935,3.986265
2,AT,2.992051,2.985863,3.678952,3.238528,4.056052
3,AU,3.00047,3.033397,3.782136,3.353265,3.833542
4,BE,3.002726,3.049043,3.761206,3.197772,3.906212


# Joining Big 5 Country Data to COVID-19 Data

In [None]:
covid19_big5 = covid19.merge(big5_cavgs, left_on='country_abbr', right_on='country')
covid19_big5.head()