In [1]:
import pandas as pd
import numpy as np
import copy
import time

Location where the provenance files will be stored and the format of the timestamp, following the naming convention described in README.md

In [2]:
# Folder (relative path) that receives the intermediate provenance CSV files.
PROVENANCE_FOLDER_BASE = "../data/produced/provenance/"
# strftime pattern used in provenance file names: day-month-year-hour-minute-second.
TIMESTAMP_FORMAT = "%d-%m-%Y-%H-%M-%S"

Helper methods to generate the provenance files

In [3]:
# helper methods
def buildTimestamp():
    """Return the current local time formatted according to TIMESTAMP_FORMAT."""
    now = time.localtime()
    return time.strftime(TIMESTAMP_FORMAT, now)
    
def saveIntermediateProvenanceFile(dataframe, name):
    """Write ``dataframe`` as a timestamped CSV into the provenance folder.

    The file is named ``<name>--<timestamp>.csv`` and is written with a header
    row, comma separator and without the index column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to persist.
    name : str
        Prefix of the file name; the timestamp is appended after ``--``.
    """
    import os  # local import keeps this helper self-contained

    # Create the target folder on first use so the write does not fail on a fresh checkout.
    os.makedirs(PROVENANCE_FOLDER_BASE, exist_ok=True)
    filename = name + '--' + buildTimestamp() + '.csv'
    dataframe.to_csv(os.path.join(PROVENANCE_FOLDER_BASE, filename), sep=',', index=False, header=True)

## 1) Load the datasets 

### Social media users

In [4]:
# Read the quarterly user counts; the column pandas labels 'Unnamed: 0' is given the name 'year'.
socialMediaUsersRaw = (
    pd.read_csv('../data/input/social-media-users.csv')
    .rename(columns={'Unnamed: 0': 'year'})
)
socialMediaUsersRaw.head()

Unnamed: 0,year,Twitter users (in million),Facebook users (in million),Instagram users (in million)
0,Q1 2010,30,431,-
1,Q2 2010,40,482,-
2,Q3 2010,49,550,-
3,Q4 2010,54,608,-
4,Q1 2011,68,680,-


### Suicide rates

In [5]:
# Load the suicide rates CSV and preview its first rows.
suicideRates = pd.read_csv('../data/input/who-suicide-rates.csv')
suicideRates.head()

Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,WORLDBANKINCOMEGROUP,COUNTRY,SEX,Display Value,Numeric,Low,High,Comments
0,SDGSUICIDE,PUBLISHED,2010,EMR,,AFG,FMLE,3.9 [2.3-6.2],3.88,2.26,6.23,
1,SDGSUICIDE,PUBLISHED,2004,EMR,,AFG,BTSX,5.0 [2.9-8.2],5.03,2.89,8.22,
2,SDGSUICIDE,PUBLISHED,2007,EMR,,AFG,BTSX,4.8 [2.8-7.9],4.8,2.78,7.86,
3,SDGSUICIDE,PUBLISHED,2001,EMR,,AFG,MLE,5.2 [2.9-8.7],5.22,2.93,8.65,
4,SDGSUICIDE,PUBLISHED,2001,EMR,,AFG,FMLE,4.8 [2.8-7.8],4.84,2.79,7.77,


## 2) Column selection and data aggregation

### Social media users

As we only have yearly data for the suicide rates, we need to reduce the quarterly user data to one value per year. Since each suicide rate describes a whole year, we take the user count reported for Q4 (the end of the year).


In [6]:
# Keep only the rows whose label starts with "Q4" (one row per year).
socialMediaUsers = socialMediaUsersRaw.loc[
    lambda frame: frame['year'].str.startswith("Q4")
]
socialMediaUsers.head()

Unnamed: 0,year,Twitter users (in million),Facebook users (in million),Instagram users (in million)
3,Q4 2010,54,608,-
7,Q4 2011,117,845,-
11,Q4 2012,185,1056,-
15,Q4 2013,241,1228,150
19,Q4 2014,288,1393,300


Next we remove the "Q4 " prefix so that we can create a common column to join the 2 datasets on.

In [7]:
def removeQuarterlyIdentifier(entry):
    """Return ``entry`` with every occurrence of the text "Q4 " removed."""
    return entry.replace("Q4 ", "")

# Work on an independent deep copy so the filtered frame stays untouched.
socialMediaUsersCopy = socialMediaUsers.copy(deep=True)
yearWithoutQuarter = socialMediaUsersCopy['year'].map(removeQuarterlyIdentifier)
socialMediaUsersCopy.loc[:, 'year'] = yearWithoutQuarter

# Same object under the name used by the following cells.
socialMediaUsersFinal = socialMediaUsersCopy

Produce an intermediate file for validation (see README.md)

In [8]:
saveIntermediateProvenanceFile(socialMediaUsersFinal, 'social-media-users-year-aggregated')

We can now convert the numeric columns; the "," thousands separators are removed first to enable the conversion. The Instagram data is removed as we focus on Twitter and Facebook.

In [9]:
# conversions
# A new frame is built instead of assigning columns in place, and every step
# tolerates already-converted input, so this cell can safely be executed again.
facebookUsersText = (
    socialMediaUsersFinal['Facebook users (in million)']
    .astype(str)  # the separator can only be stripped from text; on a re-run the column is already numeric
    .str.replace(",", "", regex=False)
)
socialMediaUsersFinal = socialMediaUsersFinal.assign(**{
    'year': socialMediaUsersFinal['year'].astype(int),
    'Twitter users (in million)': socialMediaUsersFinal['Twitter users (in million)'].astype(int),
    'Facebook users (in million)': facebookUsersText.astype(int),
})

# removal of Instagram (errors='ignore': the column is already gone when the cell is re-run)
socialMediaUsersFinal = socialMediaUsersFinal.drop(columns='Instagram users (in million)', errors='ignore')

In [10]:
socialMediaUsersFinal

Unnamed: 0,year,Twitter users (in million),Facebook users (in million)
3,2010,54,608
7,2011,117,845
11,2012,185,1056
15,2013,241,1228
19,2014,288,1393
23,2015,305,1591
27,2016,318,1860
31,2017,330,2129
35,2018,321,2320
39,2019,340,2498


Produce an intermediate file for validation (see README.md)

In [11]:
saveIntermediateProvenanceFile(socialMediaUsersFinal, 'social-media-users-final-preprocessed')

### Suicide rates

The suicide rates data are per country and per sex. We would like to get aggregate information (as our other dataset does not provide any country specific information either).

In [12]:
suicideRates.head()

Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,WORLDBANKINCOMEGROUP,COUNTRY,SEX,Display Value,Numeric,Low,High,Comments
0,SDGSUICIDE,PUBLISHED,2010,EMR,,AFG,FMLE,3.9 [2.3-6.2],3.88,2.26,6.23,
1,SDGSUICIDE,PUBLISHED,2004,EMR,,AFG,BTSX,5.0 [2.9-8.2],5.03,2.89,8.22,
2,SDGSUICIDE,PUBLISHED,2007,EMR,,AFG,BTSX,4.8 [2.8-7.9],4.8,2.78,7.86,
3,SDGSUICIDE,PUBLISHED,2001,EMR,,AFG,MLE,5.2 [2.9-8.7],5.22,2.93,8.65,
4,SDGSUICIDE,PUBLISHED,2001,EMR,,AFG,FMLE,4.8 [2.8-7.8],4.84,2.79,7.77,


First we remove the columns that we are not interested in and keep only the country, sex, year and the numeric rate (renamed to `rate`).

In [13]:
# Select with a list: a set has no defined order and pandas >= 2.0 rejects sets as indexers.
# The rename is chained (not in place) so it never operates on a slice of the loaded frame.
suicideRates = (
    suicideRates[['COUNTRY', 'Numeric', 'YEAR', 'SEX']]
    .rename(columns={'Numeric': 'rate'})
)
suicideRates.head()

Unnamed: 0,COUNTRY,rate,YEAR,SEX
0,AFG,3.88,2010,FMLE
1,AFG,5.03,2004,BTSX
2,AFG,4.8,2007,BTSX
3,AFG,5.22,2001,MLE
4,AFG,4.84,2001,FMLE


The social media dataset only starts at year 2010 so we will filter the suicides dataset to match this timerange.

In [14]:
# Keep only the observations from 2010 onwards.
suicideRates = suicideRates.loc[lambda frame: frame['YEAR'] >= 2010]

We want to guarantee the completeness of the data so that we process the same number of countries for each year. Let's check that every country has exactly 10 entries per sex, i.e. that no year in the span 2010-2019 is missing.

In [15]:
# Count the YEAR entries available for every (COUNTRY, SEX) pair.
# NOTE: rows with a missing COUNTRY belong to no group (groupby drops NaN keys by default),
# so they are not covered by this check.
countryAggregated = (
    suicideRates
    .groupby(by=['COUNTRY', 'SEX'])
    .agg(years=('YEAR', 'count'))
)
countriesWithIncorrectNumberOfEntries = countryAggregated.loc[
    lambda counts: counts['years'] != 10
]

assert len(countriesWithIncorrectNumberOfEntries) == 0, "There should be 0 countries with number of years different from 10"

Next we want to check that an entry exists for each of the sexes (we want to take a look at how the suicide rates of each of these groups were affected). Since there are 3 unique values, we will check that each **COUNTRY, YEAR** combination has exactly 3 values.

In [16]:
# Count the SEX entries available for every (COUNTRY, YEAR) pair; three are expected.
countryAggregated = (
    suicideRates
    .groupby(by=['COUNTRY', 'YEAR'])
    .agg(genderGroups=('SEX', 'count'))
)
countriesWithIncorrectNumberOfEntries = countryAggregated.loc[
    lambda counts: counts['genderGroups'] != 3
]

assert len(countriesWithIncorrectNumberOfEntries) == 0, "There should be 0 countries with number entries of SEX different from 3"

The last step is to check for missing values (outliers will be ignored in this case)

In [17]:
suicideRates.isnull().sum()

COUNTRY    330
rate         0
YEAR         0
SEX          0
dtype: int64

There are some missing country labels. For the sake of computation we will remove those rows, as we don't know where the data comes from.

In [18]:
len(suicideRates)

5820

In [19]:
len(suicideRates[suicideRates['COUNTRY'].isnull()])

330

In [20]:
# dropna() with its default arguments removes every row that contains at least one missing value.
suicideRatesClean = suicideRates.dropna()
len(suicideRatesClean)

5490

In [21]:
# Sanity check: exactly the 330 rows counted above as lacking a COUNTRY must have been dropped.
assert len(suicideRatesClean) == len(suicideRates) - 330

The final preprocessing step is to aggregate the rates for each of the groups (males, females, combined) across all countries, as our social media user counts are global. We do not need to take into consideration the different sizes of the countries as the suicide rate is standardized per 100 000 citizens.

In [22]:
suicideRatesClean.columns

Index(['COUNTRY', 'rate', 'YEAR', 'SEX'], dtype='object')

In [24]:
# Average the rate over all countries within each (YEAR, SEX) group.
suicideRatesFinal = (
    suicideRatesClean
    .groupby(['YEAR', 'SEX'])[['rate']]
    .mean()
)
suicideRatesFinal

Unnamed: 0_level_0,Unnamed: 1_level_0,rate
YEAR,SEX,Unnamed: 2_level_1
2010,BTSX,10.76375
2010,FMLE,5.096074
2010,MLE,16.600605
2011,BTSX,10.621873
2011,FMLE,5.003518
2011,MLE,16.400656
2012,BTSX,10.612533
2012,FMLE,4.976508
2012,MLE,16.410945
2013,BTSX,10.453661


In [25]:
# Turn the (YEAR, SEX) group index back into ordinary columns.
# NOTE(review): executing this cell a second time on its own would add a redundant 'index' column.
suicideRatesFinal = suicideRatesFinal.reset_index()

Produce an intermediate file for validation (see README.md)

In [26]:
saveIntermediateProvenanceFile(suicideRatesFinal, 'suicide-rates-aggregated-by-year')

## 3) Dataset merging

We join the datasets on the year column. Additionally, some column names are modified to fit the naming scheme.

In [27]:
# Align the column names first, then merge on the shared 'year' column.
# This does not depend on which label pandas gives the index after joining two
# differently named indexes ('YEAR' vs 'year') and resetting it.
# how='left' keeps every suicide-rate row, in its original order.
mergedDf = (
    suicideRatesFinal
    .rename(columns={'YEAR': 'year', 'SEX': 'sex'})
    .merge(socialMediaUsersFinal, on='year', how='left')
)
mergedDf

Unnamed: 0,year,sex,rate,Twitter users (in million),Facebook users (in million)
0,2010,BTSX,10.76375,54,608
1,2010,FMLE,5.096074,54,608
2,2010,MLE,16.600605,54,608
3,2011,BTSX,10.621873,117,845
4,2011,FMLE,5.003518,117,845
5,2011,MLE,16.400656,117,845
6,2012,BTSX,10.612533,185,1056
7,2012,FMLE,4.976508,185,1056
8,2012,MLE,16.410945,185,1056
9,2013,BTSX,10.453661,241,1228


Produce an intermediate file for validation (see README.md)

In [28]:
saveIntermediateProvenanceFile(mergedDf, 'merged-datasets')

## 4) Feature calculation

To make the trend of change visible on a similar scale we will express every value as a percentage of its value at the start of the measurements (2010 = 100). This forces the values onto the same scale so that we can see the effect of the relative increase. As there are still 3 distinct groups in the dataset, this operation is done on a per-group basis.

In [29]:
def appendPercentChangeOverStandard(dataFrame):
    """Add columns expressing each measure as a percentage of its first-row value.

    The first row of ``dataFrame`` is used as the baseline, so it gets the
    value 100 in every new column. The frame is modified in place and also
    returned to allow chaining.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain the columns 'rate', 'Twitter users (in million)' and
        'Facebook users (in million)', with the baseline observation first.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with three columns added.

    Raises
    ------
    IndexError
        If ``dataFrame`` has no rows (there is no baseline to scale against).
    """
    # source column -> name of the derived column
    derivedColumnBySource = {
        'rate': 'Suicide Rate % change since 2010',
        'Twitter users (in million)': 'Twitter user count % change since 2010',
        'Facebook users (in million)': 'Facebook user count % change since 2010',
    }
    # baseline to scale against
    baseline = dataFrame.iloc[0]
    for sourceColumn, derivedColumn in derivedColumnBySource.items():
        # vectorized equivalent of mapping x -> x / baseline * 100 over the column
        dataFrame[derivedColumn] = dataFrame[sourceColumn] / baseline[sourceColumn] * 100
    return dataFrame


# Columns fed into the baseline calculation and the columns it produces.
baseColumns = ['rate', 'Twitter users (in million)', 'Facebook users (in million)']
percentColumns = [
    'Suicide Rate % change since 2010',
    'Twitter user count % change since 2010',
    'Facebook user count % change since 2010',
]


def _percentChangeForSex(sexCode):
    """Return the base and percent-of-baseline columns for the rows of one SEX code."""
    # Sort by year so the first row (the baseline) is the earliest year, and copy so the
    # new columns are written into an independent frame rather than a slice of mergedDf.
    groupDf = mergedDf.loc[mergedDf['sex'] == sexCode].sort_values('year')[baseColumns].copy()
    appendPercentChangeOverStandard(groupDf)
    return groupDf


# males
percentChangeFromOriginal_males = _percentChangeForSex('MLE')
# females
percentChangeFromOriginal_females = _percentChangeForSex('FMLE')
# combined
percentChangeFromOriginal_combined = _percentChangeForSex('BTSX')

# aggregate (DataFrame.append was removed in pandas 2.0; pd.concat is its replacement)
percentChangeDf = pd.concat([
    percentChangeFromOriginal_males,
    percentChangeFromOriginal_females,
    percentChangeFromOriginal_combined,
])

# rejoin with the original dataframe; the row index is preserved, so the join aligns on it.
# A list is used for the selection because pandas >= 2.0 rejects sets as indexers.
percentChangeDf = percentChangeDf[percentColumns]
# Dropping columns left over from a previous execution keeps the cell re-runnable.
mergedDf = mergedDf.drop(columns=percentColumns, errors='ignore').join(percentChangeDf)

# select only the new columns
finalDf = mergedDf[['year', 'sex'] + percentColumns]
finalDf

Unnamed: 0,year,sex,Suicide Rate % change since 2010,Twitter user count % change since 2010,Facebook user count % change since 2010
0,2010,BTSX,100.0,100.0,100.0
1,2010,FMLE,100.0,100.0,100.0
2,2010,MLE,100.0,100.0,100.0
3,2011,BTSX,98.681894,216.666667,138.980263
4,2011,FMLE,98.183773,216.666667,138.980263
5,2011,MLE,98.79553,216.666667,138.980263
6,2012,BTSX,98.595123,342.592593,173.684211
7,2012,FMLE,97.65375,342.592593,173.684211
8,2012,MLE,98.857511,342.592593,173.684211
9,2013,BTSX,97.119135,446.296296,201.973684


## 5) Final dataset output

In [30]:
finalDf.to_csv('../data/produced/final/social-media-impact-on-suicide-rates.csv', sep=',', index=False, header=True)

Additionally also store the final dataset for validation purposes

In [31]:
saveIntermediateProvenanceFile(finalDf, 'final-dataset')