In [1]:
# Importing necessary packages
import pandas as pd
import numpy as np
import re

In [2]:
# CLoning the repo
! git clone https://github.com/vrhughes/DS4002-Project2

Cloning into 'DS4002-Project2'...
remote: Enumerating objects: 63, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 63 (delta 13), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (63/63), 133.64 KiB | 5.81 MiB/s, done.
Resolving deltas: 100% (13/13), done.


In [3]:
# Reading in the data
# Variables
gdp_df = pd.read_csv('/content/DS4002-Project2/OUTPUTS/gdp_df.csv')
homicides_unclean_df = pd.read_csv('/content/DS4002-Project2/DATA/Homicides-Per-100000.csv')
edu_attainment_df = pd.read_csv('/content/DS4002-Project2/DATA/attainment_and_fertility.csv')
infant_mortality_df = pd.read_csv('/content/DS4002-Project2/DATA/infant_mortality.csv')
life_expectancy_df = pd.read_csv('/content/DS4002-Project2/DATA/life-expectancy.csv')

# Outcome
fert_rate_df = pd.read_csv('/content/DS4002-Project2/DATA/children-born-per-woman.csv')

###Data Cleaning/Organizing


In [24]:
# Reorganizing homicides_df
causes = ["Police conflict and executions", " Conflict and terrorism", "Interpersonal violence"]

# Summing types of homicide data together into cleaned column
homicides_df = (homicides_unclean_df[homicides_unclean_df["cause"].isin(causes)].groupby(["year", "location"], as_index = False).agg({"val": "sum"}))

homicides_df["cause"] = "Homicide"

# Renaming columns
homicides_df.rename(columns={"year": "Year", "location": "Country", "val": "Homicide_Rate", "cause": "Cause"}, inplace=True)

# Renaming countries
countries = {"Bolivarian Republic of Venezuela" : "Venezuela", "Federal Democratic Republic of Ethiopia": "Ethiopia",
             "Islamic Republic of Iran": "Iran", "Kingdom of Norway": "Norway", "United States of America": "United States",
             "Global": "World", "People's Republic of Bangladesh": "Bangladesh", "Republic of Chile": "Chile",
             "Republic of Korea": "South Korea", "Republic of Latvia": "Latvia", "Republic of Namibia": "Namibia",
             "Republic of Nauru": "Nauru", "Republic of Nicaragua": "Nicaragua", "Republic of Panama": "Panama",
             "Republic of the Niger": "Niger"}

homicides_df["Country"] = homicides_df["Country"].replace(countries)
hom = homicides_df.copy()
homicides_df = hom[['Country', 'Year', 'Homicide_Rate', 'Cause']]

# See what it looks like now
homicides_df.head()

Unnamed: 0,Country,Year,Homicide_Rate,Cause
0,Venezuela,1980,13.108855,Homicide
1,Ethiopia,1980,18.061589,Homicide
2,World,1980,6.474866,Homicide
3,Iran,1980,3.444738,Homicide
4,Norway,1980,1.364974,Homicide


In [None]:
# Exporting cleaned homicides data if needed
homicides_df.to_csv('gdp_df.csv', index=False)

In [10]:
# Organizing edu attainment data
edu_attainment_df.drop(columns=["Fertility Rate", "Population"], inplace = True)
edu_attainment_df.rename(columns={"Average Years of Education Ages 15-64": "Avg_Yrs_Education"}, inplace = True)

In [None]:
# Exporting cleaned edu attainment if needed
edu_attainment_df.to_csv('edu_attainment_df.csv', index=False)

In [11]:
# Organizing fert rate data
fert_rate_df.drop(columns=["Code"], inplace = True)
fert_rate_df.rename(columns={"Entity": "Country", "fertility_rate_hist": "Fertility_Rate"}, inplace = True)

In [None]:
# Exporting cleaned fert rate data if needed
fert_rate_df.to_csv('fert_rate_df.csv', index=False)

In [12]:
# Organizing infant mortality data
infant_mortality_df.drop(columns=["Code"], inplace = True)
infant_mortality_df.rename(columns={"Entity": "Country", "Observation value - Indicator: Infant mortality rate - Sex: Total - Wealth quintile: Total - Unit of measure: Deaths per 100 live births": "Infant_Mortality"}, inplace = True)

In [None]:
# Exporting cleaned infant mortality if needed
infant_mortality_df.to_csv('infant_mortality_df.csv', index=False)

In [13]:
# Organizing life expectancy data
life_expectancy_df.drop(columns=["Code"], inplace = True)
life_expectancy_df.rename(columns={"Entity": "Country", "life_expectancy_0__sex_total__age_0": "Life_Expectancy"}, inplace = True)

In [None]:
# Exporting cleaned life expectancy if needed
life_expectancy_df.to_csv('life_expectancy_df.csv', index=False)

In [19]:
# Organizing gdp data
countries2 = {"Korea, Rep.": "South Korea", "Venezuela, RB": "Venezuela", "Iran, Islamic Rep.": "Iran"}
gdp_df["Country"] = gdp_df["Country"].replace(countries2)


###Data Combining


In [25]:
# Making new df with all vars and the fertility rate
del fr_analysis_df

fr_analysis_df = gdp_df.merge(homicides_df, on=["Year", "Country"], how="left")
fr_analysis_df = fr_analysis_df.merge(fert_rate_df, on=["Year", "Country"], how="left")
fr_analysis_df = fr_analysis_df.merge(edu_attainment_df, on=["Year", "Country"], how="left")
fr_analysis_df = fr_analysis_df.merge(infant_mortality_df, on=["Year", "Country"], how="left")
fr_analysis_df = fr_analysis_df.merge(life_expectancy_df, on=["Year", "Country"], how="left")

fr_analysis_df.head()

Unnamed: 0,Country,Year,GDP_Per_Capita,Homicide_Rate,Cause,Fertility_Rate,Avg_Yrs_Education,Infant_Mortality,Life_Expectancy
0,Chile,1974,1542.925527,,,3.293,,5.91998,65.0853
1,Bangladesh,1974,164.9863936,,,6.778,,15.068873,49.5885
2,Nauru,1974,5199.264975,,,4.067,,,58.31
3,Nicaragua,1974,556.8661225,,,6.521,,10.367213,55.4013
4,Norway,1974,6811.527337,,,2.134,,1.103574,74.7635


In [26]:
fr_analysis_df.drop(columns=["Cause"], inplace = True)
fr_analysis_df.head(30)

Unnamed: 0,Country,Year,GDP_Per_Capita,Homicide_Rate,Fertility_Rate,Avg_Yrs_Education,Infant_Mortality,Life_Expectancy
0,Chile,1974,1542.925527,,3.293,,5.91998,65.0853
1,Bangladesh,1974,164.9863936,,6.778,,15.068873,49.5885
2,Nauru,1974,5199.264975,,4.067,,,58.31
3,Nicaragua,1974,556.8661225,,6.521,,10.367213,55.4013
4,Norway,1974,6811.527337,,2.134,,1.103574,74.7635
5,Namibia,1974,..,,6.552,,6.404083,56.943
6,New Zealand,1974,4610.570426,,2.574,,1.536911,71.9405
7,United States,1974,7225.69136,,1.872,,1.670739,71.9592
8,Niger,1974,198.2891653,,7.539,,13.793561,36.1946
9,South Korea,1974,563.3524758,,3.555,,3.819736,64.2389
