In [1]:
# Importing necessary packages
import pandas as pd
import numpy as np
import re

In [2]:
# Cloning the project repo into the current working directory.
# The data-loading cell reads from /content/DS4002-Project2, so this needs to be
# run with /content as the working directory.
! git clone https://github.com/vrhughes/DS4002-Project2

Cloning into 'DS4002-Project2'...
remote: Enumerating objects: 63, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 63 (delta 13), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (63/63), 133.64 KiB | 5.81 MiB/s, done.
Resolving deltas: 100% (13/13), done.


In [3]:
# Reading in the data
# Root of the cloned repository. Every input path below is built from it, so
# running somewhere other than /content only requires changing this one line.
REPO_DIR = '/content/DS4002-Project2'

# Variables (predictors)
gdp_df = pd.read_csv(f'{REPO_DIR}/OUTPUTS/gdp_df.csv')
homicides_unclean_df = pd.read_csv(f'{REPO_DIR}/DATA/Homicides-Per-100000.csv')
edu_attainment_df = pd.read_csv(f'{REPO_DIR}/DATA/attainment_and_fertility.csv')
infant_mortality_df = pd.read_csv(f'{REPO_DIR}/DATA/infant_mortality.csv')
life_expectancy_df = pd.read_csv(f'{REPO_DIR}/DATA/life-expectancy.csv')

# Outcome
fert_rate_df = pd.read_csv(f'{REPO_DIR}/DATA/children-born-per-woman.csv')

### Data Cleaning/Organizing


In [24]:
# Reorganizing homicides_df
# Cause labels whose values are added together into one homicide figure.
# NOTE(review): " Conflict and terrorism" begins with a space and isin() matches
# exactly -- confirm the raw "cause" column really carries that leading space.
causes = [
    "Police conflict and executions",
    " Conflict and terrorism",
    "Interpersonal violence",
]

# Long location names in the raw data -> short names used in the Country column
countries = {
    "Bolivarian Republic of Venezuela": "Venezuela",
    "Federal Democratic Republic of Ethiopia": "Ethiopia",
    "Islamic Republic of Iran": "Iran",
    "Kingdom of Norway": "Norway",
    "United States of America": "United States",
    "Global": "World",
    "People's Republic of Bangladesh": "Bangladesh",
    "Republic of Chile": "Chile",
    "Republic of Korea": "South Korea",
    "Republic of Latvia": "Latvia",
    "Republic of Namibia": "Namibia",
    "Republic of Nauru": "Nauru",
    "Republic of Nicaragua": "Nicaragua",
    "Republic of Panama": "Panama",
    "Republic of the Niger": "Niger",
}

# Keep only the selected causes, total "val" per (year, location), label the
# rows as "Homicide", then switch to the cleaned column names
hom = (
    homicides_unclean_df[homicides_unclean_df["cause"].isin(causes)]
    .groupby(["year", "location"], as_index=False)
    .agg({"val": "sum"})
    .assign(cause="Homicide")
    .rename(columns={"year": "Year", "location": "Country", "val": "Homicide_Rate", "cause": "Cause"})
)

# Shorten the country names
hom["Country"] = hom["Country"].replace(countries)

# Final column order
homicides_df = hom[['Country', 'Year', 'Homicide_Rate', 'Cause']]

# See what it looks like now
homicides_df.head()

Unnamed: 0,Country,Year,Homicide_Rate,Cause
0,Venezuela,1980,13.108855,Homicide
1,Ethiopia,1980,18.061589,Homicide
2,World,1980,6.474866,Homicide
3,Iran,1980,3.444738,Homicide
4,Norway,1980,1.364974,Homicide


In [34]:
# Optional: write the cleaned homicide table to the working directory (row index omitted)
homicides_df.to_csv(path_or_buf='homicides_df.csv', index=False)

In [10]:
# Organizing edu attainment data
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# re-runnable: a second run no longer raises KeyError for columns already removed.
edu_attainment_df = (
    edu_attainment_df
    .drop(columns=["Fertility Rate", "Population"], errors="ignore")
    .rename(columns={"Average Years of Education Ages 15-64": "Avg_Yrs_Education"})
)

In [28]:
# Optional: write the cleaned education-attainment table to the working directory (row index omitted)
edu_attainment_df.to_csv(path_or_buf='edu_attainment_df.csv', index=False)

In [11]:
# Organizing fert rate data
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# re-runnable: a second run no longer raises KeyError because "Code" is already gone.
fert_rate_df = (
    fert_rate_df
    .drop(columns=["Code"], errors="ignore")
    .rename(columns={"Entity": "Country", "fertility_rate_hist": "Fertility_Rate"})
)

In [29]:
# Optional: write the cleaned fertility-rate table to the working directory (row index omitted)
fert_rate_df.to_csv(path_or_buf='fert_rate_df.csv', index=False)

In [12]:
# Organizing infant mortality data
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# re-runnable: a second run no longer raises KeyError because "Code" is already gone.
infant_mortality_df = (
    infant_mortality_df
    .drop(columns=["Code"], errors="ignore")
    .rename(columns={
        "Entity": "Country",
        "Observation value - Indicator: Infant mortality rate - Sex: Total - Wealth quintile: Total - Unit of measure: Deaths per 100 live births": "Infant_Mortality",
    })
)

In [30]:
# Optional: write the cleaned infant-mortality table to the working directory (row index omitted)
infant_mortality_df.to_csv(path_or_buf='infant_mortality_df.csv', index=False)

In [13]:
# Organizing life expectancy data
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# re-runnable: a second run no longer raises KeyError because "Code" is already gone.
life_expectancy_df = (
    life_expectancy_df
    .drop(columns=["Code"], errors="ignore")
    .rename(columns={"Entity": "Country", "life_expectancy_0__sex_total__age_0": "Life_Expectancy"})
)

In [31]:
# Optional: write the cleaned life-expectancy table to the working directory (row index omitted)
life_expectancy_df.to_csv(path_or_buf='life_expectancy_df.csv', index=False)

In [19]:
# Organizing gdp data
# Country labels in the GDP table -> the short names used by the other tables
countries2 = {
    "Korea, Rep.": "South Korea",
    "Venezuela, RB": "Venezuela",
    "Iran, Islamic Rep.": "Iran",
}
gdp_df["Country"] = gdp_df["Country"].replace(to_replace=countries2)


In [35]:
# Optional: write the GDP table (with renamed countries) to the working directory (row index omitted)
gdp_df.to_csv(path_or_buf='gdp_df.csv', index=False)

AttributeError: 'Series' object has no attribute 'type'

### Data Combining


In [50]:
# Creating fertility rate analysis df by merging
# fra_df is rebuilt from fert_rate_df every time this cell runs, so nothing has to
# be deleted first (the former `del fra_df` raised NameError on a fresh kernel).
merge_keys = ["Year", "Country"]

# Left joins keep every fertility-rate row; predictors with no matching
# Year/Country are left as NaN.
fra_df = (
    fert_rate_df
    .merge(gdp_df, on=merge_keys, how="left")
    .merge(homicides_df, on=merge_keys, how="left")
    .merge(edu_attainment_df, on=merge_keys, how="left")
    .merge(infant_mortality_df, on=merge_keys, how="left")
    .merge(life_expectancy_df, on=merge_keys, how="left")
)

# See work
fra_df.head()

Unnamed: 0,Country,Year,Fertility_Rate,GDP_Per_Capita,Homicide_Rate,Cause,Avg_Yrs_Education,Infant_Mortality,Life_Expectancy
0,Bangladesh,1953,6.421,,,,,21.306301,39.4449
1,Bangladesh,1954,6.472,,,,,20.684288,40.3718
2,Bangladesh,1955,6.505,,,,0.28,20.06906,41.0664
3,Bangladesh,1956,6.537,,,,,19.500477,41.6034
4,Bangladesh,1957,6.576,,,,,18.959303,42.5748


In [51]:
# Dropping extra col and seeing more of the data
# errors="ignore" keeps this cell re-runnable once "Cause" has already been removed
# (the in-place drop raised KeyError on a second run).
fra_df = fra_df.drop(columns=["Cause"], errors="ignore")
fra_df.head(30)

Unnamed: 0,Country,Year,Fertility_Rate,GDP_Per_Capita,Homicide_Rate,Avg_Yrs_Education,Infant_Mortality,Life_Expectancy
0,Bangladesh,1953,6.421,,,,21.306301,39.4449
1,Bangladesh,1954,6.472,,,,20.684288,40.3718
2,Bangladesh,1955,6.505,,,0.28,20.06906,41.0664
3,Bangladesh,1956,6.537,,,,19.500477,41.6034
4,Bangladesh,1957,6.576,,,,18.959303,42.5748
5,Bangladesh,1958,6.608,,,,18.4495,43.0829
6,Bangladesh,1959,6.678,,,,17.967253,43.7158
7,Bangladesh,1960,6.742,,,0.33,17.511889,43.9795
8,Bangladesh,1961,6.78,,,,17.081934,44.8874
9,Bangladesh,1962,6.806,,,,16.684986,45.7651


In [52]:
# Write the merged fertility-rate analysis table to the working directory (row index omitted)
fra_df.to_csv(path_or_buf='fr_analysis_df.csv', index=False)