# 2D DDW

## Imports

In [5639]:
# import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Data Extraction and Cleaning

For our project, the 6 factors that we are using to predict food security are:
1. GDP per capita adjusted for PPP
2. Agricultural land per capita
3. Percentage of population with basic water service
4. Percentage of population with eating disorder
5. Percentage of population employed in agriculture forestry fishery
6. co2 per agricultural land area

As such, we have obtained the following raw data for all the countries:
1. 

The code below extracts the relevant data from each file and processses it to match our 6 factors.

### Extract food supply data for 2018

The following code extracts the total food supply for each country in 2018. This data is extracted from "DDW_Food Supply.csv".

In [5640]:
# read csv
df_food_supply = pd.read_csv("DDW_Food Supply.csv")
# extract values only when item is "Grand Total"
df_grand_total = df_food_supply[(df_food_supply["Item"] == "Grand Total")]
# extract year 2017 values
df2017_food_supply = df_grand_total[(df_grand_total["Year"] == 2017)]
# extract year, country and food supply
df2017_food_supply = df2017_food_supply[["Year", "Area", "Value"]]
df2017_food_supply = df2017_food_supply.rename(columns = {"Area": "Country", "Value":"Food_supply"})
# set index from 0
df2017_food_supply = df2017_food_supply.reset_index(drop=True)
# dataframe
df2017_food_supply

Unnamed: 0,Year,Country,Food_supply
0,2017,Afghanistan,2303
1,2017,Albania,3326
2,2017,Algeria,3383
3,2017,Angola,2441
4,2017,Antigua and Barbuda,2446
...,...,...,...
209,2017,Least Developed Countries,2402
210,2017,Land Locked Developing Countries,2539
211,2017,Small Island Developing States,2685
212,2017,Low Income Food Deficit Countries,2505


##### Finding the unique countries in food supply data

In [5641]:
df_food_supply_countries = df2017_food_supply.Country.unique()

### Extract minimum calorie intake data for 2018

In [5642]:
df_min_cal_intake = pd.read_csv("DDW_Min Cal Intake.csv")
df_min_cal_intake = df_min_cal_intake[df_min_cal_intake["Value"].notna()]
df2017_min_cal = df_min_cal_intake[(df_min_cal_intake["Year"] == 2017)]
df2017_min_cal = df2017_min_cal[["Year", "Area", "Value"]]
df2017_min_cal = df2017_min_cal.rename(columns = {"Area": "Country", "Value":"Mininum_calorie_intake"})
df2017_min_cal = df2017_min_cal.reset_index(drop=True)
df2017_min_cal

Unnamed: 0,Year,Country,Mininum_calorie_intake
0,2017,Afghanistan,1676.0
1,2017,Albania,1911.0
2,2017,Algeria,1781.0
3,2017,Angola,1659.0
4,2017,Antigua and Barbuda,1888.0
...,...,...,...
183,2017,Vanuatu,1695.0
184,2017,Venezuela (Bolivarian Republic of),1817.0
185,2017,Viet Nam,1785.0
186,2017,Yemen,1703.0


##### Finding the unique countries in minimum calorie intake data

In [5643]:
df_min_cal_intake_countries = df2017_min_cal.Country.unique()

### Extract GDP per capita data for 2018

In [5644]:
df_GDP = pd.read_csv("DDW_GDP per capita adjusted.csv")
df_GDP = df_GDP[(df_GDP["Value"].notna())]
df2017_GDP = df_GDP[(df_GDP["Year"])==2017]
df2017_GDP = df2017_GDP[["Year","Area","Value"]]
df2017_GDP = df2017_GDP.rename(columns = {"Area": "Country", "Value":"GDP_per_capita_adjusted_for_PPP"})
df2017_GDP = df2017_GDP.reset_index(drop=True)
df2017_GDP

Unnamed: 0,Year,Country,GDP_per_capita_adjusted_for_PPP
0,2017,Afghanistan,2058.4
1,2017,Albania,12771.0
2,2017,Algeria,11737.4
3,2017,Angola,7310.9
4,2017,Antigua and Barbuda,19840.3
...,...,...,...
182,2017,Uzbekistan,6840.7
183,2017,Vanuatu,3081.5
184,2017,Viet Nam,8996.4
185,2017,Zambia,3485.0


#### List of unique countries for GDP per capita data

In [5645]:
df_GDP_countries = df2017_GDP.Country.unique()

### Extract population data for 2018

In [5646]:
df_pop = pd.read_csv("DDW_Population.csv")
# df_pop = df_pop.dropna()
df2017_population = df_pop[(df_pop["year"]==2017)]
df2017_population = df2017_population[df2017_population["population"].notna()]
df2017_population = df2017_population[["year", "country", "population"]]
df2017_population = df2017_population.rename(columns = {"year": "Year", "country": "Country", "population":"Population"})
df2017_population = df2017_population.reset_index(drop=True)
df2017_population

Unnamed: 0,Year,Country,Population
0,2017,Afghanistan,3.629611e+07
1,2017,Africa,1.244222e+09
2,2017,Albania,2.884169e+06
3,2017,Algeria,4.138918e+07
4,2017,Andorra,7.699700e+04
...,...,...,...
226,2017,Wallis and Futuna,1.189400e+04
227,2017,World,7.548173e+09
228,2017,Yemen,2.783481e+07
229,2017,Zambia,1.685361e+07


#### Getting unique countries for population data

In [5647]:
df_pop_coutries = df2017_population.Country.unique()

### Extract land area used for agriculture data for 2018

In [5648]:
df_agriculture = pd.read_csv("DDW_Agricultural Land.csv")
# remove rows with no value (NaN)
df2017_agriculture = df_agriculture[["Country Name","2017"]]
df2017_agriculture = df2017_agriculture[df2017_agriculture["2017"].notna()]
df2017_agriculture = df2017_agriculture.rename(columns = {"Country Name": "Country", "2017":"Agriculture_land_area"})
df2017_agriculture = df2017_agriculture.reset_index(drop=True)
df2017_agriculture


Unnamed: 0,Country,Agriculture_land_area
0,Aruba,20.00
1,Africa Eastern and Southern,6538552.75
2,Afghanistan,379100.00
3,Africa Western and Central,3589797.00
4,Angola,563974.30
...,...,...
252,Samoa,620.00
253,"Yemen, Rep.",234520.00
254,South Africa,963410.00
255,Zambia,238360.00


#### Getting unique countries for Agriculture data

In [5649]:
df_agriculture_countries = df2017_agriculture.Country.unique()

### Extract CO2 (in million metric tonnes) data for 2018

In [5650]:
df_co2 = pd.read_csv("DDW_CO2.csv")
df2017_co2 = df_co2[(df_co2["year"])==2017]
df2017_co2 = df2017_co2[df2017_co2["co2"].notna()]
df2017_co2 = df2017_co2[["year","country","co2"]]
df2017_co2 = df2017_co2.rename(columns = {"year": "Year", "country": "Country"})
df2017_co2 = df2017_co2.reset_index(drop=True)
df2017_co2

Unnamed: 0,Year,Country,co2
0,2017,Afghanistan,6.860
1,2017,Africa,1384.372
2,2017,Albania,5.302
3,2017,Algeria,154.936
4,2017,Andorra,0.465
...,...,...,...
232,2017,Wallis and Futuna,0.026
233,2017,World,35925.738
234,2017,Yemen,9.951
235,2017,Zambia,6.517


#### Getting unique countries for $CO_{2}$ data

In [5651]:
df_co2_countries = df2017_co2.Country.unique()

### Extract basic water drinking services data for 2018

In [5652]:
df_water = pd.read_csv("DDW_Basic Water Drinking Services.csv")
df_water = df_water[df_water["Value"].notna()]
df2017_water = df_water[(df_water["Year"] == 2017)]
df2017_water = df2017_water[["Year", "Area", "Value"]]
df2017_water = df2017_water.rename(columns = {"Area": "Country", "Value": "Percentage_of_population_with_basic_water_service"})
df2017_water = df2017_water.reset_index(drop=True)
df2017_water


Unnamed: 0,Year,Country,Percentage_of_population_with_basic_water_service
0,2017,Afghanistan,66.8
1,2017,Albania,94.1
2,2017,Algeria,93.8
3,2017,American Samoa,99.0
4,2017,Andorra,99.0
...,...,...,...
232,2017,Small Island Developing States,83.1
233,2017,Low income economies,56.5
234,2017,Lower-middle-income economies,86.5
235,2017,High-income economies,99.0


#### Getting unique countries for basic water drinking services data

In [5653]:
df_water_countries = df2017_water.Country.unique()

### Extract Eating Disorder data for 2018

In [5654]:
df_eating_disorder = pd.read_csv("DDW_Eating Disorder.csv")
df_eating_disorder = df_eating_disorder[df_eating_disorder["Prevalence - Eating disorders - Sex: Both - Age: Age-standardized (Percent)"].notna()]
df_eating_disorder = df_eating_disorder.rename(columns = {"Entity": "Country", "Prevalence - Eating disorders - Sex: Both - Age: Age-standardized (Percent)":"Prevalence"})
df2017_eating_disorder = df_eating_disorder[(df_eating_disorder["Year"] == 2017)]
df2017_eating_disorder = df2017_eating_disorder[["Year", "Country", "Prevalence"]]
df2017_eating_disorder = df2017_eating_disorder.rename(columns = {"Prevalence": "Percentage_of_population_with_eating_disorder"})
df2017_eating_disorder = df2017_eating_disorder.reset_index(drop=True)
df2017_eating_disorder

Unnamed: 0,Year,Country,Percentage_of_population_with_eating_disorder
0,2017,Afghanistan,0.12
1,2017,African Region (WHO),0.11
2,2017,Albania,0.14
3,2017,Algeria,0.22
4,2017,American Samoa,0.14
...,...,...,...
223,2017,World Bank Lower Middle Income,0.13
224,2017,World Bank Upper Middle Income,0.17
225,2017,Yemen,0.14
226,2017,Zambia,0.12


#### Getting unique countries for eating disorder data

In [5655]:
df_eating_disorder_countries = df2017_eating_disorder.Country.unique()

### Extract number of people employed in agriculture data for 2018

In [5656]:
df_employment = pd.read_csv("DDW_Employment In Agriculture.csv")
df_employment = df_employment[df_employment["Value"].notna()]
df2017_employement = df_employment[(df_employment["Year"] == 2017)]
df2017_employement = df2017_employement[["Year", "Area", "Value"]]
df2017_employement["Value"] = df2017_employement["Value"]*1000
df2017_employement = df2017_employement.rename(columns = {"Area": "Country", "Value":"Number_of_people_employed_in_agriculture_forestry_fishery"})
df2017_employement = df2017_employement.reset_index(drop=True)
df2017_employement

Unnamed: 0,Year,Country,Number_of_people_employed_in_agriculture_forestry_fishery
0,2017,Afghanistan,2740235.0
1,2017,Albania,453779.0
2,2017,Algeria,1102072.0
3,2017,Argentina,6840.0
4,2017,Armenia,317111.0
...,...,...,...
111,2017,Uruguay,143717.0
112,2017,Uzbekistan,3671300.0
113,2017,Venezuela (Bolivarian Republic of),1166489.0
114,2017,Viet Nam,21564822.0


#### Getting unique countries for population employed

In [5657]:
df_employment_countries = df2017_employement.Country.unique()

### Create list of common countries

In [5658]:
countries = []
for i in df_food_supply_countries:
    if i in df_min_cal_intake_countries and i in df_GDP_countries and i in df_pop_coutries and i in df_agriculture_countries and i in df_co2_countries and i in df_water_countries and i in df_eating_disorder_countries and i in df_employment_countries:
        countries.append(i)
print("Countries", countries)
print("Length of list", len(countries))

Countries ['Afghanistan', 'Albania', 'Algeria', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Burundi', 'Cambodia', 'Canada', 'Chile', 'Colombia', 'Costa Rica', 'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'El Salvador', 'Estonia', 'Finland', 'France', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Honduras', 'Hungary', 'Iceland', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kuwait', 'Latvia', 'Lithuania', 'Luxembourg', 'Malaysia', 'Mali', 'Malta', 'Mauritania', 'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Myanmar', 'Nepal', 'Netherlands', 'New Zealand', 'North Macedonia', 'Norway', 'Panama', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Rwanda', 'Samoa', 'Serbia', 'Seychelles', 'Slovenia', 'South Africa', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland', 'Thailand', 'Tog

### With the list of common countries, we are now able to continue combining our dataframes ...

### Calculate food supply to mininum calorie intake ratio 
done by finding (food supply)/(mininum calorie intake)

In [5659]:
df2017_food_supply = df2017_food_supply[df2017_food_supply["Country"].isin(countries)]
df2017_min_cal = df2017_min_cal[df2017_min_cal["Country"].isin(countries)]

df2017_y_value = df2017_food_supply.copy()
df2017_y_value["Mininum_calorie_intake"] = list(df2017_min_cal["Mininum_calorie_intake"])
df2017_y_value["y_ratio"] = df2017_y_value["Food_supply"]/df2017_y_value["Mininum_calorie_intake"]
df2017_y_value = df2017_y_value.reset_index(drop=True)
df2017_y_value

Unnamed: 0,Year,Country,Food_supply,Mininum_calorie_intake,y_ratio
0,2017,Afghanistan,2303,1676.0,1.374105
1,2017,Albania,3326,1911.0,1.740450
2,2017,Algeria,3383,1781.0,1.899495
3,2017,Armenia,3072,1875.0,1.638400
4,2017,Australia,3404,1911.0,1.781266
...,...,...,...,...,...
85,2017,Uganda,2030,1693.0,1.199055
86,2017,Ukraine,3062,1908.0,1.604822
87,2017,United Arab Emirates,3074,2045.0,1.503178
88,2017,Uruguay,3158,1861.0,1.696937


### Calculate Agriculture Land per capita (in $\textrm{m}^2$ per capita)

In [5660]:
df2017_agriculture = df2017_agriculture[df2017_agriculture["Country"].isin(countries)]
df2017_population = df2017_population[df2017_population["Country"].isin(countries)]

df2017_agriculture_per_pop = df2017_agriculture.copy()
df2017_agriculture_per_pop["Total_Population"] = list(df2017_population["Population"])
df2017_agriculture_per_pop["Agricultural_Land_Per_Capita"] = (df2017_agriculture_per_pop["Agriculture_land_area"]/df2017_agriculture_per_pop["Total_Population"])*1000000
df2017_agriculture_per_pop = df2017_agriculture_per_pop.reset_index(drop=True)
df2017_agriculture_per_pop

Unnamed: 0,Country,Agriculture_land_area,Total_Population,Agricultural_Land_Per_Capita
0,Afghanistan,379100.00,36296108.0,10444.646021
1,Albania,11742.81,2884169.0,4071.470847
2,United Arab Emirates,3838.00,41389176.0,92.729558
3,Armenia,16761.00,2944789.0,5691.749052
4,Australia,3718370.00,24584620.0,151247.812657
...,...,...,...,...
85,Ukraine,414890.00,41166588.0,10078.318854
86,Uruguay,142229.00,44487708.0,3197.040405
87,Uzbekistan,255332.00,9487206.0,26913.297761
88,Samoa,620.00,3436645.0,180.408509


### Calculate percentage of population employed in the agriculture industry

In [5661]:
df2017_employement = df2017_employement[df2017_employement["Country"].isin(countries)]
df2017_population = df2017_population[df2017_population["Country"].isin(countries)]

df2017_percentage_employed = df2017_employement.copy()
df2017_percentage_employed["Total_Population"] = list(df2017_population["Population"])
df2017_percentage_employed["Percentage_of_population_employed_in_agriculture_forestry_fishery"] = (df2017_percentage_employed["Number_of_people_employed_in_agriculture_forestry_fishery"]/df2017_percentage_employed["Total_Population"])*100
df2017_percentage_employed = df2017_percentage_employed.reset_index(drop=True)
df2017_percentage_employed

Unnamed: 0,Year,Country,Number_of_people_employed_in_agriculture_forestry_fishery,Total_Population,Percentage_of_population_employed_in_agriculture_forestry_fishery
0,2017,Afghanistan,2740235.0,36296108.0,7.549666
1,2017,Albania,453779.0,2884169.0,15.733440
2,2017,Algeria,1102072.0,41389176.0,2.662706
3,2017,Armenia,317111.0,2944789.0,10.768547
4,2017,Australia,318393.0,24584620.0,1.295090
...,...,...,...,...,...
85,2017,Uganda,4095242.0,41166588.0,9.947975
86,2017,Ukraine,2489400.0,44487708.0,5.595703
87,2017,United Arab Emirates,65857.0,9487206.0,0.694166
88,2017,Uruguay,143717.0,3436645.0,4.181898


### Calculate co2 per agricultural land (in kg per m^2)

In [5662]:
df2017_co2 = df2017_co2[df2017_co2["Country"].isin(countries)]
df2017_agriculture = df2017_agriculture[df2017_agriculture["Country"].isin(countries)]


df2017_co2_per_land = df2017_co2.copy()
df2017_co2_per_land["Agricultural_land"] = list(df2017_agriculture["Agriculture_land_area"])
df2017_co2_per_land["co2_per_agricultural_land_area"] = (df2017_co2_per_land["co2"]/df2017_co2_per_land["Agricultural_land"])*1000
df2017_co2_per_land = df2017_co2_per_land.reset_index(drop=True)
df2017_co2_per_land


Unnamed: 0,Year,Country,co2,Agricultural_land,co2_per_agricultural_land_area
0,2017,Afghanistan,6.860,379100.00,0.018095
1,2017,Albania,5.302,11742.81,0.451510
2,2017,Algeria,154.936,3838.00,40.368942
3,2017,Armenia,5.537,16761.00,0.330350
4,2017,Australia,414.751,3718370.00,0.111541
...,...,...,...,...,...
85,2017,Uganda,5.374,414890.00,0.012953
86,2017,Ukraine,223.085,142229.00,1.568492
87,2017,United Arab Emirates,168.831,255332.00,0.661221
88,2017,Uruguay,6.163,620.00,9.940323


### Filtering the rest of the dataframes needed based on the list of common countries

In [5663]:
df2017_GDP = df2017_GDP[df2017_GDP["Country"].isin(countries)]
df2017_GDP = df2017_GDP.reset_index(drop=True)
df2017_water = df2017_water[df2017_water["Country"].isin(countries)]
df2017_water = df2017_water.reset_index(drop=True)
df2017_eating_disorder = df2017_eating_disorder[df2017_eating_disorder["Country"].isin(countries)]
df2017_eating_disorder = df2017_eating_disorder.reset_index(drop=True)


### Combining all the variables into 1 data frame, filtering the data based on the countries available in countries list

In [5664]:
df2017_combined = df2017_y_value.loc[:, ["Country", "y_ratio"]]
df2017_combined["GDP_per_capita_adjusted_for_PPP"] = df2017_GDP.loc[:, "GDP_per_capita_adjusted_for_PPP"]
df2017_combined["Agricultural_land_per_capita"] = df2017_agriculture_per_pop.loc[:, "Agricultural_Land_Per_Capita"]
df2017_combined["Percentage_of_population_with_basic_water_service"] = df2017_water.loc[:, "Percentage_of_population_with_basic_water_service"]
df2017_combined["Percentage_of_population_with_eating_disorder"] = df2017_eating_disorder.loc[:, "Percentage_of_population_with_eating_disorder"]
df2017_combined["Percentage_of_population_employed_in_agriculture_forestry_fishery"] = df2017_percentage_employed.loc[:, "Percentage_of_population_employed_in_agriculture_forestry_fishery"]
df2017_combined["co2_per_agricultural_land_area"] = df2017_co2_per_land.loc[:, "co2_per_agricultural_land_area"]
df2017_combined = df2017_combined.reset_index(drop=True)
df2017_combined

Unnamed: 0,Country,y_ratio,GDP_per_capita_adjusted_for_PPP,Agricultural_land_per_capita,Percentage_of_population_with_basic_water_service,Percentage_of_population_with_eating_disorder,Percentage_of_population_employed_in_agriculture_forestry_fishery,co2_per_agricultural_land_area
0,Afghanistan,1.374105,2058.4,10444.646021,66.8,0.12,7.549666,0.018095
1,Albania,1.740450,12771.0,4071.470847,94.1,0.14,15.733440,0.451510
2,Algeria,1.899495,11737.4,92.729558,93.8,0.22,2.662706,40.368942
3,Armenia,1.638400,12115.1,5691.749052,99.0,0.13,10.768547,0.330350
4,Australia,1.781266,48398.5,151247.812657,99.0,1.11,1.295090,0.111541
...,...,...,...,...,...,...,...,...
85,Uganda,1.199055,2074.7,10078.318854,51.0,0.10,9.947975,0.012953
86,Ukraine,1.604822,11860.6,3197.040405,93.8,0.13,5.595703,1.568492
87,United Arab Emirates,1.503178,67183.6,26913.297761,99.0,0.31,0.694166,0.661221
88,Uruguay,1.696937,23009.9,180.408509,99.0,0.38,4.181898,9.940323


In [5665]:
#df2017_combined.to_csv("df2017_combined_v1.csv")

In [5666]:
f2 = df2017_combined.Country.unique()
print(f2)

['Afghanistan' 'Albania' 'Algeria' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Bosnia and Herzegovina' 'Brazil' 'Bulgaria' 'Burundi' 'Cambodia'
 'Canada' 'Chile' 'Colombia' 'Costa Rica' 'Cyprus' 'Czechia' 'Denmark'
 'Djibouti' 'Dominican Republic' 'Ecuador' 'El Salvador' 'Estonia'
 'Finland' 'France' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Guatemala'
 'Honduras' 'Hungary' 'Iceland' 'Indonesia' 'Ireland' 'Israel' 'Italy'
 'Jamaica' 'Japan' 'Jordan' 'Kazakhstan' 'Kuwait' 'Latvia' 'Lithuania'
 'Luxembourg' 'Malaysia' 'Mali' 'Malta' 'Mauritania' 'Mauritius' 'Mexico'
 'Mongolia' 'Montenegro' 'Myanmar' 'Nepal' 'Netherlands' 'New Zealand'
 'North Macedonia' 'Norway' 'Panama' 'Paraguay' 'Peru' 'Philippines'
 'Poland' 'Portugal' 'Romania' 'Rwanda' 'Samoa' 'Serbia' 'Seychelles'
 'Slovenia' 'South Africa' 'Spain' 'Sri Lanka' 'Sweden' 'Switzerland'
 'Thailand' 'Togo' 'Tunisia' 'Uganda' 'Ukraine' 'United Arab Emirates'
 'Uruguay' 'Uzbekistan']


### Remove outliers and normalise data using min-max normalisation

In [5667]:
def identify_outlier(dataframe, i):
    q3 = dataframe[i].quantile(q=0.75)
    q1 = dataframe[i].quantile(q=0.25)
    IQR = q3 - q1
    upper_outlier = q3 + 1.5*IQR
    lower_outlier = q1 - 1.5*IQR
    return upper_outlier, lower_outlier

def normalization(dfin):
    dfout = dfin.copy()
    dfout = (dfin - dfin.min(axis = 0)) / (dfin.max(axis = 0) - dfin.min(axis = 0))
    return dfout

def standardization(dfin):
    dfout = (dfin - dfin.mean(axis=0))/dfin.std(axis=0)
    return dfout

In [5668]:
def drop_outlier_value(df, column, outlier_value):
    return df[column] <= outlier_value[0] and df[column] >= outlier_value[1]

## Before standardisation (keith clean)

In [5669]:
#df2017_combined_1 = df2017_combined.loc[(drop_outlier_value(df2017_combined, "GDP_per_capita_adjusted_for_PPP", outlier_value)), :]

df2017_combined_1 = df2017_combined.loc[(df2017_combined["GDP_per_capita_adjusted_for_PPP"] <= identify_outlier(df2017_combined, "GDP_per_capita_adjusted_for_PPP")[0]) &(df2017_combined["GDP_per_capita_adjusted_for_PPP"] >= identify_outlier(df2017_combined, "GDP_per_capita_adjusted_for_PPP")[1]), :]
df2017_combined_2 = df2017_combined.loc[(df2017_combined["Agricultural_land_per_capita"] <= identify_outlier(df2017_combined, "Agricultural_land_per_capita")[0]) &(df2017_combined["Agricultural_land_per_capita"] >= identify_outlier(df2017_combined, "Agricultural_land_per_capita")[1]), :]
df2017_combined_3 = df2017_combined.loc[(df2017_combined["Percentage_of_population_with_basic_water_service"] <= identify_outlier(df2017_combined, "Percentage_of_population_with_basic_water_service")[0]) &(df2017_combined["Percentage_of_population_with_basic_water_service"] >= identify_outlier(df2017_combined, "Percentage_of_population_with_basic_water_service")[1]), :]
df2017_combined_4 = df2017_combined.loc[(df2017_combined["Percentage_of_population_with_eating_disorder"] <= identify_outlier(df2017_combined, "Percentage_of_population_with_eating_disorder")[0]) &(df2017_combined["Percentage_of_population_with_eating_disorder"] >= identify_outlier(df2017_combined, "Percentage_of_population_with_eating_disorder")[1]), :]
df2017_combined_5 = df2017_combined.loc[(df2017_combined["Percentage_of_population_employed_in_agriculture_forestry_fishery"] <= identify_outlier(df2017_combined, "Percentage_of_population_employed_in_agriculture_forestry_fishery")[0]) &(df2017_combined["Percentage_of_population_employed_in_agriculture_forestry_fishery"] >= identify_outlier(df2017_combined, "Percentage_of_population_employed_in_agriculture_forestry_fishery")[1]), :]
df2017_combined_6 = df2017_combined.loc[(df2017_combined["co2_per_agricultural_land_area"] <= identify_outlier(df2017_combined, "co2_per_agricultural_land_area")[0]) &(df2017_combined["co2_per_agricultural_land_area"] >= identify_outlier(df2017_combined, "co2_per_agricultural_land_area")[1]), :]
df2017_combined_1
#df2017_combined_6 = df2017_combined.reset_index(drop=True)
#df2017_combined_6
#identify_outlier(df2017_combined_1, "GDP_per_capita_adjusted_for_PPP")

Unnamed: 0,Country,y_ratio,GDP_per_capita_adjusted_for_PPP,Agricultural_land_per_capita,Percentage_of_population_with_basic_water_service,Percentage_of_population_with_eating_disorder,Percentage_of_population_employed_in_agriculture_forestry_fishery,co2_per_agricultural_land_area
0,Afghanistan,1.374105,2058.4,10444.646021,66.8,0.12,7.549666,0.018095
1,Albania,1.740450,12771.0,4071.470847,94.1,0.14,15.733440,0.451510
2,Algeria,1.899495,11737.4,92.729558,93.8,0.22,2.662706,40.368942
3,Armenia,1.638400,12115.1,5691.749052,99.0,0.13,10.768547,0.330350
4,Australia,1.781266,48398.5,151247.812657,99.0,1.11,1.295090,0.111541
...,...,...,...,...,...,...,...,...
85,Uganda,1.199055,2074.7,10078.318854,51.0,0.10,9.947975,0.012953
86,Ukraine,1.604822,11860.6,3197.040405,93.8,0.13,5.595703,1.568492
87,United Arab Emirates,1.503178,67183.6,26913.297761,99.0,0.31,0.694166,0.661221
88,Uruguay,1.696937,23009.9,180.408509,99.0,0.38,4.181898,9.940323


#### List of countries left after removol of countries with outliers

In [5670]:
countries_1 = df2017_combined_1.Country.unique()
countries_2 = df2017_combined_2.Country.unique()
countries_3 = df2017_combined_3.Country.unique()
countries_4 = df2017_combined_4.Country.unique()
countries_5 = df2017_combined_5.Country.unique()
countries_6 = df2017_combined_6.Country.unique()

countries_norm = []
for i in countries_1:
    if i in countries_2 and i in countries_3 and i in countries_4 and i in countries_5 and i in countries_6:
        countries_norm.append(i)

print(len(countries_norm))

55


In [5671]:
df2017_normalized = df2017_combined[df2017_combined["Country"].isin(countries_norm)]
df2017_normalized = df2017_normalized.reset_index(drop=True)
df2017_normalized

Unnamed: 0,Country,y_ratio,GDP_per_capita_adjusted_for_PPP,Agricultural_land_per_capita,Percentage_of_population_with_basic_water_service,Percentage_of_population_with_eating_disorder,Percentage_of_population_employed_in_agriculture_forestry_fishery,co2_per_agricultural_land_area
0,Albania,1.74045,12771.0,4071.470847,94.1,0.14,15.73344,0.45151
1,Armenia,1.6384,12115.1,5691.749052,99.0,0.13,10.768547,0.33035
2,Austria,1.89563,54173.0,3009.806685,99.0,0.61,1.898298,2.621807
3,Azerbaijan,1.684748,14121.4,4852.56136,93.9,0.15,17.80404,0.768812
4,Bangladesh,1.424242,4894.6,127.31281,97.4,0.12,15.463252,3.981505
5,Barbados,1.525681,15800.4,46361.479794,98.5,0.25,1.408313,0.088169
6,Belarus,1.724101,18356.1,10096.396565,96.5,0.14,5.545609,0.630059
7,Belgium,1.93628,50442.3,4404.228743,99.0,0.48,0.472567,1.977286
8,Belize,1.536544,7140.9,59290.799015,98.0,0.2,6.858626,0.027648
9,Bosnia and Herzegovina,1.698133,13753.8,25343.618773,96.1,0.14,4.592643,0.256875


## After normalization (keith clean)

In [5672]:
df2017_normalized["GDP_per_capita_adjusted_for_PPP_normalized"] = normalization(df2017_normalized["GDP_per_capita_adjusted_for_PPP"])
df2017_normalized["Agricultural_land_per_capita_normalized"] = standardization(df2017_normalized["Agricultural_land_per_capita"])
df2017_normalized["Percentage_of_population_with_basic_water_service_normalized"] = normalization(df2017_normalized["Percentage_of_population_with_basic_water_service"])
df2017_normalized["Percentage_of_population_with_eating_disorder_normalized"] = normalization(df2017_normalized["Percentage_of_population_with_eating_disorder"])
df2017_normalized["Percentage_of_population_employed_in_agriculture_forestry_fishery_normalized"] = normalization(df2017_normalized["Percentage_of_population_employed_in_agriculture_forestry_fishery"])
df2017_normalized["co2_per_agricultural_land_area_normalized"] = standardization(df2017_normalized["co2_per_agricultural_land_area"])
df2017_normalized


Unnamed: 0,Country,y_ratio,GDP_per_capita_adjusted_for_PPP,Agricultural_land_per_capita,Percentage_of_population_with_basic_water_service,Percentage_of_population_with_eating_disorder,Percentage_of_population_employed_in_agriculture_forestry_fishery,co2_per_agricultural_land_area,GDP_per_capita_adjusted_for_PPP_normalized,Agricultural_land_per_capita_normalized,Percentage_of_population_with_basic_water_service_normalized,Percentage_of_population_with_eating_disorder_normalized,Percentage_of_population_employed_in_agriculture_forestry_fishery_normalized,co2_per_agricultural_land_area_normalized
0,Albania,1.74045,12771.0,4071.470847,94.1,0.14,15.73344,0.45151,0.124094,-0.602205,0.524272,0.06,0.853414,-0.47373
1,Armenia,1.6384,12115.1,5691.749052,99.0,0.13,10.768547,0.33035,0.115253,-0.498374,1.0,0.04,0.576116,-0.5632
2,Austria,1.89563,54173.0,3009.806685,99.0,0.61,1.898298,2.621807,0.682193,-0.670239,1.0,1.0,0.080696,1.128915
3,Azerbaijan,1.684748,14121.4,4852.56136,93.9,0.15,17.80404,0.768812,0.142298,-0.552151,0.504854,0.08,0.969061,-0.23942
4,Bangladesh,1.424242,4894.6,127.31281,97.4,0.12,15.463252,3.981505,0.01792,-0.854955,0.84466,0.02,0.838324,2.132978
5,Barbados,1.525681,15800.4,46361.479794,98.5,0.25,1.408313,0.088169,0.16493,2.107831,0.951456,0.28,0.05333,-0.742038
6,Belarus,1.724101,18356.1,10096.396565,96.5,0.14,5.545609,0.630059,0.199381,-0.216114,0.757282,0.06,0.284405,-0.341881
7,Belgium,1.93628,50442.3,4404.228743,99.0,0.48,0.472567,1.977286,0.631903,-0.580881,1.0,0.74,0.001067,0.652972
8,Belize,1.536544,7140.9,59290.799015,98.0,0.2,6.858626,0.027648,0.0482,2.93637,0.902913,0.18,0.35774,-0.786729
9,Bosnia and Herzegovina,1.698133,13753.8,25343.618773,96.1,0.14,4.592643,0.256875,0.137342,0.760961,0.718447,0.06,0.231181,-0.617457


In [5673]:
df2017_normalized = df2017_combined_6.copy()
df2017_normalized_1 = df2017_normalized.loc[(df2017_combined["GDP_per_capita_adjusted_for_PPP"] <= identify_outlier(df2017_combined, "GDP_per_capita_adjusted_for_PPP")[0]) &(df2017_combined["GDP_per_capita_adjusted_for_PPP"] >= identify_outlier(df2017_combined, "GDP_per_capita_adjusted_for_PPP")[1]), :]
df2017_combined_2 = df2017_combined_1.loc[(df2017_combined_1["Agricultural_land_per_capita"] <= identify_outlier(df2017_combined_1, "Agricultural_land_per_capita")[0]) &(df2017_combined_1["Agricultural_land_per_capita"] >= identify_outlier(df2017_combined_1, "Agricultural_land_per_capita")[1]), :]
df2017_combined_3 = df2017_combined_2.loc[(df2017_combined_2["Percentage_of_population_with_basic_water_service"] <= identify_outlier(df2017_combined_2, "Percentage_of_population_with_basic_water_service")[0]) &(df2017_combined_2["Percentage_of_population_with_basic_water_service"] >= identify_outlier(df2017_combined_2, "Percentage_of_population_with_basic_water_service")[1]), :]
df2017_combined_4 = df2017_combined_3.loc[(df2017_combined_3["Percentage_of_population_with_eating_disorder"] <= identify_outlier(df2017_combined_3, "Percentage_of_population_with_eating_disorder")[0]) &(df2017_combined_3["Percentage_of_population_with_eating_disorder"] >= identify_outlier(df2017_combined_3, "Percentage_of_population_with_eating_disorder")[1]), :]
df2017_combined_5 = df2017_combined_4.loc[(df2017_combined_4["Percentage_of_population_employed_in_agriculture_forestry_fishery"] <= identify_outlier(df2017_combined_4, "Percentage_of_population_employed_in_agriculture_forestry_fishery")[0]) &(df2017_combined_4["Percentage_of_population_employed_in_agriculture_forestry_fishery"] >= identify_outlier(df2017_combined_4, "Percentage_of_population_employed_in_agriculture_forestry_fishery")[1]), :]
df2017_combined_6 = df2017_combined_5.loc[(df2017_combined_5["co2_per_agricultural_land_area"] <= identify_outlier(df2017_combined_5, "co2_per_agricultural_land_area")[0]) &(df2017_combined_5["co2_per_agricultural_land_area"] >= identify_outlier(df2017_combined_5, "co2_per_agricultural_land_area")[1]), :]
df2017_combined_6 = df2017_combined_6.reset_index(drop=True)
df2017_combined_6

Unnamed: 0,Country,y_ratio,GDP_per_capita_adjusted_for_PPP,Agricultural_land_per_capita,Percentage_of_population_with_basic_water_service,Percentage_of_population_with_eating_disorder,Percentage_of_population_employed_in_agriculture_forestry_fishery,co2_per_agricultural_land_area
0,Albania,1.74045,12771.0,4071.470847,94.1,0.14,15.73344,0.45151
1,Armenia,1.6384,12115.1,5691.749052,99.0,0.13,10.768547,0.33035
2,Austria,1.89563,54173.0,3009.806685,99.0,0.61,1.898298,2.621807
3,Bangladesh,1.424242,4894.6,127.31281,97.4,0.12,15.463252,3.981505
4,Barbados,1.525681,15800.4,46361.479794,98.5,0.25,1.408313,0.088169
5,Belarus,1.724101,18356.1,10096.396565,96.5,0.14,5.545609,0.630059
6,Belgium,1.93628,50442.3,4404.228743,99.0,0.48,0.472567,1.977286
7,Belize,1.536544,7140.9,59290.799015,98.0,0.2,6.858626,0.027648
8,Bosnia and Herzegovina,1.698133,13753.8,25343.618773,96.1,0.14,4.592643,0.256875
9,Chile,1.609808,24411.5,8530.930185,99.0,0.39,4.145679,0.533388


In [5674]:
df2017_normalized = df2017_combined_6.copy()
df2017_normalized["GDP_per_capita_adjusted_for_PPP_normalized"] = normalization(df2017_normalized["GDP_per_capita_adjusted_for_PPP"])
df2017_normalized["Agricultural_land_per_capita_normalized"] = standardization(df2017_normalized["Agricultural_land_per_capita"])
df2017_normalized["Percentage_of_population_with_basic_water_service_normalized"] = normalization(df2017_normalized["Percentage_of_population_with_basic_water_service"])
df2017_normalized["Percentage_of_population_with_eating_disorder_normalized"] = normalization(df2017_normalized["Percentage_of_population_with_eating_disorder"])
df2017_normalized["Percentage_of_population_employed_in_agriculture_forestry_fishery_normalized"] = normalization(df2017_normalized["Percentage_of_population_employed_in_agriculture_forestry_fishery"])
df2017_normalized["co2_per_agricultural_land_area_normalized"] = standardization(df2017_normalized["co2_per_agricultural_land_area"])
df2017_normalized

Unnamed: 0,Country,y_ratio,GDP_per_capita_adjusted_for_PPP,Agricultural_land_per_capita,Percentage_of_population_with_basic_water_service,Percentage_of_population_with_eating_disorder,Percentage_of_population_employed_in_agriculture_forestry_fishery,co2_per_agricultural_land_area,GDP_per_capita_adjusted_for_PPP_normalized,Agricultural_land_per_capita_normalized,Percentage_of_population_with_basic_water_service_normalized,Percentage_of_population_with_eating_disorder_normalized,Percentage_of_population_employed_in_agriculture_forestry_fishery_normalized,co2_per_agricultural_land_area_normalized
0,Albania,1.74045,12771.0,4071.470847,94.1,0.14,15.73344,0.45151,0.124094,-0.599329,0.524272,0.06,1.0,-0.483912
1,Armenia,1.6384,12115.1,5691.749052,99.0,0.13,10.768547,0.33035,0.115253,-0.496616,1.0,0.04,0.675072,-0.548892
2,Austria,1.89563,54173.0,3009.806685,99.0,0.61,1.898298,2.621807,0.682193,-0.66663,1.0,1.0,0.094557,0.680052
3,Bangladesh,1.424242,4894.6,127.31281,97.4,0.12,15.463252,3.981505,0.01792,-0.849358,0.84466,0.02,0.982318,1.409279
4,Barbados,1.525681,15800.4,46361.479794,98.5,0.25,1.408313,0.088169,0.16493,2.081526,0.951456,0.28,0.06249,-0.678777
5,Belarus,1.724101,18356.1,10096.396565,96.5,0.14,5.545609,0.630059,0.199381,-0.217396,0.757282,0.06,0.333256,-0.388153
6,Belgium,1.93628,50442.3,4404.228743,99.0,0.48,0.472567,1.977286,0.631903,-0.578235,1.0,0.74,0.00125,0.334385
7,Belize,1.536544,7140.9,59290.799015,98.0,0.2,6.858626,0.027648,0.0482,2.901144,0.902913,0.18,0.419187,-0.711236
8,Bosnia and Herzegovina,1.698133,13753.8,25343.618773,96.1,0.14,4.592643,0.256875,0.137342,0.749158,0.718447,0.06,0.270889,-0.588297
9,Chile,1.609808,24411.5,8530.930185,99.0,0.39,4.145679,0.533388,0.281008,-0.316634,1.0,0.56,0.241637,-0.439999


## Training 1st Model
- We will be defining some functions that we have used in class

In [5675]:
# def normalize_z(dfin):
#     dfout = (dfin - dfin.mean(axis=0))/dfin.std(axis=0)
#     return dfout

# def normalize_min_max(dfin):
#     dfout = (dfin - dfin.min(axis=0))/(dfin.max(axis=0) - dfin.min(axis=0))
#     return dfout

def get_features_targets(df, feature_names, target_names):
    df_feature = df.loc[:,feature_names]
    df_target = df.loc[:,target_names]
    return df_feature, df_target

def prepare_feature(df_feature):
    feature = df_feature.to_numpy().reshape(-1, len(df_feature.columns))
    X = np.concatenate((np.ones((feature.shape[0],1)),feature), axis = 1)
    return X

def prepare_target(df_target):
    target = df_target.to_numpy().reshape(-1, len(df_target.columns))
    return target

def predict(df_feature, beta):
    # Commented out normalising data as data cleaning already did it
    # feature_norm = normalize_z(df_feature)
    # X = prepare_feature(feature_norm)
    X = prepare_feature(df_feature)
    return np.matmul(X, beta)

def calc_linear(X, beta):
    return np.matmul(X, beta)

def split_data(df_feature, df_target, random_state=None, test_size=0.5):
    indexes = df_feature.index
    if random_state != None:
        np.random.seed(random_state)
    
    k = int(test_size*len(indexes))
    test_index = np.random.choice(indexes, k, replace = False)
    indexes = set(indexes)
    test_index = set(test_index)
    train_index = indexes - test_index
    
    df_feature_train = df_feature.loc[train_index,:]
    df_feature_test = df_feature.loc[test_index,:]
    
    df_target_train = df_target.loc[train_index,:]
    df_target_test = df_target.loc[test_index,:]
    return df_feature_train, df_feature_test, df_target_train, df_target_test

def r2_score(y, ypred):
    ymean = np.mean(y)
    diff = y-ymean
    sstot = np.matmul(diff.T, diff)
    error = y -ypred
    ssres = np.matmul(error.T,error)
    return 1 - ssres/sstot

def mean_squared_error(target, pred):
    n = target.shape[0]
    error = target - pred
    return (1/n)*np.matmul(error.T, error)[0][0]

def compute_cost(X, y, beta):
    J = 0
    m = X.shape[0]
    error = calc_linear(X,beta)-y
    error_sq = np.matmul(error.T,error)
    J = (1/(2*m)) * error_sq
    return J

def gradient_descent(X, y, beta, alpha, num_iters):
    m = X.shape[0]
    J_storage = np.zeros((num_iters,1))
    for n in range(num_iters):
        deriv = np.matmul(X.T,calc_linear(X,beta)-y)
        beta = beta - (alpha * (1/m)) * deriv
        J_storage[n] = compute_cost(X,y,beta)
    return beta, J_storage

### Considering Possible Removal of Features
- We will be using another metric to verify whether there is a good possible relationship between our features and the target. We decided to use Mean Absolute Error (MAE)
$$\large \textrm{MAE} = \frac{1}{n} \sum |y_i - x_i| %$$

In [5676]:
def mean_absolute_error(df,feature,target):
    x = df[feature].to_numpy()
    y = df[target].to_numpy()
    n = df.shape[0]
    error = np.abs((y-x))
    return (1/n) * (error.sum())

In [5677]:
def mean_absolute_error_percentage(df,feature,target):
    x = df[feature].to_numpy()
    y = df[target].to_numpy()
    n = df.shape[0]
    error = np.abs((y-x)/y)
    return ((1/n) * (error.sum())) / 100
    

#### Applying MAE on normalised 2017 data

In [5678]:
features = ['GDP_per_capita_adjusted_for_PPP_normalized','Agricultural_land_per_capita_normalized','Percentage_of_population_with_basic_water_service_normalized','Percentage_of_population_with_eating_disorder_normalized','Percentage_of_population_employed_in_agriculture_forestry_fishery_normalized','co2_per_agricultural_land_area_normalized']
target = ['y_ratio']

print("MAE Data\n")
for feature in features:
    mae = mean_absolute_error(df2017_normalized,feature,target)
    print(f"\t{feature}: {mae}\n")

MAE Data

	GDP_per_capita_adjusted_for_PPP_normalized: 72.75138635198913

	Agricultural_land_per_capita_normalized: 95.96602977468756

	Percentage_of_population_with_basic_water_service_normalized: 46.50868631932304

	Percentage_of_population_with_eating_disorder_normalized: 72.65198729019681

	Percentage_of_population_employed_in_agriculture_forestry_fishery_normalized: 71.8842036105433

	co2_per_agricultural_land_area_normalized: 100.4320249206575



### Preparing the Data

#### Getting Features and Targets

In [None]:
features_y = features.copy()
features_y.insert(0,"y_ratio")
features_y.insert(0,"Country")
df2017_normalized_only = df2017_normalized.loc[:,features_y]
print("First five data sets")
display(df2017_normalized_only.head())

In [None]:
df2017_feature , df2017_target = get_features_targets(df2017_normalized_only,features,target)
# # df2017_feature is already normalisead
# # df2017_target was not normalised above
display(df2017_feature.describe())
display(df2017_target.describe())

#### Pair Plots

In [None]:
# sns.set()
# myplot = sns.pairplot(data=df2017_normalised_only)

### Splitting the Data

In [None]:
df2017_feature_train , df2017_feature_test ,df2017_target_train , df2017_target_test = split_data(df2017_feature,df2017_target)

### Cost Function

In [None]:
X = prepare_feature(df2017_feature_train)
target = prepare_target(df2017_target_train)

beta = np.zeros((7,1))
J = compute_cost(X,target,beta)
print(J)

beta = np.ones((7,1))
J = compute_cost(X, target, beta)
print(J)

### Gradient Descent
- Using 1500 iterations and alpha of 0.01
- Also choosing the lower cost function

In [None]:
iterations = 1500 # arbitrary value
alpha = 0.01 # arbitrary value
beta = np.ones((7,1))

beta , J_storage = gradient_descent(X,target,beta,alpha,iterations)
print(beta)

In [None]:
plt.plot(J_storage)

### Running Predictions

In [None]:
pred = predict(df_features_test,beta)

#### For 1st to 6th factor to test the model

In [5687]:
# plt.scatter(df2017_normalized_only[""],df_target_test)
# plt.scatter(df2017_normalized_only[""],pred)

In [None]:
# plt.scatter(df2017_normalized_only[""],df_target_test)
# plt.scatter(df2017_normalized_only[""],pred)

In [None]:
# plt.scatter(df2017_normalized_only[""],df_target_test)
# plt.scatter(df2017_normalized_only[""],pred)

In [None]:
# plt.scatter(df2017_normalized_only[""],df_target_test)
# plt.scatter(df2017_normalized_only[""],pred)

In [None]:
# plt.scatter(df2017_normalized_only[""],df_target_test)
# plt.scatter(df2017_normalized_only[""],pred)

In [None]:
# plt.scatter(df2017_normalized_only[""],df_target_test)
# plt.scatter(df2017_normalized_only[""],pred)