## Research Question
### Is there a relationship between the life ladder and GDP per capita from 2008 to 2018? 

In [1]:
import pandas as pd
import numpy as np

### Method chaining
- Load the dataframe, drop all NA, reset the index, and remove the columns not being used
- Restrict the years to 2008-2018, reset the index, create a new column, and rename it

In [2]:
def load_and_process(url_or_path_to_csv_file):
    """Load the report CSV and prepare it for the Life Ladder vs. GDP analysis.

    Steps, in order:
      1. read the CSV found at ``url_or_path_to_csv_file``;
      2. drop every row that contains a missing value in any column;
      3. drop the indicator columns this analysis does not use;
      4. keep only the years 2008 through 2018 (inclusive);
      5. add a ``'GDP per capita'`` column equal to
         ``exp('Log GDP per capita')``.

    Parameters
    ----------
    url_or_path_to_csv_file : str or path-like
        Location of the CSV, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with a fresh 0..n-1 index and the extra
        ``'GDP per capita'`` column appended as the last column.
    """
    unused_columns = [
        'Social support',
        'Healthy life expectancy at birth',
        'Freedom to make life choices',
        'Generosity',
        'Perceptions of corruption',
        'Positive affect',
        'Negative affect',
    ]

    df1 = (
        # Read the file the caller asked for rather than a fixed file name.
        pd.read_csv(url_or_path_to_csv_file)
        # NOTE: rows are dropped *before* the unused columns are removed, so a
        # row with a gap only in an unused column is discarded as well.
        .dropna()
        .reset_index(drop=True)
        .drop(columns=unused_columns)
    )

    df2 = (
        df1
        # The callable is evaluated on the frame being filtered, so the mask
        # never depends on index alignment with an outer variable.
        .loc[lambda frame: frame['year'].between(2008, 2018)]
        .reset_index(drop=True)
        # Dict unpacking lets the column be created with its final name
        # (which contains spaces) without a separate rename step.
        .assign(**{'GDP per capita': lambda frame: np.exp(frame['Log GDP per capita'])})
    )

    return df2


In [4]:
# Build the cleaned 2008-2018 frame that the cells below work from.
sampledata = load_and_process('whreport.csv')

### Create another function, `load_year`
- to pull out the rows for a single year so each year can be shown neatly and clearly for visualization

In [5]:
def load_year(dataframe, year):
    """Return the rows of ``dataframe`` whose ``'year'`` column equals ``year``.

    The result is re-indexed from 0; ``dataframe`` itself is left untouched.
    """
    is_requested_year = dataframe['year'].eq(year)
    return dataframe.loc[is_requested_year].reset_index(drop=True)

In [6]:
# Show only the 2008 rows; the bare expression is rendered as the cell output.
load_year(sampledata, 2008)

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,GDP per capita
0,Afghanistan,2008,3.724,7.370,1587.633783
1,Argentina,2008,5.961,10.048,23109.521552
2,Armenia,2008,4.652,9.256,10467.180768
3,Australia,2008,7.254,10.719,45206.673558
4,Austria,2008,7.181,10.887,53476.629197
...,...,...,...,...,...
96,Uruguay,2008,5.664,9.671,15851.192293
97,Venezuela,2008,6.258,9.701,16333.932967
98,Vietnam,2008,5.480,8.440,4628.554985
99,Zambia,2008,4.730,7.918,2746.273004


In [471]:
# `df1` exists only as a local variable inside load_and_process, so repeating
# the pipeline here fails on a fresh kernel. Build the frame through the
# function instead; it applies the same year filter and GDP column.
df2 = load_and_process('whreport.csv')

df2

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,GDP per capita
0,Afghanistan,2008,3.724,7.370,1587.633783
1,Afghanistan,2009,4.402,7.540,1881.830025
2,Afghanistan,2010,4.758,7.647,2094.353096
3,Afghanistan,2011,3.832,7.620,2038.562130
4,Afghanistan,2012,3.783,7.705,2219.417382
...,...,...,...,...,...
1327,Zimbabwe,2014,4.184,7.991,2954.249733
1328,Zimbabwe,2015,3.703,7.992,2957.205460
1329,Zimbabwe,2016,3.735,7.984,2933.642195
1330,Zimbabwe,2017,3.638,8.016,3029.036921


In [461]:
# Re-read the raw CSV for the step-by-step version of the cleaning below.
df = pd.read_csv('whreport.csv')

In [273]:
# NOTE(review): the saved output under this cell already lists a
# 'GDP per capita' column and none of the columns deleted further down, so it
# appears to come from a later kernel state rather than from the fresh read.
df

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,GDP per capita
0,Afghanistan,2008,3.724,7.370,1587.634
1,Afghanistan,2009,4.402,7.540,1881.830
2,Afghanistan,2010,4.758,7.647,2094.353
3,Afghanistan,2011,3.832,7.620,2038.562
4,Afghanistan,2012,3.783,7.705,2219.417
...,...,...,...,...,...
1944,Zimbabwe,2016,3.735,7.984,2933.642
1945,Zimbabwe,2017,3.638,8.016,3029.037
1946,Zimbabwe,2018,3.616,8.049,3130.663
1947,Zimbabwe,2019,2.694,7.950,2835.575


### Remove all columns not being used

In [23]:
# Remove every column the analysis does not use in a single step. This is the
# same list of columns that load_and_process drops; one drop() call replaces
# seven separate `del` cells.
df = df.drop(columns=[
    'Social support',
    'Healthy life expectancy at birth',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Positive affect',
    'Negative affect',
])

In [33]:
import math

### Create a column based on an existing one for calculation

In [327]:
# Exponentiate the log-GDP column (np.exp, i.e. treating it as a natural log)
# and keep three decimal places.
GDPinValue = np.exp(df['Log GDP per capita']).round(3)
GDPinValue

0       1587.634
1       1881.830
2       2094.353
3       2038.562
4       2219.417
          ...   
1944    2933.642
1945    3029.037
1946    3130.663
1947    2835.575
1948    2512.416
Name: Log GDP per capita, Length: 1708, dtype: float64

In [48]:
# Store the exponentiated values as a new column on the working frame.
df['GDP per capita'] = GDPinValue

In [None]:
# Redundant re-display of GDPinValue (this cell has no execution count).
GDPinValue

### Drop all NA and set the timeframe for data analysis

In [162]:
# Discard every row that still has a missing value; note this rebinds `df`.
df = df.dropna()

In [247]:
# Keep the 2008-2018 window; between() is inclusive at both ends.
in_study_window = df['year'].between(2008, 2018)
cleaned_df = df[in_study_window]

In [416]:
# Renumber the rows 0..n-1 after filtering, then display the frame.
# NOTE(review): the saved output shows 'Category' and 'Categories' columns that
# no visible cell creates, so it was probably captured in another kernel state.
cleaned_df = cleaned_df.reset_index(drop=True)
cleaned_df

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,GDP per capita,Category,Categories
0,Afghanistan,2008,3.724,7.370,1587.634,Low,Low
1,Afghanistan,2009,4.402,7.540,1881.830,Low,Low
2,Afghanistan,2010,4.758,7.647,2094.353,Low,Low
3,Afghanistan,2011,3.832,7.620,2038.562,Low,Low
4,Afghanistan,2012,3.783,7.705,2219.417,Low,Low
...,...,...,...,...,...,...,...
1465,Zimbabwe,2014,4.184,7.991,2954.250,Low,Low
1466,Zimbabwe,2015,3.703,7.992,2957.205,Low,Low
1467,Zimbabwe,2016,3.735,7.984,2933.642,Low,Low
1468,Zimbabwe,2017,3.638,8.016,3029.037,Low,Low


In [253]:
# NOTE(review): no visible cell creates a 'Categories' column; confirm where it
# comes from, otherwise this rename has nothing to act on.
cleaned_df = cleaned_df.rename(columns={'Categories': 'GDP Categories'})

In [185]:
# Sanity check: the distinct years left after the 2008-2018 filter.
set(cleaned_df['year'])

{2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018}

In [186]:
# Largest GDP per capita value in the filtered frame.
max(cleaned_df['GDP per capita'])

114119.337

In [187]:
# Smallest GDP per capita value in the filtered frame.
min(cleaned_df['GDP per capita'])

761.279

In [362]:
# Summary statistics; the 25% and 75% values printed here are the cut-offs the
# GDP group cells below use.
cleaned_df['GDP per capita'].describe()

count      1470.000000
mean      20198.257286
std       20039.313562
min         761.279000
25%        4671.568250
50%       12682.780500
75%       30622.899750
max      114119.337000
Name: GDP per capita, dtype: float64

In [386]:
# Derive the group boundaries from the data instead of pasting numbers from the
# describe() output, so the groups stay correct if cleaned_df ever changes.
gdp_q25, gdp_q75 = cleaned_df['GDP per capita'].quantile([0.25, 0.75])

# Half-open intervals: a value sitting exactly on a quartile belongs to one
# group only (the lower one), never to two.

# Low: up to and including the first quartile.
lowgdp = cleaned_df[cleaned_df['GDP per capita'] <= gdp_q25]

# Medium: above the first quartile, up to and including the third.
mediumgdp = cleaned_df[
    (cleaned_df['GDP per capita'] > gdp_q25)
    & (cleaned_df['GDP per capita'] <= gdp_q75)
]

# High: strictly above the third quartile.
highgdp = cleaned_df[cleaned_df['GDP per capita'] > gdp_q75]

In [427]:
# Define the series first: the conditions below read it, so it must exist
# before they are built when the notebook runs top to bottom.
gdplist = cleaned_df['GDP per capita']

# Quartile cut-offs computed from the data rather than pasted describe() values.
q25, q75 = gdplist.quantile([0.25, 0.75])

# np.select uses the first condition that is True for a row, so these
# non-overlapping ranges give the same labels as inclusive, overlapping ones.
conditions = [
    gdplist <= q25,                      # low
    (gdplist > q25) & (gdplist <= q75),  # medium
    gdplist > q75,                       # high
]
values = ['L', 'M', 'H']

# Pass a string default so the choices and the fallback share one dtype; the
# implicit integer default 0 cannot be combined with string choices on newer
# NumPy releases. '0' is what older releases produced for unmatched rows.
cleaned_df['sample1'] = np.select(conditions, values, default='0')

In [441]:
# NOTE(review): the saved output below does not include the 'sample1' column
# assigned just above, so it was likely captured in a different kernel state.
cleaned_df

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,GDP per capita
0,Afghanistan,2008,3.724,7.370,1587.634
1,Afghanistan,2009,4.402,7.540,1881.830
2,Afghanistan,2010,4.758,7.647,2094.353
3,Afghanistan,2011,3.832,7.620,2038.562
4,Afghanistan,2012,3.783,7.705,2219.417
...,...,...,...,...,...
1465,Zimbabwe,2014,4.184,7.991,2954.250
1466,Zimbabwe,2015,3.703,7.992,2957.205
1467,Zimbabwe,2016,3.735,7.984,2933.642
1468,Zimbabwe,2017,3.638,8.016,3029.037


In [440]:
# 'Categories' may already be gone by the time this cell runs (an earlier cell
# renames it to 'GDP Categories'), and `del` raises KeyError in that case.
# errors='ignore' makes the removal safe in notebook order and on re-runs.
cleaned_df = cleaned_df.drop(columns=['Categories'], errors='ignore')

In [None]:
# NOTE(review): assign() is called with no columns and its result is not
# stored, so this never-executed cell does not change cleaned_df.
cleaned_df.assign()

In [None]:
# NOTE(review): `sns` and `tips_df` are not defined in any visible cell, and the
# columns 'total_bill', 'tip' and 'sex' do not appear in this notebook's data.
# This looks like a pasted example — confirm what should be plotted here
# (presumably 'GDP per capita' against 'Life Ladder' from cleaned_df).
sns.lmplot(x='total_bill', y='tip', hue='sex', data=tips_df, markers=['o', '^'], 
          scatter_kws={'s': 100, 'linewidth': 0.5, 'edgecolor': 'w'})