In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

Let's read in some world data

In [None]:
# Load the raw indicator table. The first header is renamed in case it carries
# a leading byte-order mark, and rows with fewer than two non-null values are
# discarded.
country_data = (
    pd.read_csv("world_data/world_data.csv")
    .rename(columns={'\ufeffCountry Name': "Country Name"})
    .dropna(thresh=2)
)


In [None]:
# Summarize the loaded frame (columns, dtypes, non-null counts).
country_data.info()

In [None]:
# Preview the first rows of the loaded table.
country_data.head()

Pivot the table so that each country becomes a row (the index), each `Series Name` becomes a column, and the cells hold the values from the year 2000.

In [None]:
# Reshape to one row per country and one column per series, filled with the
# year-2000 values.
indicators = pd.pivot_table(
    country_data,
    values="2000 [YR2000]",
    index="Country Name",
    columns="Series Name",
)

For now we want to narrow our scope down to just a single X and a single Y.

In [None]:
# Preview the two columns of interest, keeping only rows where at least two
# (i.e. both) of them are present.
indicators.loc[
    :,
    [
        'School enrollment, secondary (% gross)',
        'Life expectancy at birth, total (years)',
    ],
].dropna(thresh=2).head()

In [None]:
# Scatter the raw relationship between secondary school enrollment and life
# expectancy. Axes are labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(indicators['School enrollment, secondary (% gross)'],
           indicators['Life expectancy at birth, total (years)'])
ax.set_xlabel('School enrollment, secondary (% gross)')
ax.set_ylabel('Life expectancy at birth, total (years)')
ax.set_title('Life expectancy vs. secondary school enrollment (2000)')
plt.show()

Now how about a little scikit-learn!?

In [None]:
from sklearn import linear_model

Let's get our X and Y

 - X -> School Enrollment in secondary level education
 - Y -> Life Expectancy at birth

In [None]:
# Build the single-feature training set: keep only countries that report both
# the enrollment figure (X) and the life expectancy (Y).
feature_col = 'School enrollment, secondary (% gross)'
target_col = 'Life expectancy at birth, total (years)'

df = indicators[[feature_col, target_col]].dropna()
input_data = df[[feature_col]]      # double brackets: keep X as a one-column frame
life_expectancy = df[target_col]

Create a new `LinearRegression` model and fit it to our data (from above).

We can print relevant data to this model once our model has been fit.

In [None]:
# Fit the one-feature linear model, then report its coefficient and its score
# evaluated on the same rows it was fitted on.
regr1 = linear_model.LinearRegression().fit(input_data, life_expectancy)
print('Coefficients: \n', regr1.coef_)
print(regr1.score(input_data, life_expectancy))

PLOTS!

In [None]:
# Observations (black dots) with the fitted regression line (blue) on top.
# Axes are labelled and the cell ends with plt.show() so that only the figure
# is emitted, not the repr of the line artists.
fig, ax = plt.subplots()
ax.scatter(input_data, life_expectancy, color='black')
ax.plot(input_data, regr1.predict(input_data), color='blue', linewidth=3)
ax.set_xlabel('School enrollment, secondary (% gross)')
ax.set_ylabel('Life expectancy at birth, total (years)')
ax.set_title('Linear fit: life expectancy on secondary school enrollment')
plt.show()

This is how we can predict specific values on our model:

In [None]:
# predict() expects a 2-D input of shape (n_samples, n_features), so the
# single enrollment value 80 is wrapped in a one-row frame. The column is named
# after the training feature so the input lines up with what regr1 was fit on.
regr1.predict(
    pd.DataFrame({'School enrollment, secondary (% gross)': [80]})
)

A single X and Y are fun and simple - BUT BORING!  We can create a new DF and create a new X and Y where X contains multiple features.

In the case below we will fit Sanitation Facilities & Access to Water sources with the Life Expectancy. Cool stuff eh?

In [None]:
# Two-feature training set: sanitation and water access as X, life expectancy
# as Y. Only countries reporting all three values are kept.
feature_cols = [
    'Improved sanitation facilities (% of population with access)',
    'Improved water source (% of population with access)',
]
target_col = 'Life expectancy at birth, total (years)'

df = indicators[feature_cols + [target_col]].dropna()
input_data = df[feature_cols]
life_expectancy = df[target_col]

Taking our `input_data` and `life_expectancy` from above we can fit a new `LinearRegression` model to our new data and get relevant information about it.

In [None]:
# Fit the two-feature model and report its coefficients and its score on the
# training rows. (A leftover column-selection expression whose result was
# discarded, and an unused alias of the fitted model, have been removed.)
regr2 = linear_model.LinearRegression()
regr2.fit(input_data, life_expectancy)
print('Coefficients: \n', regr2.coef_)
print(regr2.score(input_data, life_expectancy))

Since we are fitting on multiple dimensions we can even do a 3D plot. 3D plots don't convey an incredible amount of information to the layman but they sure look cool.

In [None]:
# Kept for compatibility: older matplotlib releases need this import to
# register the '3d' projection.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

xx = input_data['Improved sanitation facilities (% of population with access)']
yy = input_data['Improved water source (% of population with access)']
zz = life_expectancy

# plot_surface needs X, Y and Z as 2-D arrays of identical shape. Build a
# regular grid spanning the observed range of both features and evaluate the
# fitted plane on every grid node, instead of pairing a grid made from the
# unsorted sample points with a 1-D vector of per-sample predictions.
grid_steps = 20
x_surf, y_surf = np.meshgrid(
    np.linspace(xx.min(), xx.max(), grid_steps),
    np.linspace(yy.min(), yy.max(), grid_steps),
)
# Column order follows input_data, i.e. the order regr2 was fitted with.
grid_features = pd.DataFrame(
    {xx.name: x_surf.ravel(), yy.name: y_surf.ravel()},
    columns=input_data.columns,
)
z_surf = regr2.predict(grid_features).reshape(x_surf.shape)

ax.plot_surface(x_surf, y_surf, z_surf, color="red", alpha=0.1)
ax.scatter(xx, yy, zz)
ax.set_xlabel('Improved sanitation (% with access)')
ax.set_ylabel('Improved water source (% with access)')
ax.set_zlabel('Life expectancy (years)')
plt.show()

With multiple X's how do we predict on those?  Easy! Pass a feature vector representing %'s of their respective category and our model will predict the life expectancy based on those values.

In [None]:
# predict() expects a 2-D input of shape (n_samples, n_features); a flat list
# such as [80, 80] is 1-D. Each (sanitation %, water %) pair is therefore
# wrapped in a one-row frame whose columns are named and ordered like the
# training features, and predicted/printed on its own as before.
feature_names = [
    'Improved sanitation facilities (% of population with access)',
    'Improved water source (% of population with access)',
]
for sanitation_pct, water_pct in [(80, 80), (85, 80), (80, 85), (82.5, 82.5)]:
    sample = pd.DataFrame([[sanitation_pct, water_pct]], columns=feature_names)
    print(regr2.predict(sample))

Notice we just multiply the vectors together to get the `xx` variable.  Our `yy` remains as the life expectancy.

In [None]:
# Collapse the two features onto one axis by multiplying them element-wise,
# then draw the observations together with a straight segment running from
# (smallest product, smallest prediction) to (largest product, largest
# prediction).
x1 = input_data['Improved sanitation facilities (% of population with access)']
x2 = input_data['Improved water source (% of population with access)']
xx = x1.mul(x2)
yy = life_expectancy
predict = regr2.predict(input_data)

plt.scatter(xx, yy, color='black')
plt.plot(
    (xx.min(), xx.max()),
    (predict.min(), predict.max()),
    color='blue',
    linewidth=3,
)

Adding another dimension is just as easy!  Let's add our school enrollment back in and see how that affects things.

In [None]:
# Three-feature training set: school enrollment plus sanitation and water
# access as X, life expectancy as Y. Rows missing any of the four are dropped.
feature_cols = [
    'School enrollment, secondary (% gross)',
    'Improved sanitation facilities (% of population with access)',
    'Improved water source (% of population with access)',
]
target_col = 'Life expectancy at birth, total (years)'

df = indicators[feature_cols + [target_col]].dropna()
input_data = df[feature_cols]
life_expectancy = df[target_col]

In [None]:
# Fit the three-feature model and print its score on the training rows.
regr3 = linear_model.LinearRegression()
fit = regr3.fit(X=input_data, y=life_expectancy)
print(regr3.score(X=input_data, y=life_expectancy))

What can we predict with this new model?

In [None]:
# Four-feature model: measles immunization is added to the previous three
# features. Rows missing any feature or the target are dropped before fitting.
feature_cols = [
    'School enrollment, secondary (% gross)',
    'Immunization, measles (% of children ages 12-23 months)',
    'Improved sanitation facilities (% of population with access)',
    'Improved water source (% of population with access)',
]
target_col = 'Life expectancy at birth, total (years)'

df = indicators[feature_cols + [target_col]].dropna()
input_data = df[feature_cols]
life_expectancy = df[target_col]

# Fit and print the score on the training rows.
regr4 = linear_model.LinearRegression()
fit = regr4.fit(X=input_data, y=life_expectancy)
print(regr4.score(X=input_data, y=life_expectancy))

## Automating picking the best features

In [None]:
import itertools

# Candidate predictors: every indicator column except the one being predicted.
dependent_vars = indicators.columns.tolist()
dependent_vars.remove('Life expectancy at birth, total (years)')

# Every unordered pair of candidate predictors.
combos = [pair for pair in itertools.combinations(dependent_vars, 2)]

In [None]:
choices = []

_TARGET_COLUMN = 'Life expectancy at birth, total (years)'


def _complete_cases(combo):
    """Return the rows of ``indicators`` with no missing value in ``combo`` or the target."""
    return indicators[list(combo) + [_TARGET_COLUMN]].dropna()


def _fit_scored(rows, combo):
    """Fit a LinearRegression of the target on ``combo`` using ``rows``.

    Returns a ``(model, score)`` tuple, the score being evaluated on ``rows``.
    """
    features = rows[list(combo)]
    target = rows[_TARGET_COLUMN]
    model = linear_model.LinearRegression().fit(features, target)
    return model, model.score(features, target)


def regression_for(combo):
    """Fit life expectancy on the indicator columns named in ``combo``.

    Parameters
    ----------
    combo : iterable of str
        Column names of ``indicators`` to use as features.

    Returns
    -------
    (regr, score)
        The fitted ``LinearRegression`` and its score on the rows that have a
        value for every feature and for the target.
    """
    return _fit_scored(_complete_cases(combo), combo)


for combo in combos:
    rows = _complete_cases(combo)
    # A linear model with an intercept has len(combo) + 1 free parameters.
    # With that many rows or fewer it can generally reproduce the data exactly
    # (and with zero rows fitting fails outright), so such a combination would
    # either abort the search or win it with a meaningless perfect score.
    if len(rows) <= len(combo) + 1:
        continue
    _, score = _fit_scored(rows, combo)
    # NaN never compares greater or smaller, which would make the ranking
    # below unreliable.
    if np.isnan(score):
        continue
    choices.append((combo, score))

if not choices:
    raise ValueError("no feature combination had enough complete rows to fit a model")

# Highest-scoring combination; max() avoids sorting the whole list.
best = max(choices, key=lambda choice: choice[1])
print(best)
regr, score = regression_for(best[0])
print(regr.coef_, regr.intercept_)