# Chapter 1 - ML Landscape

## Linear models

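TODO: describe linear models. The cells below fit a linear regression, and then a k-nearest-neighbours regressor, to predict life satisfaction from GDP per capita.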

### Using Scikit-Learn

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.linear_model

In [None]:
# GDP per capita file: tab-separated, latin1-encoded; "n/a" is read as a
# missing value and "," is treated as the thousands separator.
gdp_per_capita = pd.read_csv(
    "data/gdp_per_capita.csv",
    delimiter='\t',
    encoding='latin1',
    thousands=',',
    na_values="n/a",
)

# OECD file (oecd_bli_2015.csv): default parsing, "," as thousands separator.
oecd_bli = pd.read_csv("data/oecd_bli_2015.csv", thousands=',')

In [None]:
def prepare_country_stats(oecd_bli, gdp_per_capita):
    """Merge OECD indicator data with GDP per capita, one row per country.

    Args:
        oecd_bli: DataFrame with at least the columns "INEQUALITY",
            "Country", "Indicator" and "Value".
        gdp_per_capita: DataFrame with at least the columns "Country"
            and "2015".

    Returns:
        A DataFrame indexed by country, sorted by ascending
        "GDP per capita", containing only the "GDP per capita" and
        "Life satisfaction" columns, with the rows at sorted positions
        0, 1, 6, 8, 33, 34 and 35 left out.

    Neither input frame is modified. The kept positions are hard-coded
    against a 36-row merged table; NOTE(review): with fewer rows the
    positional lookup is expected to raise IndexError -- confirm against
    the pandas `iloc` documentation.
    """
    # Keep only the rows where INEQUALITY == 'TOT'.
    totals = oecd_bli[oecd_bli["INEQUALITY"] == "TOT"]

    # Pivot to one row per country and one column per indicator; the
    # country name becomes the index.
    indicators = totals.pivot(index="Country", columns="Indicator", values="Value")

    # Prepare the GDP data for the merge: name the 2015 column and index
    # by country. `rename` returns a new frame, so the caller's is untouched.
    gdp = gdp_per_capita.rename(columns={"2015": "GDP per capita"}).set_index("Country")

    # Merge the two datasets on their (country) indexes.
    merged = pd.merge(left=indicators, right=gdp, left_index=True, right_index=True)
    merged = merged.sort_values(by="GDP per capita")

    # Build the kept positions in ascending order explicitly. Going through
    # list(set(...)) would leave the row order dependent on set iteration
    # order, which the language does not specify.
    removed_positions = {0, 1, 6, 8, 33, 34, 35}
    kept_positions = [pos for pos in range(36) if pos not in removed_positions]
    return merged[["GDP per capita", "Life satisfaction"]].iloc[kept_positions]

In [None]:
# Build the merged per-country table, then leave it as the cell's last
# expression so the notebook displays it.
country_stats = prepare_country_stats(oecd_bli=oecd_bli, gdp_per_capita=gdp_per_capita)
country_stats

In [None]:
# Turn each column into a 2-D, single-column array: GDP per capita is the
# feature (X) and life satisfaction is the target (y) used by the models below.
X = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]

# Scatter plot of life satisfaction against GDP per capita.
country_stats.plot.scatter(x='GDP per capita', y='Life satisfaction')
plt.show()

In [None]:
# Select a linear model: a scikit-learn LinearRegression estimator created
# with its default settings (no arguments are passed).
lin_reg_model = sklearn.linear_model.LinearRegression()

In [None]:
# Train the model: fit it on X (GDP per capita) against y (life satisfaction).
lin_reg_model.fit(X, y)

In [None]:
# Make a new prediction for Cyprus.
# X_new is a single sample with a single feature, so it is written as a nested
# list; 22587 is presumably Cyprus's GDP per capita -- TODO confirm the source.
X_new = [[22587]]
print(lin_reg_model.predict(X_new))

In [None]:
# Try the same using K-nearest neighbours.
# `import sklearn` and `import sklearn.linear_model` do not guarantee that the
# `sklearn.neighbors` submodule is loaded, so import it explicitly before use.
import sklearn.neighbors

# Regressor that bases each prediction on the 3 nearest training samples,
# fitted on the same X (GDP per capita) and y (life satisfaction).
knn_model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=3)
knn_model.fit(X, y)

In [None]:
# Predict for the same X_new with the K-nearest-neighbours model.
# Expect the prediction to be the average for Slovenia, Portugal and Spain
# (~5.77) -- presumably the three countries closest in GDP per capita.
print(knn_model.predict(X_new))