![title](house_prices.jpg)

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import Imputer
from itertools import product
import seaborn as sns
%matplotlib inline
from sklearn.tree import export_graphviz
import graphviz

# Data loading and cleaning

## Data loading

In [None]:
df = pd.read_csv("house_sales_prices.csv")

In [None]:
df.head()

In [None]:
df.describe()

## Retrait valeurs manquantes

In [None]:
df_with_dropped_na = None  # TODO: drop every column that contains missing values

In [None]:
df_with_dropped_na.head()

## Conserver seulement les colonnes numériques

In [None]:
df_with_dropped_na.dtypes.head(12)

In [None]:
# dtype names that count as numeric for the column filter below
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

df_numeric = None  # TODO: keep only the columns whose dtype is numeric

# Data visualization

In [None]:
sns.distplot(df_numeric.SalePrice)

In [None]:
# Pairwise correlation matrix of every numeric column.
correlations = df_numeric.corr()
# Rank the columns by their correlation with the sale price and keep the top 15 rows
# (positional slice, highest correlation first).
saleprice_correlations = correlations["SalePrice"]
most_correlated_features = saleprice_correlations.sort_values(ascending=False).iloc[:15]
most_correlated_features

In [None]:
correlations_most_correlated_features = df_numeric[most_correlated_features.index].corr()
sns.heatmap(correlations_most_correlated_features, cmap="coolwarm")

In [None]:
sns.boxplot(x=df_numeric.OverallQual, y=df_numeric.SalePrice)

In [None]:
sns.regplot(x=df_numeric.GrLivArea, y=df_numeric.SalePrice, color="green")

In [None]:
sns.pairplot(df_numeric[most_correlated_features.index[:8]])

# Premier modèle

## Cible et variables explicatives

In [None]:
target = "SalePrice"
y = df_numeric[target]

In [None]:
x = df_numeric.drop(target, axis=1)
features = x.columns.tolist()
x.head()

## Séparation des données d'entraînement et de test

![title](training_test.png)

In [None]:
# Share of the rows to hold out for testing, and the seed to use for the split.
test_size_ratio = 0.2
random_state = 123
# Placeholders: the cells below cannot run until these four are real datasets.
x_train, x_test, y_train, y_test = (None, None, None, None)
# TODO: build the training and test sets,
# with 80% of the data for training and 20% for testing

In [None]:
x_train.shape, x_test.shape

## Entraînement du modèle

In [None]:
tree_model = DecisionTreeRegressor(max_depth=8)

In [None]:
#TODO: entraîner le modèle sur les données d'entraînement

## Prédictions sur l'ensemble d'apprentissage

In [None]:
# Score the fitted tree on the very data it was trained on.
predictions_train = tree_model.predict(x_train)
# r2_score expects (y_true, y_pred); R² is not symmetric, so the order matters.
r2_score(y_train, predictions_train)

In [None]:
predictions_vs_realite_train = pd.DataFrame({"predictions sur ensemble d'entrainement": predictions_train,
                                           "valeurs ensemble d'entrainement": y_train})
predictions_vs_realite_train.head(15)

In [None]:
predictions_vs_realite_train.plot.scatter(x="predictions sur ensemble d'entrainement", y="valeurs ensemble d'entrainement")

## Prédictions sur l'ensemble de test

In [None]:
predictions = None  # TODO: predict on the test set with the fitted tree
# r2_score expects (y_true, y_pred); R² is not symmetric, so the order matters.
r2_score(y_test, predictions)

In [None]:
predictions_vs_realite = None #TODO
predictions_vs_realite.plot.scatter(x="predictions sur ensemble de test", y="valeurs ensemble de test")

In [None]:
predictions_vs_realite = pd.DataFrame({"predictions sur ensemble de test": predictions,
                                       "valeurs ensemble de test": y_test})
predictions_vs_realite.plot.scatter(x="predictions sur ensemble de test", y="valeurs ensemble de test")

## Jetons un oeil à l'arbre

In [None]:
dot_data_tree = export_graphviz(tree_model, out_file=None, 
                         feature_names=features,  
                         filled=True, rounded=True,  
                         special_characters=True) 
graphviz.Source(dot_data_tree)

# Recherche des meilleurs paramètres

![title](training_and_test.png)

In [None]:
# Carve a validation set out of the training data for the hyper-parameter search.
# Seeding the split keeps the search result reproducible across kernel restarts.
x_training, x_val, y_training, y_val = train_test_split(x_train, y_train, random_state=random_state)

In [None]:
# Candidate values for each DecisionTreeRegressor hyper-parameter.
params_grid = {"max_depth": [None] + list(range(2, 12)),
               "min_samples_split": np.linspace(0.001, 0.1, 25),
               "min_samples_leaf": np.linspace(0.001, 0.1, 25)}
# Cartesian product of the grid, one keyword dict per combination.
# Each axis is looked up by name so the pairing of value and parameter does not
# depend on the iteration order of params_grid.
params_combinations = [
    {"max_depth": max_depth,
     "min_samples_split": min_samples_split,
     "min_samples_leaf": min_samples_leaf}
    for max_depth, min_samples_split, min_samples_leaf in product(
        params_grid["max_depth"],
        params_grid["min_samples_split"],
        params_grid["min_samples_leaf"])
]

In [None]:
def get_score(params):
    """Placeholder for scoring one hyper-parameter combination.

    Currently returns 0 whatever `params` is; see the TODO below.
    """
    # TODO: return the R2 score on the validation set, after training on the training set
    return 0

In [None]:
scores = list(map(get_score, params_combinations))

In [None]:
# scores[i] is the result of get_score for params_combinations[i].
max_score = max(scores)
print("Score du meilleur modèle: %s" % max_score)
# list.index returns the first position of the maximum, so ties go to the earliest combination.
best_score_index = scores.index(max_score)
best_params = params_combinations[best_score_index]
# Refit with the winning parameters on x_train / y_train, then score on the test set.
best_tree = DecisionTreeRegressor(**best_params).fit(x_train, y_train)
print("Score du meilleur modèle sur l'ensemble de test: %s" % best_tree.score(x_test, y_test))

print("Meilleurs paramètres: %s" % best_params)

In [None]:
predictions_best_tree_vs_realite = pd.DataFrame({"predictions sur ensemble de test": best_tree.predict(x_test),
                                       "valeurs ensemble de test": y_test})
predictions_best_tree_vs_realite.plot.scatter(x="predictions sur ensemble de test", y="valeurs ensemble de test")

# Validation croisée

![title](kfolds.jpg)

In [None]:
def get_cross_val_score(params):
    """Average a per-fold score over a 6-fold split of the training data.

    Parameters
    ----------
    params : dict
        Hyper-parameter combination to evaluate. Not used yet: the TODO in the
        loop is where a model built from it should be trained and scored.

    Returns
    -------
    float
        Mean of the collected fold scores (NaN while the TODO is unimplemented,
        because the score list stays empty).
    """
    scores = []
    # Plain arrays so the integer positions yielded by KFold can index rows directly.
    x_train_matrix = x_train.values
    y_train_matrix = y_train.values
    kfold = KFold(n_splits=6)
    for train_indices, val_indices in kfold.split(x_train_matrix):
        x_train_k = x_train_matrix[train_indices, :]
        y_train_k = y_train_matrix[train_indices]
        x_val_k = x_train_matrix[val_indices, :]
        y_val_k = y_train_matrix[val_indices]

        # TODO: append to `scores` the score of a model trained on the training fold
        # and evaluated on the validation fold
    return np.mean(scores)

In [None]:
get_cross_val_score(best_params)

In [None]:
cv_scores = list(map(get_cross_val_score, params_combinations))

In [None]:
# cv_scores[i] is the result of get_cross_val_score for params_combinations[i].
max_score_cv = max(cv_scores)
print("Score du meilleur modèle: %s" % max_score_cv)
# list.index returns the first position of the maximum, so ties go to the earliest combination.
best_score_index_cv = cv_scores.index(max_score_cv)
best_params_cv = params_combinations[best_score_index_cv]
# Refit with the winning parameters on x_train / y_train, then score on the test set.
best_tree_cv = DecisionTreeRegressor(**best_params_cv).fit(x_train, y_train)
print("Score du meilleur modèle sur l'ensemble de test: %s" % best_tree_cv.score(x_test, y_test))

print("Meilleurs paramètres: %s" % best_params_cv)

In [None]:
# Compare the cross-validated tree's test predictions with the true values.
# This cell must use best_tree_cv and plot the frame built here, not the
# objects left over from the single-validation-split search.
predictions_vs_realite_cv = pd.DataFrame({"predictions sur ensemble de test": best_tree_cv.predict(x_test),
                                          "valeurs ensemble de test": y_test})
predictions_vs_realite_cv.plot.scatter(x="predictions sur ensemble de test", y="valeurs ensemble de test")

In [None]:
dot_data = export_graphviz(best_tree_cv, out_file=None, 
                         feature_names=features,  
                         filled=True, rounded=True,  
                         special_characters=True) 
graphviz.Source(dot_data)

# Forêt

![title](random_forest.png)

In [None]:
n_samples = 1000
sample_size = 1000
pool_size = x_train.shape[0]

def get_bootstrap_sample(pool_size=pool_size, sample_size=sample_size):
    """Draw `sample_size` row positions from 0..pool_size-1, with replacement."""
    candidate_rows = np.arange(pool_size)
    return np.random.choice(candidate_rows, size=sample_size, replace=True)

samples = [get_bootstrap_sample() for _ in range(n_samples)]

In [None]:
def train_individual_tree(sample):
    """Fit one depth-50 regression tree on a bootstrap sample of the training data.

    Parameters
    ----------
    sample : array-like of int
        Row positions (repeats allowed) to select from x_train / y_train.

    Returns
    -------
    DecisionTreeRegressor
        The fitted tree.
    """
    # `.values` replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    x_train_sample = x_train.values[sample, :]
    y_train_sample = y_train.values[sample]
    tree_sample = DecisionTreeRegressor(max_depth=50)
    return tree_sample.fit(x_train_sample, y_train_sample)
    
tree_samples = list(map(train_individual_tree, samples))

In [None]:
predictions_tree_samples = None
# TODO: collect the list of predictions made by each tree on x_test

In [None]:
bootstrap_aggregation_predictions = sum(predictions_tree_samples) / n_samples
r2_score(y_test, bootstrap_aggregation_predictions)

In [None]:
predictions_vs_realite_bootstrap_aggregation = pd.DataFrame({"predictions sur ensemble de test": bootstrap_aggregation_predictions,
                                       "valeurs ensemble de test": y_test})
predictions_vs_realite_bootstrap_aggregation.plot.scatter(x="predictions sur ensemble de test", y="valeurs ensemble de test")

## Feature sampling

In [None]:
frac = 0.8
num_features = x_train.shape[1]


def get_feature_sample(num_features=num_features, frac=frac):
    """Placeholder for drawing a random subset of feature columns.

    Currently returns an empty list whatever the arguments; see the TODO below.
    """
    # TODO: return a sample of a proportion `frac` of the explanatory variables,
    # without replacement
    return []
feature_samples = [get_feature_sample() for _ in range(n_samples)]

In [None]:
def train_individual_tree_bagging(sample, feature_sample):
    """Fit one depth-50 regression tree on a row sample and a column sample.

    Parameters
    ----------
    sample : array-like of int
        Row positions (repeats allowed) to select from x_train / y_train.
    feature_sample : array-like of int
        Column positions of x_train the tree is allowed to see.

    Returns
    -------
    DecisionTreeRegressor
        The fitted tree; it must be given the same columns at prediction time.
    """
    # `.values` replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    x_train_sample = x_train.values[sample, :][:, feature_sample]
    y_train_sample = y_train.values[sample]
    tree_sample = DecisionTreeRegressor(max_depth=50)
    return tree_sample.fit(x_train_sample, y_train_sample)
    
tree_samples_features = list(map(lambda s: train_individual_tree_bagging(s[0], s[1]), 
                        zip(samples, feature_samples)))

In [None]:
# Convert the test frame once instead of once per tree; `.values` replaces
# DataFrame.as_matrix(), which pandas 1.0 removed.
x_test_matrix = x_test.values
# Each tree only sees the columns it was trained on.
predictions_tree_bagging = [tree.predict(x_test_matrix[:, feature_sample])
                            for tree, feature_sample in zip(tree_samples_features, feature_samples)]

In [None]:
bagging_predictions = sum(predictions_tree_bagging) / n_samples
r2_score(y_test, bagging_predictions)

In [None]:
predictions_vs_realite_bagging = pd.DataFrame({"predictions sur ensemble de test": bagging_predictions,
                                       "valeurs ensemble de test": y_test})
predictions_vs_realite_bagging.plot.scatter(x="predictions sur ensemble de test", y="valeurs ensemble de test")

## Comparison with sklearn random forest

In [None]:
rf = RandomForestRegressor(max_depth=50, n_estimators=1000, n_jobs=-1)

In [None]:
rf.fit(x_train, y_train)

In [None]:
rf.score(x_test, y_test)

In [None]:
# Maps the number of estimators to the forest's score on the test set.
scores = {}
for n in range(100, 2000, 100):
    # TODO: compute the random forest's score on the test set with n estimators
    # and store it as scores[n]
    pass  # placeholder: a comment-only loop body does not parse
# sns.tsplot was deprecated in seaborn 0.9 and later removed; pandas plotting
# draws the same line and puts the estimator counts on the x axis.
pd.Series(scores).plot()

# Boosting

![title](boosting_trees.png)

In [None]:
def boost_tree(n=30, max_depth=2):
    """Skeleton of a hand-written boosting loop over regression trees.

    Parameters
    ----------
    n : int
        Number of boosting rounds (loop iterations).
    max_depth : int
        Not used yet; presumably the depth limit of each tree once the TODO
        is implemented.

    Returns
    -------
    list
        The trees collected by the loop (empty while the TODO is unimplemented).
    """
    trees = []
    # The first round targets the raw training labels.
    current_residuals_to_predict = y_train
    for i in range(n):
        # TODO: train a tree to learn the current error residuals,
        # append the tree to the list of trees, and update the residuals
        # by subtracting the tree's predictions
        pass  # placeholder: a comment-only loop body does not parse
    return trees


In [None]:
# Build the ensemble first: `trees` is local to boost_tree and is not defined
# anywhere else in the notebook.
trees = boost_tree()
# The boosted prediction is the sum of every tree's contribution.
boosting_predictions = np.sum([tree.predict(x_test) for tree in trees], axis=0)
r2_score(y_test, boosting_predictions)

In [None]:
predictions_per_tree_number = pd.DataFrame({"cible": y_train,
                                 "predictions un arbre": trees[0].predict(x_train),
                                 "predictions deux arbre": trees[0].predict(x_train) + trees[1].predict(x_train),
                                 "predictions dix arbre": np.sum(list(map(lambda t: t.predict(x_train), trees[:10])), axis=0),
                                 "predictions tous les arbre": np.sum(list(map(lambda t: t.predict(x_train), trees)), axis=0)})
predictions_per_tree_number.head()

## GBM de sklearn

In [None]:
gbm = GradientBoostingRegressor(n_estimators=100, criterion="mse")
gbm.fit(x_train, y_train)

In [None]:
gbm.score(x_test, y_test)

# Feature engineering

## Imputation des valeurs manquantes

TODO: récupérer les données de départ, et remplacer les valeurs manquantes par la moyenne ou la médiane des valeurs de la colonne

## Dummification

TODO: Remplacer les colonnes contenant des variables catégorielles par des colonnes contenant des 0 et des 1, indiquant si l'échantillon appartient ou non à la catégorie