## Prédire le prix de l'immobilier

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# NOTE(review): the previous comment here linked to
# https://www.kaggle.com/uciml/student-alcohol-consumption, which is unrelated to housing.
# The columns used in this notebook (YrSold, YearBuilt, Neighborhood, SalePrice, ...)
# look like the Kaggle "House Prices" (Ames) data -- TODO confirm the actual source.

# Single place to change the dataset location. The path starts with /dbfs, so it is
# presumably a Databricks mount -- adjust when running elsewhere.
HOUSING_CSV_PATH = "/dbfs/FileStore/tables/DataSource/rappels_python/immo_prix.csv"

# The first CSV column is used as the row index.
housing = pd.read_csv(HOUSING_CSV_PATH, index_col=0)

# Age of the house when it was sold: year sold minus year built.
housing['Age'] = housing['YrSold'] - housing['YearBuilt']

### Ne conserver que les quartiers ayant plus de 30 observations

In [5]:
# Keep only the rows whose neighborhood has more than 30 observations.
counts = housing['Neighborhood'].value_counts()
is_frequent = counts > 30
more_than_30 = counts[is_frequent].index.tolist()
in_frequent_neighborhood = housing['Neighborhood'].isin(more_than_30)
housing = housing.loc[in_frequent_neighborhood]

In [6]:
# Predictor columns used by the model; the neighborhood indicator columns
# are appended to this list in a later cell.
features = [
    'CentralAir',
    'LotArea',
    'OverallQual',
    'OverallCond',
    '1stFlrSF',
    '2ndFlrSF',
    'BedroomAbvGr',
    'Age',
]

# Column the model is trained to predict.
target = 'SalePrice'

### Encoder les quartiers et CentralAir en one-hot (variables indicatrices)

In [8]:
# Neighborhood: one 0/1 indicator column per neighborhood. drop_first=True drops the
# first level so it acts as the baseline. dtype=int keeps the indicators as plain
# integers regardless of the pandas version's default (bool or uint8), so the feature
# matrix built later stays purely numeric.
dummies_nb = pd.get_dummies(housing['Neighborhood'], drop_first=True, dtype=int)

# Attach only the indicator columns that are not already in `housing`, so re-running
# this cell does not create duplicated columns.
new_dummy_mask = ~dummies_nb.columns.isin(housing.columns)
housing = pd.concat([housing, dummies_nb.loc[:, new_dummy_mask]], axis=1)

# CentralAir: encode 'N'/'Y' as 0/1. Skipped when the column is already numeric
# (i.e. the cell has been run before); mapping it again would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(housing['CentralAir']):
    central_air = housing['CentralAir'].map({'N': 0, 'Y': 1})
    if central_air.isna().any():
        # Fail with an explicit message instead of the opaque NaN-to-int cast error.
        raise ValueError("CentralAir contains values other than 'N' and 'Y'")
    housing['CentralAir'] = central_air.astype(int)

In [9]:
# Add the neighborhood indicator columns to the model's feature list.
# Names already present are skipped so that re-running this cell does not append
# each indicator a second time (which would duplicate columns in X).
features += [col for col in dummies_nb.columns if col not in features]

In [10]:
# Feature matrix, target vector and number of observations as plain NumPy objects.
X = housing.loc[:, features].to_numpy()
y = housing[target].to_numpy()
n = len(housing)

### Quel est le modèle le plus simple possible ? Prédire simplement la moyenne !

In [12]:
# Null model: predict the average sale price for every house.
y_mean = y.mean()
y_mean

$$ RMSE = \sqrt{ \frac {\sum (obs - pred)^2 }{n} } $$

In [14]:
# RMSE of the null model: root of the mean squared deviation of y from its mean.
deviations = y - y_mean
RMSE_null_model = np.sqrt(np.sum(np.square(deviations)) / n)
RMSE_null_model

## Construire un modèle de régression linéaire

In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
# Create an unfitted scikit-learn LinearRegression estimator with its default settings.
regressor = LinearRegression()

In [18]:
# Fit the regression on the whole feature matrix X and target y
# (the cells shown here make no train/test split).
regressor.fit(X, y)

In [19]:
# Store the model's predictions for the same rows it was fitted on
# in a new 'predictions' column of `housing`.
housing['predictions'] = regressor.predict(X)

In [20]:
# Predictions as a NumPy array, aligned element-wise with y.
y_pred = housing['predictions'].to_numpy()

In [21]:
# RMSE of the linear model, using the same formula as for the null model.
residuals = y - y_pred
RMSE_regressor = np.sqrt(np.sum(np.square(residuals)) / n)
RMSE_regressor

In [22]:
# Observed sale price (x) against the model's prediction (y); the trailing ';' hides the Axes repr.
housing.plot(kind='scatter', x='SalePrice', y='predictions');

## Faites une prédiction pour une nouvelle maison

In [24]:
# Describe the new house by feature name rather than by position: a bare vector of
# 23 numbers only works while `features` keeps exactly the same length and order.
new_house_spec = {
    'CentralAir': 0,
    'LotArea': 12000,
    'OverallQual': 6,
    'OverallCond': 6,
    '1stFlrSF': 1200,
    '2ndFlrSF': 500,
    'BedroomAbvGr': 3,
    'Age': 5,
}
# The original positional vector ended with a 1, i.e. it switched on the last column
# of `features` (the last neighborhood indicator). Keep that choice.
if features[-1] not in new_house_spec:
    new_house_spec[features[-1]] = 1

# Refuse to predict if a named characteristic is not a model feature: silently
# dropping it would give a prediction for a different house than the one described.
unknown_features = [name for name in new_house_spec if name not in features]
if unknown_features:
    raise KeyError("Caractéristiques inconnues du modèle: {}".format(unknown_features))

# One row in the column order the model was fitted with; unspecified features
# (the other neighborhood indicators) default to 0.
new_house = np.array([[new_house_spec.get(name, 0) for name in features]])
prediction = regressor.predict(new_house)

print("Pour une maison avec les caractéristiques suivantes:\n")
for feature, feature_value in zip(features, new_house[0]):
    # Show every explicitly specified characteristic, including those set to 0
    # (e.g. CentralAir: 0), while still hiding the indicators left at their default.
    if feature in new_house_spec:
        print("{}: {}".format(feature, feature_value))
print("\n La valeur prévue pour la maison est: {:,}".format(round(prediction[0])))