In [294]:
import pandas as pd 
import missingno as msno 
import numpy as np 
import seaborn as sms
from datetime import datetime as dt 
import matplotlib.pyplot as plt
import plotly.express as px 
from dash import Dash, dcc, html, Input, Output
import sklearn 
from sklearn.linear_model import LinearRegression, Ridge, Lasso   
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
import pickle 



pd.options.display.max_columns = 150 
pd.options.display.max_rows = 150 

## Recuperation du CSV
### On crée 2 dataset, un avec notre Target (y), ici 'price' , la deuxième avec toutes les features qu'on juge utiles

In [236]:
df_modelisation = pd.read_csv('df2.csv')
df_modelisation_prix = df_modelisation[['price']]
df_modelisation_prix.head()

Unnamed: 0,price
0,221900.0
1,538000.0
2,180000.0
3,604000.0
4,510000.0


In [237]:

df_modelisation = df_modelisation.drop('id', axis=1)
df_modelisation = df_modelisation.drop('date_sale', axis=1)

In [238]:
df_modelisation = df_modelisation.drop('price_log', axis=1)

In [239]:
df_modelisation = df_modelisation.drop('sqft_living15', axis=1)
df_modelisation = df_modelisation.drop('sqft_lot15', axis=1)

In [240]:
df_modelisation.head(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,year,month
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,2014,10
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,2014,12


In [241]:
df_modelisation["view"].unique()

array([0, 3, 4, 2, 1])

In [242]:
data_types_dict = {'month': str} 
df_modelisation = df_modelisation.astype(data_types_dict)  
df_modelisation.dtypes

price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
year               int64
month             object
dtype: object

In [243]:
data_types_dict = {'zipcode': str} 
df_modelisation = df_modelisation.astype(data_types_dict)  
df_modelisation.dtypes

price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode           object
lat              float64
long             float64
year               int64
month             object
dtype: object

### Modèle de regression linéaire

In [244]:
X = df_modelisation.drop('price', axis=1)
y = df_modelisation[['price']]
X_train, X_test, y_train, y_test = train_test_split(X,y)
num_col = list(X.select_dtypes(include=[float,int]).columns)
cat_col = list(X.select_dtypes(include=[object]).columns)
my_num_pipe = make_pipeline(StandardScaler(), PolynomialFeatures())
preprocessing = ColumnTransformer([
    ("one_hot", OneHotEncoder(),cat_col),
    ("scaling", my_num_pipe, num_col)
])


In [245]:
list(num_col)

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'lat',
 'long',
 'year']

In [246]:
list(cat_col)

['zipcode', 'month']

LINEAR REGRESSION

In [247]:
preprocessing = ColumnTransformer(transformers=[('one_hot', OneHotEncoder(), cat_col),
                                ('scaling', StandardScaler(), num_col)])                              

In [248]:
my_pipe_lr = make_pipeline(preprocessing, LinearRegression())

In [249]:
my_pipe_lr = Pipeline(
    [
        ("preprocessing", preprocessing), 
        ('ridge', LinearRegression())
    ]
)

In [250]:
my_pipe_lr.fit(X_train, y_train)

In [251]:
my_pipe_lr.score(X_test, y_test)

0.7926535592833545

In [252]:
my_pipe_lr
with open("my_pipe_lr.pkl", "wb") as f:
    pickle.dump(my_pipe_lr, f)

def load_from_pickle(name):
    with open(name, "rb") as f:
        return pickle.load(f)


In [297]:
X_train['sqft_lot'].info()


<class 'pandas.core.series.Series'>
Int64Index: 16209 entries, 20948 to 8276
Series name: sqft_lot
Non-Null Count  Dtype
--------------  -----
16209 non-null  int64
dtypes: int64(1)
memory usage: 253.3 KB


In [290]:
X_train.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16209 entries, 20948 to 8276
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   bedrooms       16209 non-null  int64  
 1   bathrooms      16209 non-null  float64
 2   sqft_living    16209 non-null  int64  
 3   sqft_lot       16209 non-null  int64  
 4   floors         16209 non-null  float64
 5   waterfront     16209 non-null  int64  
 6   view           16209 non-null  int64  
 7   condition      16209 non-null  int64  
 8   grade          16209 non-null  int64  
 9   sqft_above     16209 non-null  int64  
 10  sqft_basement  16209 non-null  int64  
 11  yr_built       16209 non-null  int64  
 12  yr_renovated   16209 non-null  int64  
 13  zipcode        16209 non-null  object 
 14  lat            16209 non-null  float64
 15  long           16209 non-null  float64
 16  year           16209 non-null  int64  
 17  month          16209 non-null  object 
dtypes: 

In [253]:
load_from_pickle("my_pipe_lr.pkl")

In [254]:
type (my_pipe_lr)

sklearn.pipeline.Pipeline

Ridge

In [255]:
my_pipe_ridge = Pipeline(
    [
        ("preprocessing", preprocessing), 
        ('ridge', Ridge())
    ]
)

In [256]:
my_pipe_ridge.get_params()

{'memory': None,
 'steps': [('preprocessing',
   ColumnTransformer(transformers=[('one_hot', OneHotEncoder(),
                                    ['zipcode', 'month']),
                                   ('scaling', StandardScaler(),
                                    ['bedrooms', 'bathrooms', 'sqft_living',
                                     'sqft_lot', 'floors', 'waterfront', 'view',
                                     'condition', 'grade', 'sqft_above',
                                     'sqft_basement', 'yr_built', 'yr_renovated',
                                     'lat', 'long', 'year'])])),
  ('ridge', Ridge())],
 'verbose': False,
 'preprocessing': ColumnTransformer(transformers=[('one_hot', OneHotEncoder(),
                                  ['zipcode', 'month']),
                                 ('scaling', StandardScaler(),
                                  ['bedrooms', 'bathrooms', 'sqft_living',
                                   'sqft_lot', 'floors', 'waterfront', '

In [257]:
hyperparametres = {'ridge__alpha': [0.01,0.1, 0.2,0.5, 1,10]}

In [258]:
random_search = GridSearchCV(my_pipe_ridge, hyperparametres, cv= 5)
random_search

In [259]:
random_search.fit(X_train,y_train)

In [260]:
random_search.best_params_

{'ridge__alpha': 1}

In [261]:
my_pipe_ridge.set_params(**random_search.best_params_)
my_pipe_ridge.fit(X_train,y_train)

In [262]:
my_pipe_ridge.score(X_test, y_test)

0.7927427002853391

LASSO

In [263]:
my_pipe_lasso = make_pipeline(preprocessing, Lasso())

In [264]:
my_pipe_lasso = Pipeline(
    [
        ("preprocessing", preprocessing), 
        ('lasso', Lasso())
    ]
)

In [265]:
my_pipe_lasso.fit(X_train, y_train)

  model = cd_fast.sparse_enet_coordinate_descent(


In [266]:
my_pipe_lasso.score(X_test, y_test)

0.7926703403264976

Elastic Net

In [267]:
my_pipe_elastic_net = Pipeline(
    [
        ("preprocessing", preprocessing), 
        ('elastic_net', ElasticNet())
    ]
)

In [268]:
my_pipe_elastic_net.fit(X_train, y_train)

In [269]:
my_pipe_elastic_net.score(X_test, y_test)

0.6784860821816903

In [289]:
X_train

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,year,month
20948,3,2.75,2450,5750,2.0,0,0,3,9,2450,0,2013,0,98065,47.5439,-121.862,2015,3
13670,3,2.25,1820,99752,1.0,0,0,4,7,1820,0,1969,0,98092,47.2838,-122.006,2015,2
14117,4,2.50,2430,6796,2.0,0,0,3,8,2430,0,1993,0,98058,47.4499,-122.127,2015,3
18996,4,1.75,2290,36900,1.5,0,2,5,7,1690,600,1938,0,98022,47.2034,-122.003,2015,5
8638,4,3.00,6430,27517,2.0,0,0,3,12,6430,0,2001,0,98004,47.6208,-122.219,2014,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13929,2,1.75,1370,5125,1.0,0,0,5,6,1370,0,1944,0,98103,47.6926,-122.346,2014,6
13480,2,1.00,830,4000,1.0,0,0,4,6,830,0,1947,0,98117,47.6909,-122.381,2015,4
5498,4,1.00,1680,5043,1.5,0,0,4,6,1680,0,1911,0,98118,47.5354,-122.273,2015,3
5269,3,2.50,2074,4900,2.0,0,0,3,8,2074,0,1997,0,98034,47.7327,-122.233,2014,10


In [288]:
X_train.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'year', 'month'],
      dtype='object')

SUPPRIMER LES VALEURS INFLUENTES

In [324]:

y_pred = my_pipe_ridge.predict(X_train)



y_pred.reshape(1,16209)
y_pred

array([[593939.37074758],
       [252204.52059351],
       [445611.15031413],
       ...,
       [374882.43477098],
       [545683.78628817],
       [420733.25832307]])

In [317]:
""" y_pred2 = pd.DataFrame(y_pred, columns=["price"] )
y_pred2
type(y_pred2) """

' y_pred2 = pd.DataFrame(y_pred, columns=["price"] )\ny_pred2\ntype(y_pred2) '

In [319]:
y_train

Unnamed: 0,price
20948,605000.0
13670,369000.0
14117,412950.0
18996,449999.0
8638,4489000.0
...,...
13929,409950.0
13480,455000.0
5498,366350.0
5269,515100.0


In [325]:
residual = (y_pred-y_train).abs().sort_values(by='price', axis= 0 , ascending= False)
residual


Unnamed: 0,price
3914,3.398251e+06
9254,3.067982e+06
1448,2.559747e+06
1315,2.339391e+06
4411,2.292250e+06
...,...
2355,5.147223e+01
17817,4.961667e+01
7285,3.460698e+01
20336,3.460176e+01


In [None]:
# to_drop = residual[residual > 500000].index

In [None]:
""" 
from sklearn.pipeline import make_pipeline


model = make_pipeline(PolynomialFeatures(2), LinearRegression())

X = df_modelisation2.values
y = df_modelisation.price.values
X_train, X_test,y_train , y_test = train_test_split(X, y, test_size=0.2)
model.fit(X_train, y_train)

print("----train-----")
print (model.score(X_train,y_train))
print ("----test----")
print(model.score(X_test,y_test))
print (model.predict(X_test))  """


' \nfrom sklearn.pipeline import make_pipeline\n\n\nmodel = make_pipeline(PolynomialFeatures(2), LinearRegression())\n\nX = df_modelisation2.values\ny = df_modelisation.price.values\nX_train, X_test,y_train , y_test = train_test_split(X, y, test_size=0.2)\nmodel.fit(X_train, y_train)\n\nprint("----train-----")\nprint (model.score(X_train,y_train))\nprint ("----test----")\nprint(model.score(X_test,y_test))\nprint (model.predict(X_test))  '

In [None]:
df_modelisation.to_csv("df_modelisation.csv")  

In [None]:
df_modelisation_price = df_modelisation["price"]
df_modelisation_price.to_csv("df_modelisation_price.csv")

## Modélisation de la méthode de régression linéaire

In [None]:
""" plt.scatter(X, y)
plt.plot(X, model.predict(X), c='red') """

" plt.scatter(X, y)\nplt.plot(X, model.predict(X), c='red') "

In [None]:
""" plt.plot(df_modelisation2 , df_modelisation_prix,'ro', markersize = 4) #UTILISATION DE LA FONCTION PLOT 
plt.show() """


" plt.plot(df_modelisation2 , df_modelisation_prix,'ro', markersize = 4) #UTILISATION DE LA FONCTION PLOT \nplt.show() "

In [None]:
""" predictions = model.predict(X_test)
plt.scatter(X_test,y)
plt.plot(X, predictions, c='r') """

" predictions = model.predict(X_test)\nplt.scatter(X_test,y)\nplt.plot(X, predictions, c='r') "

## Fonction qui permet de prédire le prix d'une maison

In [None]:
""" def prediction_maison(model,sqft_living	,grade	,sqft_above	,sqft_living15	,bathrooms,	view	,sqft_basement	,
                bedrooms	,zipcode_98004	,waterfront	,floors	,zipcode_98039,	zipcode_98040	,zipcode_98112,	zipcode_98006,	yr_renovated	,
                zipcode_98033,	zipcode_98105	,sqft_lot,	zipcode_98075,	zipcode_98199	,
                sqft_lot15 ,	zipcode_98001,	zipcode_98042,	zipcode_98023):
    x = np.array([sqft_living	,grade	,sqft_above	,sqft_living15	,bathrooms,	view	,sqft_basement	,
                bedrooms	,zipcode_98004	,waterfront	,floors	,zipcode_98039,	zipcode_98040	,zipcode_98112,	zipcode_98006,	yr_renovated	,
                zipcode_98033,	zipcode_98105	,sqft_lot,	zipcode_98075,	zipcode_98199	,
                sqft_lot15 ,	zipcode_98001,	zipcode_98042,	zipcode_98023]).reshape(1,25)
    print (model.predict(x)) """


' def prediction_maison(model,sqft_living\t,grade\t,sqft_above\t,sqft_living15\t,bathrooms,\tview\t,sqft_basement\t,\n                bedrooms\t,zipcode_98004\t,waterfront\t,floors\t,zipcode_98039,\tzipcode_98040\t,zipcode_98112,\tzipcode_98006,\tyr_renovated\t,\n                zipcode_98033,\tzipcode_98105\t,sqft_lot,\tzipcode_98075,\tzipcode_98199\t,\n                sqft_lot15 ,\tzipcode_98001,\tzipcode_98042,\tzipcode_98023):\n    x = np.array([sqft_living\t,grade\t,sqft_above\t,sqft_living15\t,bathrooms,\tview\t,sqft_basement\t,\n                bedrooms\t,zipcode_98004\t,waterfront\t,floors\t,zipcode_98039,\tzipcode_98040\t,zipcode_98112,\tzipcode_98006,\tyr_renovated\t,\n                zipcode_98033,\tzipcode_98105\t,sqft_lot,\tzipcode_98075,\tzipcode_98199\t,\n                sqft_lot15 ,\tzipcode_98001,\tzipcode_98042,\tzipcode_98023]).reshape(1,25)\n    print (model.predict(x)) '

In [None]:
""" #essai de la fonction
prediction_maison(model,50,7,30,48,2,2,0,2,1,0,1,0,0,0,0,1988,0,0,35,0,0,40,0,0,0) """

' #essai de la fonction\nprediction_maison(model,50,7,30,48,2,2,0,2,1,0,1,0,0,0,0,1988,0,0,35,0,0,40,0,0,0) '