# CONFIGURING

In [8]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import json
import time
import seaborn as sb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from numpy import NaN
from sklearn.utils import shuffle
from sklearn.model_selection  import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics

# LOAD THE TRAINING DATA
# Path of the CSV that holds the raw training set; every later cell works on `df`.
train_csv_path = "df_train.csv"
df = pd.read_csv(train_csv_path)

# CLEANING

In [9]:
# DROP NAN AND USELESS VALUES
# Every -1 in the frame is turned into a missing value.
# `np.nan` is the canonical spelling; the bare `NaN` alias was removed in NumPy 2.0.
df = df.replace(-1, np.nan)

# A row is unusable without a price, a bedroom count and a surface.
# Passing a list keeps this working on pandas versions that reject a bare string.
df = df.dropna(subset=["price", "number_of_bedrooms", "surface"])

# Keep residential sales only, and leave out whole blocks / groups of units.
# Missing sale types are dropped (== is False on NaN); missing property
# types and subtypes are kept, exactly as the label-based drops did.
keep_rows = (
    (df["type_of_sale"] == "residential_sale")
    & (df["subtype_of_property"] != "APARTMENT_BLOCK")
    & ~df["type_of_property"].isin(["HOUSE_GROUP", "APARTMENT_GROUP"])
)
# .copy() gives an independent frame so the column assignments below (and in
# later cells) never write into a slice of the unfiltered data.
df = df.loc[keep_rows].copy()

# REPLACE NAN BY 0
# A missing flag is read as "feature absent" and the column is stored as int.
for flag_column in ("swimming_pool", "garden", "terrace"):
    df[flag_column] = df[flag_column].fillna(0).astype(int)



# FEATURE ENGINEERING

In [10]:
# TRANSLATE CATEGORIES INTO NUMERIC VALUES

def _encode_categories(series, mapping):
    """Translate the labels of ``series`` to integers through ``mapping``.

    Args:
        series: pandas Series of category labels.
        mapping: dict from label to integer code.

    Returns:
        A Series of ints with the same index as ``series``.

    Raises:
        ValueError: if ``series`` holds a value (NaN included) with no entry in
            ``mapping``. Without this check the unmapped values become NaN and
            the integer cast fails with a message that does not name them.
    """
    encoded = series.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Column {series.name!r} has values missing from its mapping: "
            f"{list(series[unmapped].unique())}"
        )
    return encoded.astype(int)


# Property type: 1 for a house, 0 for an apartment.
map_property = {"HOUSE":1, "APARTMENT":0}
df["type_of_property"] = _encode_categories(df["type_of_property"], map_property)

# State of the building: 0 for the two "needs work" labels, 1 for all the others.
map_state = {"GOOD":1, "TO_RENOVATE":0, "AS_NEW":1, "JUST_RENOVATED":1, "TO_RESTORE":0, "NO_INFO":1, "TO_BE_DONE_UP":1}
df["state_of_the_building"] = _encode_categories(df["state_of_the_building"], map_state)

# Province score: median price inside the province divided by the median price
# of the whole frame. The overall median does not depend on the province, so it
# is computed once instead of once per province.
map_provinces = list(df["province"].unique())
overall_median_price = df["price"].median()
dict_province = {
    province: df.loc[df["province"] == province, "price"].median() / overall_median_price
    for province in map_provinces
}
df["province_score"] = df["province"].map(dict_province)


# DROP FEATURES
# Columns that are removed from the frame before modelling.
columns_to_drop = [
    "province",
    "type_of_sale",
    "Unnamed: 0",
    "id",
    "postal_code",
    "locality",
    "fully_equipped_kitchen",
    "kitchen_type",
    "land_surface",
    "number_of_facades",
    "garden_surface",
    "terrace_surface",
    "furnished",
    "open_fire",
    "region",
    "subtype_of_property",
]
df = df.drop(columns=columns_to_drop)


# DROP OUTLIERS
# Each filter is computed on the frame left by the previous one, so the
# order price -> surface -> bedrooms matters.

# How many standard deviations away from the mean a value may lie.
factor = 2

# Price: strictly below mean + factor * std, and at least 20000.
price_col = df["price"]
upper_lim = price_col.mean() + price_col.std() * factor
df = df[price_col.ge(20000) & price_col.lt(upper_lim)]

# Surface: strictly inside mean +/- factor * std.
surface_col = df["surface"]
upper_lim1 = surface_col.mean() + surface_col.std() * factor
lower_lim1 = surface_col.mean() - surface_col.std() * factor
df = df[surface_col.gt(lower_lim1) & surface_col.lt(upper_lim1)]

# Bedrooms: strictly inside mean +/- factor * std.
bedrooms_col = df["number_of_bedrooms"]
upper_lim2 = bedrooms_col.mean() + bedrooms_col.std() * factor
lower_lim2 = bedrooms_col.mean() - bedrooms_col.std() * factor
df = df[bedrooms_col.gt(lower_lim2) & bedrooms_col.lt(upper_lim2)]


# STANDARDIZATION
# Z-score the two continuous predictors: subtract the column mean, divide by
# the column standard deviation (both taken from the current frame).
for scaled_column in ("number_of_bedrooms", "surface"):
    column_mean = df[scaled_column].mean()
    column_std = df[scaled_column].std()
    df[scaled_column] = (df[scaled_column] - column_mean) / column_std

# EXTRACT
# Write the processed frame to disk.
# NOTE(review): the index is written as an unnamed first column; presumably
# that is where an "Unnamed: 0" column comes from on reload - confirm before
# switching to index=False, since readers of this file may expect it.
df.to_csv("13_cleaned_process_dataset.csv")

# REGRESSION TRAINING

In [11]:
# Separate the target (price) from the predictors (every other column).
y_train = df["price"]
X_train = df.drop(columns="price")

# Build the linear regression and fit it in one go; fit() returns the estimator.
model = LinearRegression().fit(X_train, y_train)

# Score of the fitted model on the very data it was trained on
# (not a held-out estimate).
model.score(X_train, y_train)

0.4865129971499542