In [997]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score


In [998]:
filename = "../.gitignore/data_a (Florent).csv"
#Open the csv file with no space before and after the values
df = pd.read_csv(filename, skipinitialspace=True) 

In [999]:
#Defining all the functions I'll use

#Functions for the preprocessing

#Function who displays the number of rows and columns of a given dataframe
def nb_rows_columns (df):
    print(f"There are {df.shape[0]} rows")
    print(f"There are {df.shape[1]} columns")

#Function who displays the number of null values per column
def nb_null_values (df):
    print(f"Here is the null values for each column :\n{df.isnull().sum()}")

#Function to drop the duplicates if there are some
def duplicates (df):
    nb_duplicates = len(df) - len(df.drop_duplicates())
    if nb_duplicates > 0:
        df = df.drop_duplicates()
        print(f"There were {nb_duplicates} duplicates in the dataframe")
        return df
    else :
        print("There is no duplicates in the dataframe")

#Function for the modeling part

#Defining a function to model the datas
def modeling_data (X, y, sample, regressors):
    for regressor in regressors :    
        #Splitting the data between the training part and the testing part
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test =  train_test_split(X,y, test_size = 0.2, random_state = 42 )

    
        #Calling the algorithm we want to apply on our model and applying it
        regressor = regressor
        regressor.fit(X_train, y_train)

        #Asking for the score of our model for its training part
        training_score = regressor.score(X_train, y_train)
        print(f"The score of the model {regressor} for training part is : {(training_score*100).round(2)} %")

        #Making a prediction with our model on the testing data part
        regressor.predict(X_test)

        #Asking the score of our model for its testing part
        test_score = regressor.score(X_test, y_test)
        print(f"The score of the model {regressor} for testing part is : {(test_score*100).round(2)} %")

        scores = cross_val_score(regressor, X, y, cv=5) # cv is the number of folds (k)
        print(f"The results for the cross-validation with {regressor} model is {scores}")

        # It is always a good practice to show the mean AND the standard deviation of the model accuracy
        print("Accuracy: {:.2f}% (+/- {:.2f})".format(scores.mean() * 100, scores.std() * 100))

        if (type (sample) == list) :
            #Convert into Numpy array
            Xnew = np.array(sample).reshape((1,-1))

            #Make a prediction
            ynew = regressor.predict(Xnew)
            print(f"Renting Price Predicted with {regressor} = {ynew[0].round(2)} €\n")


In [1000]:
#Exploring a bit the dataframe before working on it
nb_rows_columns(df)
nb_null_values(df)
duplicates(df)

There are 78192 rows
There are 20 columns
Here is the null values for each column :
Unnamed: 0                              0
To rent                              3992
To sell                              3992
Price                                3992
Number of rooms                      4915
Living Area                         15887
Fully equipped kitchen              30252
Furnished                           27135
Open fire                               0
Terrace                                 0
Area of the terrace                 16786
Garden                                  0
Area of the garden                   8579
Surface of the land                 43849
Surface area of the plot of land    43849
Number of facades                   24991
Swimming pool                           0
State of the building               25132
zipcode                                 0
type                                    1
dtype: int64
There is no duplicates in the dataframe


In [1001]:
#Selecting the data I'll work on knowing my client's question is : 
#How much will I be able to rent this appartment in Belgium ?

#Taking all the rent buildings directly from the source
df_rent = df.query("`To rent` == True" )

#Taking only the appartments and Rez-de-chaussée
df_rent_appart = df_rent.query("type == 'Appartement' | type == 'Rez-de-chaussée'").copy()

#Creating the column "Province"

Province = []
for zipcode in df_rent_appart['zipcode']:
    if (zipcode >= 1000) & (zipcode <= 1299): Province.append('Brussels')
    elif (zipcode >= 1300) & (zipcode <= 1499): Province.append('Brabant wallon')
    elif (zipcode >= 2000) & (zipcode <= 2999): Province.append('Anvers')
    elif (zipcode >= 3500) & (zipcode <= 3999): Province.append('Limbourg')
    elif (zipcode >= 4000) & (zipcode <= 4999): Province.append('Liege')
    elif (zipcode >= 5000) & (zipcode <= 5680): Province.append('Namur')
    elif (zipcode >= 6600) & (zipcode <= 6999): Province.append('Luxembourg')
    elif (zipcode >= 8000) & (zipcode <= 8999): Province.append('Flandre occiendentale')
    elif (zipcode >= 9000) & (zipcode <= 9999): Province.append('Flandre orientale')
    elif (zipcode >= 1500) & (zipcode <= 1999) or (zipcode >= 3000) & (zipcode <= 3499): Province.append('Brabant flamand')
    elif (zipcode >= 6000) & (zipcode <= 6599) or (zipcode >= 7000) & (zipcode <= 7999): Province.append('Hainaut')

df_rent_appart['Province'] = Province


In [1002]:
#Deleting the columns I find not accurate to answer my client's question

#df_rent_appart = df_rent_appart.filter(['Price', 'Living Area', 'Province', 'State of the building'], axis=1) #
columns = ['Unnamed: 0', 'To rent', 'To sell', 'Fully equipped kitchen', 'Furnished']
columns += ['Area of the terrace', 'Area of the garden', 'Surface of the land']
columns += ['Surface area of the plot of land', 'Number of facades', 'zipcode', 'type']
df_rent_appart = df_rent_appart.drop(columns=columns)
nb_null_values(df_rent_appart)

Here is the null values for each column :
Price                       0
Number of rooms           124
Living Area              2288
Open fire                   0
Terrace                     0
Garden                      0
Swimming pool               0
State of the building    3132
Province                    0
dtype: int64


In [1003]:
#Deleting the outliers manually 

#print(f"{df_rent_appart.nlargest(20, 'Price')}")
df_rent_appart = df_rent_appart[df_rent_appart['Price'] <= 8000] #Deleting the 120120 and 66000 price for renting !
#print(f"{df_rent_appart.nsmallest(20, 'Price')}")

#Deleting the colocations
rows_to_delete = df_rent_appart[ (df_rent_appart['Living Area'] > 100) & (df_rent_appart['Price'] < 500) ]
df_rent_appart.drop(rows_to_delete.index, inplace = True)
#nb_rows_columns(df_rent_appart)
#nb_rows_columns(rows_to_delete)

#Deleting the price 45 for renting 89 square m...
df_rent_appart = df_rent_appart[df_rent_appart['Price'] >= 275] 

#print(df_rent_appart.nlargest(20, 'Living Area'))
#Deleting the Living Area > 485 square m because there are very little renting prices for those huge Living area
df_rent_appart = df_rent_appart.query("`Living Area` < 485") 


#Drop the Living Area that are nulls
df_rent_appart = df_rent_appart.dropna(subset='Living Area')
df_rent_appart = df_rent_appart.dropna(subset='State of the building')
df_rent_appart = df_rent_appart.dropna(subset="Number of rooms")
nb_null_values(df_rent_appart)
nb_rows_columns(df_rent_appart)

Here is the null values for each column :
Price                    0
Number of rooms          0
Living Area              0
Open fire                0
Terrace                  0
Garden                   0
Swimming pool            0
State of the building    0
Province                 0
dtype: int64
There are 6617 rows
There are 9 columns


In [1004]:
#Putting the non_numerical values into numeric_values with get_dummies function

df_rent_appart = pd.get_dummies(data = df_rent_appart, columns = ['Province','State of the building'])
df_rent_appart.columns

Index(['Price', 'Number of rooms', 'Living Area', 'Open fire', 'Terrace',
       'Garden', 'Swimming pool', 'Province_Anvers',
       'Province_Brabant flamand', 'Province_Brabant wallon',
       'Province_Brussels', 'Province_Flandre occiendentale',
       'Province_Flandre orientale', 'Province_Hainaut', 'Province_Liege',
       'Province_Limbourg', 'Province_Luxembourg', 'Province_Namur',
       'State of the building_\nBon\n',
       'State of the building_\nExcellentétat\n',
       'State of the building_\nFraîchementrénové\n',
       'State of the building_\nÀrafraîchir\n',
       'State of the building_\nÀrestaurer\n',
       'State of the building_\nÀrénover\n'],
      dtype='object')

In [1005]:
#Modelization part using differents algorythms

#Defining the features and the target
#Features = Number of rooms, Living Area, Open fire, Terrace, Garden, Swimming pool, State of the building, Province 
X = np.array(df_rent_appart.drop(columns = 'Price'))
#Target = Renting price
y = np.array(df_rent_appart['Price']) 

#print(f"That's a sample line for the X_features : {X}")

GB = GradientBoostingRegressor()
LR = LinearRegression()
La = Lasso()
R = Ridge()
EL = ElasticNet()
DTC = DecisionTreeClassifier()

sample = [2.0, 90, False, False, False, False, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
#Modeling and predicting with GradientBoostingRegressor, LinearRegression, Lasso, Ridge, ElasticNet
modeling_data(X,y,sample,regressors=[GB, LR, La, R, EL, DTC])

The score of the model GradientBoostingRegressor() for training part is : 76.39 %
The score of the model GradientBoostingRegressor() for testing part is : 73.63 %
The results for the cross-validation with GradientBoostingRegressor() model is [0.80223503 0.74002239 0.68339809 0.65381551 0.24629312]
Accuracy: 62.52% (+/- 19.61)
Renting Price Predicted with GradientBoostingRegressor() = 1296.04 €

The score of the model LinearRegression() for training part is : 63.6 %
The score of the model LinearRegression() for testing part is : 66.53 %
The results for the cross-validation with LinearRegression() model is [0.71620634 0.65544746 0.62508062 0.58251928 0.05337345]
Accuracy: 52.65% (+/- 24.05)
Renting Price Predicted with LinearRegression() = 1490.15 €

The score of the model Lasso() for training part is : 63.48 %
The score of the model Lasso() for testing part is : 66.47 %
The results for the cross-validation with Lasso() model is [0.71524217 0.65921321 0.62017886 0.58158571 0.06107617]
Ac



The results for the cross-validation with DecisionTreeClassifier() model is [0.06193353 0.06797583 0.06575964 0.06198035 0.05517763]
Accuracy: 6.26% (+/- 0.44)
Renting Price Predicted with DecisionTreeClassifier() = 950.0 €



In [1006]:
#Just a test need to understand deeper

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 

# Creating train-test split and classifier
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=41, test_size=0.2)
regressor= GradientBoostingRegressor(random_state=5)

# Setting all the parameters we want to test

parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
                  'subsample'    : [0.9, 0.5, 0.2, 0.1],
                  'n_estimators' : [100,500,1000, 1500],
                  'max_depth'    : [4,6,8,10]
                 }

grid_GBR = GridSearchCV(estimator=regressor, param_grid = parameters, cv = 2, n_jobs=-1)
grid_GBR.fit(X_train, y_train)


print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingRegressor(learning_rate=0.04, max_depth=4, random_state=5,
                          subsample=0.9)

 The best score across ALL searched params:
 0.6999956466082995

 The best parameters across ALL searched params:
 {'learning_rate': 0.04, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.9}
