In [651]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso

#Path of the raw listings CSV (relative path, resolved from the notebook's working directory)
filename = "../.gitignore/data_a (Florent).csv"

#skipinitialspace=True tells pandas to skip the spaces that directly follow a delimiter;
#spaces located before the next delimiter are not affected by this option
df = pd.read_csv(filename, skipinitialspace=True)

In [652]:
#Defining all the functions I'll use

def nb_rows_columns(df):
    """Print how many rows, then how many columns, the dataframe ``df`` has."""
    dimensions = df.shape
    row_line = f"There are {dimensions[0]} rows"
    print(row_line)
    column_line = f"There are {dimensions[1]} columns"
    print(column_line)

def nb_null_values(df):
    """Print, for every column of ``df``, the number of missing values it holds."""
    null_counts = df.isna().sum()
    print(f"Here is the null values for each column :\n{null_counts}")

def duplicates(df):
    """Print how many rows of ``df`` are exact repeats of an earlier row.

    The count is the difference between the number of rows before and
    after ``drop_duplicates``; a dedicated message is printed when it is zero.
    """
    nb_duplicates = df.shape[0] - df.drop_duplicates().shape[0]
    if not nb_duplicates > 0:
        print("There is no duplicates in the dataframe")
        return
    print(f"There are {nb_duplicates} duplicates in the dataframe")


In [653]:
#Quick look at the raw dataframe before any cleaning:
#its size, the number of missing values per column, and the number of duplicated rows
nb_rows_columns(df)
nb_null_values(df)
duplicates(df)

There are 78192 rows
There are 20 columns
Here is the null values for each column :
Unnamed: 0                              0
To rent                              3992
To sell                              3992
Price                                3992
Number of rooms                      4915
Living Area                         15887
Fully equipped kitchen              30252
Furnished                           27135
Open fire                               0
Terrace                                 0
Area of the terrace                 16786
Garden                                  0
Area of the garden                   8579
Surface of the land                 43849
Surface area of the plot of land    43849
Number of facades                   24991
Swimming pool                           0
State of the building               25132
zipcode                                 0
type                                    1
dtype: int64
There is no duplicates in the dataframe


In [654]:
#The client's question drives the selection of the data:
#"How much will I be able to rent this apartment for in Belgium?"

#Keep only the listings flagged as rentals in the source data
is_rental = df['To rent'] == True
df_rent = df[is_rental]

#Among the rentals, keep the apartments and the ground-floor flats ("Rez-de-chaussée")
wanted_types = ['Appartement', 'Rez-de-chaussée']
df_rent_appart = df_rent[df_rent['type'].isin(wanted_types)].copy()

#Creating the column "Province" from the zipcode

#Inclusive zipcode blocks and the province label each block maps to.
#The blocks are contiguous from 1000 to 9999, so every 4-digit zipcode gets a label
#(Namur covers the whole 5000-5999 block, leaving no gap before Hainaut starts at 6000).
#The labels are spelled exactly as before (including 'Flandre occiendentale') because
#they become the names of the dummy columns created later in the notebook.
PROVINCE_ZIPCODE_RANGES = [
    (1000, 1299, 'Brussels'),
    (1300, 1499, 'Brabant wallon'),
    (1500, 1999, 'Brabant flamand'),
    (2000, 2999, 'Anvers'),
    (3000, 3499, 'Brabant flamand'),
    (3500, 3999, 'Limbourg'),
    (4000, 4999, 'Liege'),
    (5000, 5999, 'Namur'),
    (6000, 6599, 'Hainaut'),
    (6600, 6999, 'Luxembourg'),
    (7000, 7999, 'Hainaut'),
    (8000, 8999, 'Flandre occiendentale'),
    (9000, 9999, 'Flandre orientale'),
]


def province_from_zipcode(zipcode):
    """Return the province label whose zipcode block contains ``zipcode``.

    Returns None when the value falls in no block (for instance a missing
    value or a number outside 1000-9999).
    """
    for low, high, province in PROVINCE_ZIPCODE_RANGES:
        if low <= zipcode <= high:
            return province
    return None


#One label per row, in row order, so the list always has the length of the dataframe
Province = [province_from_zipcode(zipcode) for zipcode in df_rent_appart['zipcode']]

#Fail with an explicit message instead of producing a column of the wrong length
unmapped_zipcodes = sorted(
    {zipcode for zipcode, province in zip(df_rent_appart['zipcode'], Province) if province is None}
)
if unmapped_zipcodes:
    raise ValueError(f"No province is defined for these zipcodes: {unmapped_zipcodes}")

df_rent_appart['Province'] = Province


In [655]:
#Removing the columns that are not relevant to answer the client's question

columns = [
    'Unnamed: 0', 'To rent', 'To sell', 'Fully equipped kitchen', 'Furnished',
    'Area of the terrace', 'Area of the garden', 'Surface of the land',
    'Surface area of the plot of land', 'Number of facades', 'zipcode', 'type',
]
df_rent_appart = df_rent_appart.drop(columns, axis=1)

#Check how many missing values remain in the columns that were kept
nb_null_values(df_rent_appart)

Here is the null values for each column :
Price                       0
Number of rooms           124
Living Area              2288
Open fire                   0
Terrace                     0
Garden                      0
Swimming pool               0
State of the building    3132
Province                    0
dtype: int64


In [656]:
#Removing the outliers by hand

#Rents above 8000 are removed (this is what gets rid of the 120120 and 66000 rents)
df_rent_appart = df_rent_appart.query("Price <= 8000")

#Removing the colocations: a living area above 100 square m rented for less than 500
is_colocation = df_rent_appart['Living Area'].gt(100) & df_rent_appart['Price'].lt(500)
rows_to_delete = df_rent_appart[is_colocation]
df_rent_appart = df_rent_appart[~is_colocation]

#Rents below 275 are removed (for instance the rent of 45 for 89 square m)
df_rent_appart = df_rent_appart.query("Price >= 275")

#Only living areas strictly below 485 square m are kept:
#there are very few rents to learn from for the huge living areas
df_rent_appart = df_rent_appart[df_rent_appart['Living Area'] < 485]

#Drop the rows where one of the features used later is still missing
required_columns = ['Living Area', 'State of the building', 'Number of rooms']
df_rent_appart = df_rent_appart.dropna(subset=required_columns)

In [657]:
#Replacing each non-numerical column by one indicator column per value, using get_dummies

categorical_columns = ['Province', 'State of the building']
df_rent_appart = pd.get_dummies(df_rent_appart, columns=categorical_columns)

#Display the resulting column names (their order is the order of the features in the models)
df_rent_appart.columns

Index(['Price', 'Number of rooms', 'Living Area', 'Open fire', 'Terrace',
       'Garden', 'Swimming pool', 'Province_Anvers',
       'Province_Brabant flamand', 'Province_Brabant wallon',
       'Province_Brussels', 'Province_Flandre occiendentale',
       'Province_Flandre orientale', 'Province_Hainaut', 'Province_Liege',
       'Province_Limbourg', 'Province_Luxembourg', 'Province_Namur',
       'State of the building_\nBon\n',
       'State of the building_\nExcellentétat\n',
       'State of the building_\nFraîchementrénové\n',
       'State of the building_\nÀrafraîchir\n',
       'State of the building_\nÀrestaurer\n',
       'State of the building_\nÀrénover\n'],
      dtype='object')

In [658]:
#Modelling the rent of an apartment in Belgium with LinearRegression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

#Features: every column except the target
#(Number of rooms, Living Area, Open fire, Terrace, Garden, Swimming pool,
# plus the Province and State of the building indicator columns)
features = df_rent_appart.drop(columns = 'Price')
X = np.array(features)

#Target: renting price of an apartment or rez-de-chaussée
y = np.array(df_rent_appart['Price'])

#Splitting the data: 80% for training, 20% for testing, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#Fitting the algorithm on the training part
regressor = LinearRegression()
regressor.fit(X_train, y_train)

#Score of the model (R² for a regressor) on the data it was trained on
training_score = regressor.score(X_train, y_train)
print(f"The score of the model for training part : {training_score}")

#Score of the model on the data it has never seen
test_score = regressor.score(X_test, y_test)
print(f"The score of the model for testing part : {test_score}")

#One new listing to price, described by feature name instead of by position:
#every feature not listed here is left at 0 (False for the amenities, "not this value" for the indicators)
sample_listing = {
    'Number of rooms': 2.0,
    'Living Area': 90,
    'Province_Brussels': 1,
    'State of the building_\nExcellentétat\n': 1,
}

#Place each value at the position of its column; get_loc raises KeyError if a name does not exist,
#so a renamed or missing column cannot silently shift the values
feature_names = features.columns
use_samples = [0] * len(feature_names)
for feature_name, value in sample_listing.items():
    use_samples[feature_names.get_loc(feature_name)] = value

#Convert into a Numpy array with one row
Xnew = np.array(use_samples).reshape((1,-1))

#Make a prediction for the new listing
ynew = regressor.predict(Xnew)
print(f"Renting Price Predicted = {ynew[0].round(2)} €")

The score of the model for training part : 0.6360367029106249
The score of the model for testing part : 0.6652749848859687
Renting Price Predicted = 1490.15 €


In [659]:
#Modelling the rent of an apartment in Belgium with GradientBoostingRegressor

from sklearn.model_selection import train_test_split

#Features: every column except the target
features = df_rent_appart.drop(columns = 'Price')
X = np.array(features)

#Target: renting price
y = np.array(df_rent_appart['Price'])

#Splitting the data: 80% for training, 20% for testing, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#Fitting the algorithm on the training part.
#random_state is fixed so that two runs of this cell give the same model and the same scores
regressor = GradientBoostingRegressor(random_state = 42)
regressor.fit(X_train, y_train)

#Score of the model (R² for a regressor) on the data it was trained on
training_score = regressor.score(X_train, y_train)
print(f"The score of the model for training part : {training_score}")

#Score of the model on the data it has never seen
test_score = regressor.score(X_test, y_test)
print(f"The score of the model for testing part : {test_score}")

#One new listing to price, described by feature name instead of by position:
#every feature not listed here is left at 0 (False for the amenities, "not this value" for the indicators)
sample_listing = {
    'Number of rooms': 2.0,
    'Living Area': 90,
    'Province_Brussels': 1,
    'State of the building_\nExcellentétat\n': 1,
}

#Place each value at the position of its column; get_loc raises KeyError if a name does not exist,
#so a renamed or missing column cannot silently shift the values
feature_names = features.columns
use_samples = [0] * len(feature_names)
for feature_name, value in sample_listing.items():
    use_samples[feature_names.get_loc(feature_name)] = value

#Convert into a Numpy array with one row
Xnew = np.array(use_samples).reshape((1,-1))

#Make a prediction for the new listing
ynew = regressor.predict(Xnew)
print(f"Renting Price Predicted = {ynew[0].round(2)} €")

The score of the model for training part : 0.7638712451519573
The score of the model for testing part : 0.7351323571524035
Renting Price Predicted = 1296.04 €


In [None]:
#Preparing the data and the algorithms to compare

#LinearRegression is imported here so this cell does not depend on an earlier cell having imported it
from sklearn.linear_model import LinearRegression

#Features: every column except the target
X = np.array(df_rent_appart.drop(columns = 'Price'))

#Target: renting price
y = np.array(df_rent_appart['Price'])

#One instance per algorithm.
#The Lasso instance must not be named `Lasso`: that would replace the imported class
#by an instance, and running this cell a second time would fail.
GB = GradientBoostingRegressor(random_state = 42) #fixed seed so the scores are repeatable
LR = LinearRegression()
LS = Lasso()
R = Ridge()
EL = ElasticNet()


def modeling_data (X, y, regressor, use_samples = None):
    """Fit ``regressor`` on a train split, print its scores and price one sample.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per listing.
    y : array-like
        Target values (renting prices), one per row of ``X``.
    regressor : estimator
        Object exposing ``fit``, ``score`` and ``predict``; it is fitted in place.
    use_samples : sequence, optional
        Feature values of the single listing to price, in the same column
        order as ``X``. When omitted, the 23-value sample used in the
        previous cells is priced.

    Prints the training score, the testing score and the predicted rent.
    Example: ``modeling_data(X, y, Ridge())``.
    """
    #Splitting the data between the training part and the testing part
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    #Applying the algorithm on the training part
    regressor.fit(X_train, y_train)

    #Asking for the score of our model for its training part
    training_score = regressor.score(X_train, y_train)
    print(f"The score of the model for training part : {training_score}")

    #Asking the score of our model for its testing part
    test_score = regressor.score(X_test, y_test)
    print(f"The score of the model for testing part : {test_score}")

    #Function to give a new instance to the fitted model
    def prediction (sample):
        """Print the rent predicted by the fitted regressor for one listing."""
        #Convert into a Numpy array with one row
        Xnew = np.array(sample).reshape((1,-1))

        #Make a prediction
        ynew = regressor.predict(Xnew)
        print(f"Renting Price Predicted = {ynew[0].round(2)} €")

    if use_samples is None:
        #Default sample: 2 rooms, 90 square m, the four amenities set to False, and two
        #indicator columns set to 1 (10th and 19th values). With the column order displayed
        #after get_dummies these are Province_Brussels and the "Excellentétat" building state.
        use_samples = [2.0, 90, False, False, False, False, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]

    prediction(use_samples)