In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv("data/train_set.csv", sep = "\t")
print(len(df))

4764


In [2]:
X = df.drop(["ID", "Log price (1 billion VND)", 'Latitude', 'Longitude'], axis = 1)
y = df['Log price (1 billion VND)'].values

In [3]:
num_feats = ['Log area (square meters)', '(Log) Bedrooms', '(Log) WC', '(Log) Number of floors',\
    'Distance to center 0', 'Distance to center 1']

In [4]:
cat_feats = ['Parking',
       'For rent', 'Infrastructure', 'Facade','Cluster']

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feats),
        ('cat', OneHotEncoder(), cat_feats)])

X = preprocessor.fit_transform(X)

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

In [7]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

scores = cross_val_score(model, X, y, cv = 5, scoring = 'r2')

In [8]:
scores.mean()

0.374958956840273

In [9]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state = 0)
scores = cross_val_score(model, X, y, cv = 5, scoring = 'r2')
print(scores.mean())

0.5787966027810206


In [15]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['sqrt'] # ko co auto
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [16]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [17]:
# First create the base model to tune 
regressor = RandomForestRegressor(random_state = 42)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = regressor,
                               param_distributions = random_grid,
                               n_iter = 100, cv = 3, verbose=2, random_state=42,
                               n_jobs = -1)
rf_random.fit(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [18]:
rf_random.best_params_

{'n_estimators': 822,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': False}

In [19]:
rf_random.best_score_

0.5705100295977669

In [38]:
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV

# Number of trees in random forest
n_estimators = range(300, 600, 100)
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [20, 30, 40, 50]
# Minimum number of samples required to split a node
min_samples_split = [5,6]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2]
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid
parameters = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

regressor = RandomForestRegressor()

grid_search = GridSearchCV(estimator = regressor,
                           param_grid = parameters,
                           cv = 3, n_jobs = -1, verbose = 2)

grid_search = grid_search.fit(X, y)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [None]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 30,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 500}

In [None]:
model2 = grid_search.best_estimator_

cross_val_score(model2, X, y, cv = 5, scoring = 'r2').mean()

0.46206246186044037

In [None]:
model = rf_random.best_estimator_
model.fit(X, y)

cross_val_score(model, X, y, cv = 5, scoring = 'r2').mean()

0.46092763236364576

In [None]:
rf_new = RandomForestRegressor(random_state = 0, n_estimators=1200, min_samples_leaf=2,\
    min_samples_split=5, max_features='sqrt', max_depth=20, bootstrap=True)

rf_new.fit(X, y)

scores = cross_val_score(rf_new, X, y, cv = 5, scoring = 'r2')
scores

array([0.55155041, 0.573592  , 0.59296369, 0.59721927, 0.54199608])

In [None]:
from sklearn.metrics import r2_score

test = pd.read_csv("data/test_set.csv", sep = "\t")
X_test = test.drop(["ID", "Log price (1 billion VND)", 'Latitude', 'Longitude'], axis = 1)
y_test = test['Log price (1 billion VND)'].values
X_test["Log(using area)"] = X_test["(Log) Number of floors"] * X_test["Log area (square meters)"]
X_test = preprocessor.transform(X_test)

y_pred = model.predict(X_test)

r2_score(y_test, y_pred)

0.46829390946227645

In [None]:
y_pred2 = model2.predict(X_test)
r2_score(y_test, y_pred2)

ValueError: X has 15 features, but RandomForestRegressor is expecting 16 features as input.