In [7]:
import datetime
import pickle

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split


# Imputation for Cafe and prom_part variables

In [8]:
# Load the raw training data; the path is relative to the notebook's working directory.
train_csv = 'train.csv'
df = pd.read_csv(train_csv)

In [9]:
# Summarise the raw frame: index range, column count, dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30471 entries, 0 to 30470
Columns: 292 entries, id to price_doc
dtypes: float64(119), int64(157), object(16)
memory usage: 67.9+ MB


In [10]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452


Get the columns with missing values

In [None]:
# Boolean mask over the columns: True where a column holds at least one NaN.
has_missing = df.isna().any(axis=0)

# Names of the affected columns, in their original order.
missing_cols = df.columns[has_missing].tolist()
missing_cols

['life_sq',
 'floor',
 'max_floor',
 'material',
 'build_year',
 'num_room',
 'kitch_sq',
 'state',
 'preschool_quota',
 'school_quota',
 'hospital_beds_raion',
 'raion_build_count_with_material_info',
 'build_count_block',
 'build_count_wood',
 'build_count_frame',
 'build_count_brick',
 'build_count_monolith',
 'build_count_panel',
 'build_count_foam',
 'build_count_slag',
 'build_count_mix',
 'raion_build_count_with_builddate_info',
 'build_count_before_1920',
 'build_count_1921-1945',
 'build_count_1946-1970',
 'build_count_1971-1995',
 'build_count_after_1995',
 'metro_min_walk',
 'metro_km_walk',
 'railroad_station_walk_km',
 'railroad_station_walk_min',
 'ID_railroad_station_walk',
 'cafe_sum_500_min_price_avg',
 'cafe_sum_500_max_price_avg',
 'cafe_avg_price_500',
 'cafe_sum_1000_min_price_avg',
 'cafe_sum_1000_max_price_avg',
 'cafe_avg_price_1000',
 'cafe_sum_1500_min_price_avg',
 'cafe_sum_1500_max_price_avg',
 'cafe_avg_price_1500',
 'cafe_sum_2000_min_price_avg',
 'cafe_su

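Get the cafe columns that contain missing values and fill them with the median of their `sub_area` group. Then fill the missing values of the `prom_part_5000` column with its overall median.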

In [None]:
# Work on a copy so the raw frame `df` stays untouched and this cell can be re-run safely.
df2 = df.copy()

# Cafe columns that contain missing values. The filter previously matched 'build',
# which imputed the build_* columns and left every cafe_* column unfilled.
cafe_cols = [col for col in missing_cols if 'cafe' in col]

# Same columns plus the grouping key needed for the per-district medians.
cols_fill = ['sub_area'] + cafe_cols

# Replace each missing cafe value with the median of its sub_area group.
# A group whose values are all missing has a NaN median and therefore stays NaN.
df2[cafe_cols] = (
    df2[cols_fill]
    .groupby('sub_area')[cafe_cols]
    .transform(lambda x: x.fillna(x.median(skipna=True)))
)

# prom_part_5000 is imputed with its overall (not per-district) median.
df2['prom_part_5000'] = df2['prom_part_5000'].fillna(df2['prom_part_5000'].median())

## Create model data sets

Create the feature and target set

In [None]:
# Feature matrix: every numeric column except the target. "timestamp" is dropped
# explicitly; any remaining missing value is replaced by the constant 1.
# NOTE(review): 1 is an arbitrary placeholder for the still-missing values - confirm it is intended.
X = (
    df2.drop(columns=["timestamp", "price_doc"])
    .select_dtypes(include=['number'])
    .fillna(1)
)
print("The shape of X is: ", X.shape)

# Target vector: an independent copy of the price_doc column.
y = df2["price_doc"].copy()
print("The shape of y is: ", y.shape)

The shape of X is:  (30471, 275)
The shape of y is:  (30471,)


Create the train test split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Parameter Search
Set up the regressor, parameter search grid, and scoring metric

In [None]:
# Base estimator handed to the search; the fixed seed makes the forests reproducible
# and n_jobs=-1 lets each fit use all available cores.
clf = RandomForestRegressor(random_state=42, n_jobs=-1)

# Shuffled 5-fold CV. The target is a regression target, so plain KFold is used:
# StratifiedKFold is designed for class labels and would treat each distinct
# target value as its own class.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate values per hyper-parameter (single-element lists are held fixed).
n_estimators = [100]
# 1.0 means "consider every feature at each split". That is what 'auto' meant for
# regressors before the option was deprecated and removed from scikit-learn.
max_features = [1.0, 'sqrt']
min_samples_split = [10, 100, 500]
min_samples_leaf = [1]
min_impurity_decrease = [0.0]

# Search space: parameter name -> list of candidate values.
param_dist = {'n_estimators': n_estimators,
              'max_features': max_features,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'min_impurity_decrease': min_impurity_decrease
             }

# Scorer name -> scorer; the key is what `refit` refers to when the search is built.
scoring = {  'neg_mean_squared_error':'neg_mean_squared_error'}

Run the parameter search.

In [None]:
%%time
# If you don't want to wait 45+ min for this to run, skip to next cell

# RandomizedSearchCV will iterate over the 9 possible tuning combinations from the cell above
n_iter_search = 15
rf_random_search = RandomizedSearchCV(clf, param_distributions=param_dist, scoring=scoring, cv = cv, random_state=42,
                                      n_iter=n_iter_search,
                                      refit='neg_mean_squared_error')
rf_random_search.fit(X_train, y_train)



Save the fitted search so we don't need to re-run it every time.

In [None]:
# Persist the fitted search object to disk.
filename = 'rf_random_search.p'

# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way (the bare open() call left the handle dangling).
with open(filename, 'wb') as pickle_file:
    pickle.dump(rf_random_search, pickle_file)

Display the results

In [None]:
# cv_results_ holds the per-candidate results of the search; wrap it in a
# DataFrame so each sampled parameter setting is shown as one row.
search_results = rf_random_search.cv_results_
rf_cv_results_df = pd.DataFrame(search_results)
rf_cv_results_df