In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import iqr
from sklearn import preprocessing
from sklearn.decomposition import PCA 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split,cross_val_score
import xgboost as xgb

  import pandas.util.testing as tm


In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
data_path='/content/drive/My Drive/Projects/housing_train.csv'

In [0]:
data=pd.read_csv(data_path)

In [0]:
data.drop(columns=['id','url','region_url','image_url','description'],inplace=True)

In [6]:
regions_list=data.region.unique()
regions_dict = {val : idx + 1 for idx, val in enumerate(regions_list)}
data['region_code'] = data['region'].map(regions_dict)
regions_dict

{'SF bay area': 48,
 'akron / canton': 272,
 'albany': 92,
 'albuquerque': 242,
 'ames': 122,
 'anchorage / mat-su': 9,
 'ann arbor': 176,
 'annapolis': 167,
 'asheville': 204,
 'ashtabula': 282,
 'athens': 90,
 'atlanta': 87,
 'augusta': 88,
 'bakersfield': 18,
 'baltimore': 168,
 'baton rouge': 145,
 'battle creek': 177,
 'bemidji': 189,
 'bend': 287,
 'billings': 215,
 'binghamton': 256,
 'birmingham': 1,
 'bismarck': 273,
 'bloomington': 115,
 'bloomington-normal': 104,
 'boise': 99,
 'boone': 214,
 'boston': 171,
 'boulder': 35,
 'bowling green': 139,
 'bozeman': 226,
 'brainerd': 196,
 'brunswick': 89,
 'buffalo': 243,
 'butte': 231,
 'cape cod / islands': 187,
 'catskills': 250,
 'cedar rapids': 131,
 'central NJ': 238,
 'central louisiana': 162,
 'central michigan': 190,
 'champaign urbana': 105,
 'charlotte': 213,
 'chautauqua': 262,
 'chicago': 106,
 'chico': 33,
 'chillicothe': 284,
 'cincinnati': 270,
 'cleveland': 276,
 'clovis / portales': 260,
 'colorado springs': 52,
 '

In [0]:
laundry_list=data.laundry_options.unique()
laundry_dict = {val : idx + 1 for idx, val in enumerate(laundry_list)}
data['laundry_options_code'] = data['laundry_options'].map(laundry_dict)

In [0]:
parking_list=data.parking_options.unique()
parking_dict = {val : idx + 1 for idx, val in enumerate(parking_list)}
data['parking_options_code'] = data['parking_options'].map(parking_dict)

In [9]:
state_list=data.state.unique()
state_dict = {val : idx + 1 for idx, val in enumerate(state_list)}
data['state_code'] = data['state'].map(state_dict)
state_dict

{'ak': 2,
 'al': 1,
 'ar': 4,
 'az': 3,
 'ca': 5,
 'co': 6,
 'ct': 7,
 'dc': 8,
 'de': 10,
 'fl': 9,
 'ga': 11,
 'hi': 12,
 'ia': 16,
 'id': 13,
 'il': 14,
 'in': 15,
 'ks': 17,
 'ky': 18,
 'la': 19,
 'ma': 23,
 'md': 22,
 'me': 20,
 'mi': 21,
 'mn': 24,
 'mo': 27,
 'ms': 25,
 'mt': 28,
 nan: 39,
 'nc': 26,
 'nd': 36,
 'ne': 29,
 'nh': 34,
 'nj': 31,
 'nm': 32,
 'nv': 30,
 'ny': 33,
 'oh': 35,
 'ok': 37,
 'or': 38}

In [0]:
housetype_list=data.type.unique()
housetype_dict = {val : idx + 1 for idx, val in enumerate(housetype_list)}
#Mapping codes to their categories
data['type_code'] = data['type'].map(housetype_dict)

In [11]:
for state in data.state.unique():
  reg_str='['
  sub_data=data[data['state']==state]
  for reg in sub_data.region.unique():
    reg_str=reg_str+'"'+str(regions_dict[reg])+" "+"-"+" "+reg+'",'
  reg_str=reg_str+']'
  print(state,reg_str)

al ["1 - birmingham","2 - huntsville / decatur","3 - dothan","4 - mobile","5 - montgomery","6 - florence / muscle shoals","7 - gadsden-anniston","8 - tuscaloosa",]
ak ["9 - anchorage / mat-su","10 - fairbanks","25 - kenai peninsula","26 - southeast alaska",]
az ["11 - phoenix","12 - flagstaff / sedona","13 - tucson","15 - prescott","16 - yuma","27 - mohave county","30 - show low","32 - sierra vista",]
ar ["14 - little rock","17 - fayetteville","19 - texarkana","28 - fort smith","29 - jonesboro",]
ca ["18 - bakersfield","20 - fresno / madera","21 - hanford-corcoran","22 - humboldt county","23 - inland empire","24 - los angeles","31 - gold country","33 - chico","34 - imperial county","36 - modesto","37 - orange county","38 - mendocino county","39 - merced","40 - palm springs","41 - reno / tahoe","42 - monterey bay","43 - redding","44 - sacramento","45 - san diego","46 - san luis obispo","47 - santa barbara","48 - SF bay area","49 - stockton","50 - ventura county","51 - susanville","53 - 

In [12]:
data_Clean=data.loc[
    (data['price']>-132.5) & (data['price']<2399.5) &
    (data['sqfeet']>146.0) & (data['sqfeet']<1762.0) &
    (data['beds']>0) & (data['beds']<4) &
    (data['baths']>0) & (data['baths']<4) &
    (data['lat']>-90) & (data['lat']<90) &
    (data['long']>-180) & (data['long']<180)
]
data_Clean.shape

(228384, 22)

In [13]:
data_Clean.type.unique()

array(['apartment', 'house', 'manufactured', 'townhouse', 'condo',
       'duplex', 'in-law', 'cottage/cabin', 'flat', 'loft', 'land',
       'assisted living'], dtype=object)

In [0]:
cols = [col for col in data.columns if col not in ['type','region','laundry_options','parking_options','state','price']]
X=data_Clean[cols]
Y=data_Clean['price']

In [0]:
#Train Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y.values, test_size=0.33, random_state=42)

In [16]:
XGR = xgb.XGBRegressor()
param_grid = {"objective":['reg:squarederror'],
              'colsample_bytree': [0.7],
              'learning_rate': [0.06],
              'max_depth': [10], 
              'alpha': [1], 
              'n_estimators':[2500]}

grid_search = GridSearchCV(estimator = XGR, param_grid = param_grid,cv=3, verbose=2)
XGR_GS = grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] alpha=1, colsample_bytree=0.7, learning_rate=0.06, max_depth=10, n_estimators=2500, objective=reg:squarederror 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  alpha=1, colsample_bytree=0.7, learning_rate=0.06, max_depth=10, n_estimators=2500, objective=reg:squarederror, total= 7.4min
[CV] alpha=1, colsample_bytree=0.7, learning_rate=0.06, max_depth=10, n_estimators=2500, objective=reg:squarederror 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.4min remaining:    0.0s


[CV]  alpha=1, colsample_bytree=0.7, learning_rate=0.06, max_depth=10, n_estimators=2500, objective=reg:squarederror, total= 7.3min
[CV] alpha=1, colsample_bytree=0.7, learning_rate=0.06, max_depth=10, n_estimators=2500, objective=reg:squarederror 
[CV]  alpha=1, colsample_bytree=0.7, learning_rate=0.06, max_depth=10, n_estimators=2500, objective=reg:squarederror, total= 7.2min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 21.9min finished


In [17]:
XGR_GS.best_estimator_

XGBRegressor(alpha=1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0,
             importance_type='gain', learning_rate=0.06, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=None, n_estimators=2500,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [0]:
Y_pred_XGR = XGR_GS.predict(X_test)

In [0]:
import pickle

# Saving model to disk

pickle.dump(XGR_GS, open('/content/drive/My Drive/Projects/HouseRent_XGB_1.pkl','wb'))

In [20]:
model = pickle.load(open('/content/drive/My Drive/Projects/HouseRent_XGB_1.pkl','rb'))
# ['sqfeet', 'beds', 'baths', 'cats_allowed', 'dogs_allowed', 'smoking_allowed', 'wheelchair_access', 'electric_vehicle_charge',
# 'comes_furnished', 'lat', 'long', 'region_code', 'type_code', 'laundry_options_code', 'parking_options_code', 'state_code']
print(model)
#print(model.n_features)

print('Test Result:-')
print(model.predict([[1500,3,2,0,0,0,0,1,0,120,120,10,1,2,1,2]]))

GridSearchCV(cv=3, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=nan,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1], 'colsample_bytree': [0.7],
                         'learning_rate': [0.06], 'max_depth': [10]

In [21]:
import joblib

# To dump
joblib.dump(XGR_GS, '/content/drive/My Drive/Projects/HouseRent_Joblib_XGB.joblib' + '.gz', compress=('gzip', 9)) 

['/content/drive/My Drive/Projects/HouseRent_Joblib_XGB.joblib.gz']

In [22]:
#model = joblib.load(open('/content/drive/My Drive/Projects/HouseRent_Joblib_XGB.joblib'))
model = joblib.load('/content/drive/My Drive/Projects/HouseRent_Joblib_XGB.joblib' + '.gz')
# ['sqfeet', 'beds', 'baths', 'cats_allowed', 'dogs_allowed', 'smoking_allowed', 'wheelchair_access', 'electric_vehicle_charge',
# 'comes_furnished', 'lat', 'long', 'region_code', 'type_code', 'laundry_options_code', 'parking_options_code', 'state_code']
print(model)
#print(model.n_features)

print('Test Result:-')
print(model.predict([[1500,3,2,0,0,0,0,1,0,120,120,10,1,2,1,2]]))

GridSearchCV(cv=3, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=nan,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1], 'colsample_bytree': [0.7],
                         'learning_rate': [0.06], 'max_depth': [10]

In [24]:
pip freeze | grep scikit-learn

scikit-learn==0.22.2.post1


In [23]:
pip freeze | grep xgboost

xgboost==0.90
