In [117]:
import pandas as pd
import numpy as np
import mysql.connector as sql
import geopandas as gpd
from shapely.geometry import Point
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from feature_engine.categorical_encoders import MeanCategoricalEncoder
from feature_engine import missing_data_imputers as mdi

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.5f}'.format

In [113]:
#df = pd.read_csv('Data/house_data_details_eda.csv')
#points = df.apply(lambda row: Point(row.longitude, row.latitude), axis=1) #Changing long and lat into shape
#gdf = gpd.GeoDataFrame(df, geometry=points)
#gdf.crs = {'init': 'epsg:4326'}
#gdf.head()
#gdf.plot()

In [84]:
#db_connection = sql.connect(host='Samuels-MacBook-Air.local', database='realestate_AVM',
#                            user='root', password='Building9Floor')

In [85]:
#df = pd.read_sql('SELECT * FROM properties;', con=db_connection)
#df.head()

In [118]:
raw_data = pd.read_csv('Data/house_data_details_eda.csv')
df = raw_data.copy()
df.head()

Unnamed: 0,latitude,longitude,address,property_type,home_size,lot_size,year_built,parcel_number,realtyID,county,subdivision,census,tract,lot,zoning,date,sale_price,estimated_value,sex_offenders,crime_index,enviornmental_hazards,natural_disasters,school quality,url,bedrooms,bathrooms,month
0,-118.44562,34.41838,17902 Stillmore St,Single Family Residence,1196.0,7867.0,1961.0,2844008001,1111559154.0,Los Angeles,25916,1.0,920039.0,6,SCUR2,2020-10-09,417000.0,568700.0,1,Low,6,1,Excellent,https://www.realtytrac.com/property/ca/santa-c...,3.0,2.0,2020-10
1,-118.34,34.17455,2805 W Chandler Blvd,Single Family Residence,950.0,2750.0,1954.0,2478008002,1111549760.0,Los Angeles,9852,2.0,311300.0,52,BUR1*,2020-10-09,770000.0,684100.0,4,Low,5,1,Above Average,https://www.realtytrac.com/property/ca/burbank...,2.0,1.0,2020-10
2,-118.38114,33.81235,280 Via Linda Vis,Single Family Residence,2029.0,7007.0,1948.0,7514018019,1111567119.0,Los Angeles,10302,4.0,651304.0,19,TORR-LO,2020-10-09,1250000.0,1437000.0,3,Low,3,2,Excellent,https://www.realtytrac.com/property/ca/redondo...,2.0,2.0,2020-10
3,-117.99154,33.99928,14852 Edgeridge Dr,Single Family Residence,3046.0,51836.4,1949.0,8221018031,1111565735.0,Los Angeles,2610,2.0,408402.0,13,LCA110000*,2020-10-09,1225000.0,1061000.0,0,Moderate,2,1,Above Average,https://www.realtytrac.com/property/ca/haciend...,3.0,2.0,2020-10
4,-117.99388,34.07099,12844 Waltham St,Single Family Residence,1080.0,12251.0,1947.0,8556005010,1111564332.0,Los Angeles,10104,3.0,404701.0,283,BPR1*,2020-10-09,480000.0,558900.0,4,Moderate,3,1,Average,https://www.realtytrac.com/property/ca/baldwin...,3.0,1.0,2020-10


In [121]:
features = ['latitude', 'longitude', 'home_size', 'lot_size', 'year_built', 'property_type', 'crime_index', 'school quality', 'month']

targets = df['sale_price']
inputs = df[features]

In [122]:
x_train, x_test, y_train, y_test = train_test_split(inputs, targets, test_size=.2, random_state=24)

In [124]:
#Encode the categorical variable by the mean sale price of each category
mean_enc = MeanCategoricalEncoder(variables=['property_type', 'crime_index', 'month', 'school quality'])
mean_enc.fit(x_train, y_train)

x_train = mean_enc.transform(x_train)
x_test = mean_enc.transform(x_test)

#Impute the missing values for month
imputer = mdi.EndTailImputer(distribution='skewed', tail='left')
imputer.fit(x_train)

x_test = imputer.transform(x_test)

In [40]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

In [125]:
def acc_model(params):
    rf = RandomForestRegressor(**params)
    return cross_val_score(rf, x_train, y_train).mean()

param_space = {
    
               'n_estimators': hp.choice('n_estimators', range(25,500)), 
               'max_features': hp.choice('max_features',range(1,7)), 
               'min_samples_leaf': hp.choice('min_samples_leaf',range(1,3))
            }

best = 0

def f(params):
    global best
    acc = acc_model(params)
    if acc > best:
        best = acc
    print ('new best:', best, params)
    return {'loss': acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, param_space, algo=tpe.suggest, max_evals=100, trials=trials)
print ('best:')
print (best)

new best:                                              
0.7273397312333942                                     
{'max_features': 5, 'min_samples_leaf': 2, 'n_estimators': 106}
new best:                                                                        
0.7273435401137738                                                               
{'max_features': 6, 'min_samples_leaf': 1, 'n_estimators': 51}                   
new best:                                                                        
0.7315668399375075                                                               
{'max_features': 5, 'min_samples_leaf': 1, 'n_estimators': 173}                  
new best:                                                                        
0.7315668399375075                                                               
{'max_features': 3, 'min_samples_leaf': 1, 'n_estimators': 188}                  
new best:                                                                        
0.73

new best:                                                                         
0.7325866889142756                                                                
{'max_features': 4, 'min_samples_leaf': 2, 'n_estimators': 158}                   
new best:                                                                         
0.7325866889142756                                                                
{'max_features': 2, 'min_samples_leaf': 2, 'n_estimators': 339}                   
new best:                                                                         
0.7325866889142756                                                                
{'max_features': 1, 'min_samples_leaf': 2, 'n_estimators': 295}                   
new best:                                                                         
0.7325866889142756                                                                
{'max_features': 5, 'min_samples_leaf': 2, 'n_estimators': 302}                   
new 

{'max_features': 1, 'min_samples_leaf': 2, 'n_estimators': 53}                    
new best:                                                                         
0.7325866889142756                                                                
{'max_features': 1, 'min_samples_leaf': 2, 'n_estimators': 30}                    
new best:                                                                         
0.7325866889142756                                                                
{'max_features': 1, 'min_samples_leaf': 2, 'n_estimators': 273}                   
new best:                                                                         
0.7325866889142756                                                                
{'max_features': 1, 'min_samples_leaf': 2, 'n_estimators': 126}                   
new best:                                                                         
0.7325866889142756                                                                
{'ma

0.733838188645259                                                                 
{'max_features': 5, 'min_samples_leaf': 1, 'n_estimators': 466}                   
new best:                                                                         
0.733838188645259                                                                 
{'max_features': 1, 'min_samples_leaf': 2, 'n_estimators': 423}                   
100%|██████████| 100/100 [12:36<00:00,  7.57s/trial, best loss: 0.6370944364567201]
best:
{'max_features': 0, 'min_samples_leaf': 1, 'n_estimators': 3}


In [126]:
rf = RandomForestRegressor(n_estimators=423, 
                           max_features=1,
                           min_samples_leaf=2,
                           random_state=24)

rf.fit(x_train, y_train)
predictions = rf.predict(x_test)
score = np.sqrt(mean_squared_error(y_test, predictions))
print(f'Random Forest Score score: {score}')

Random Forest Score score: 261328.26130235082


In [127]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaler.fit(x_train)

x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

x_train_scaled = pd.DataFrame(x_train_scaled, columns=features)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=features)

In [128]:
lr = LinearRegression()
lr.fit(x_train_scaled, y_train)
predictions = lr.predict(x_test_scaled)
score = np.sqrt(mean_squared_error(y_test, predictions))
print(f'Linear Regression score: {score}')

Linear Regression score: 312406.2253480721


In [134]:
df2 = pd.DataFrame(lr.coef_, columns=features)

AttributeError: 'LinearRegression' object has no attribute 'variables_'