In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics


In [2]:
df = pd.read_csv('/Users/dhruv/Downloads/californiahousingdataset/housing.csv', lineterminator='\n')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value\r
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [3]:
df = df.fillna(0)

df = df.rename(columns={'median_house_value\r': 'median_house_value'})

In [4]:
for col in df.columns:
    print(col)

longitude
latitude
housing_median_age
total_rooms
total_bedrooms
population
households
median_income
ocean_proximity
median_house_value


In [5]:
X = df.copy()
X = X.drop('median_house_value', axis=1)

y = df['median_house_value']

preXTrain, preXTest, yTrain, yTest = train_test_split(X, y, test_size=0.2, random_state=1)

preXTrain = preXTrain.reset_index(drop=True)
yTrain = yTrain.reset_index(drop=True)
preXTest = preXTest.reset_index(drop=True)
yTest = yTest.reset_index(drop=True)

In [6]:
catCols = [col for col in X.columns if X[col].dtype == 'object' or X[col].dtype == 'datetime64[ns]']

labelXTrain = preXTrain.copy()
labelXTest = preXTest.copy()

le = LabelEncoder()

for col in catCols:
    le.fit(labelXTrain[col])
    
    labelXTest[col] = labelXTest[col].map(lambda s: '<unknown>' if s not in le.classes_ else s)
    
    le.classes_ = np.append(le.classes_, '<unknown>')
    
    labelXTrain[col] = le.transform(labelXTrain[col])
    labelXTest[col] = le.transform(labelXTest[col])
    
XTrain = labelXTrain.copy()
XTest = labelXTest.copy()

print(XTrain)

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.43     37.71                  52         1410           286.0   
1        -122.35     37.95                  42         1485           290.0   
2        -121.24     37.90                  16           50            10.0   
3        -118.35     34.02                  34         5218          1576.0   
4        -118.39     33.89                  38         1851           332.0   
...          ...       ...                 ...          ...             ...   
16507    -117.88     33.76                  17         1768           474.0   
16508    -119.63     34.42                  42         1765           263.0   
16509    -118.26     33.93                  42         1433           295.0   
16510    -117.16     33.73                  10         2381           454.0   
16511    -122.20     37.79                  35         1802           459.0   

       population  households  median_income  ocean

In [7]:
model = RandomForestRegressor(n_estimators=40, random_state=0)
model.fit(XTrain, yTrain)

preds = model.predict(XTest)


In [8]:
seriesPreds = pd.Series(preds)

comparison = pd.concat([yTest, seriesPreds], axis=1)
comparison.columns = ['yTest', 'preds']
print(comparison)


       yTest       preds
0     355000  323667.525
1      70700   64500.000
2     229400  247897.500
3     112500  127665.025
4     225400  259735.050
...      ...         ...
4123   68200   72985.000
4124  225000  317942.525
4125  350000  375387.600
4126  227300  242980.000
4127  141700  107650.000

[4128 rows x 2 columns]


In [9]:
print('MAE:', metrics.mean_absolute_error(yTest, preds))
print('RMSE:', metrics.mean_squared_error(yTest, preds, squared=False))


MAE: 32157.727713178294
RMSE: 49501.98657824563
