In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [2]:
# Load the bike dataset for a quick look at its summary statistics.
# pandas is already imported as `pd` in the first cell, so it is not
# re-imported here.
# NOTE: `df` is rebound to the apartments data a few cells further down.
df = pd.read_csv('data/bike.csv')

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the frame read from data/bike.csv.
df.describe()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,366.0,2.49658,0.500684,6.519836,0.028728,2.997264,0.683995,1.395349,0.495385,0.474354,0.627894,0.190486,848.176471,3656.172367,4504.348837
std,211.165812,1.110807,0.500342,3.451913,0.167155,2.004787,0.465233,0.544894,0.183051,0.162961,0.142429,0.077498,686.622488,1560.256377,1937.211452
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.05913,0.07907,0.0,0.022392,2.0,20.0,22.0
25%,183.5,2.0,0.0,4.0,0.0,1.0,0.0,1.0,0.337083,0.337842,0.52,0.13495,315.5,2497.0,3152.0
50%,366.0,3.0,1.0,7.0,0.0,3.0,1.0,1.0,0.498333,0.486733,0.626667,0.180975,713.0,3662.0,4548.0
75%,548.5,3.0,1.0,10.0,0.0,5.0,1.0,2.0,0.655417,0.608602,0.730209,0.233214,1096.0,4776.5,5956.0
max,731.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,0.861667,0.840896,0.9725,0.507463,3410.0,6946.0,8714.0


In [4]:
# Read the semicolon-separated apartments listing file, keep only the columns
# used for modelling, and discard every row that has a missing value in any of
# them. This rebinds `df`, replacing the bike data loaded earlier.
df = (
    pd.read_csv(
        'data/apartments_for_rent_classified_100K.csv',
        sep=';',
        low_memory=False,
    )[['bathrooms', 'bedrooms', 'square_feet', 'cityname', 'price']]
    .dropna()
)

In [5]:
# Normalise city names and collapse rarely-seen cities into a single 'other'
# bucket, which keeps the number of distinct city labels manageable.

# Cities with at most this many listings are relabelled as 'other'.
RARE_CITY_MAX_LISTINGS = 10

# Vectorised whitespace strip (replaces a per-row Python lambda).
df['cityname'] = df['cityname'].str.strip()

# Number of listings per city, most frequent first.
location_stats = df.groupby('cityname')['cityname'].count().sort_values(ascending=False)

# NOTE: despite the variable name, the cut-off is inclusive (<= 10).
location_stats_less_than_10 = location_stats[location_stats <= RARE_CITY_MAX_LISTINGS]

# The rare city names are the *index* labels of the filtered counts. The
# original `x in series` test relied on that implicitly; make it explicit and
# vectorised.
is_rare_city = df['cityname'].isin(location_stats_less_than_10.index)
df['cityname'] = df['cityname'].mask(is_rare_city, 'other')

In [6]:
# Display the cleaned apartments frame; pandas truncates the rendering to the
# first and last few rows.
df

Unnamed: 0,bathrooms,bedrooms,square_feet,cityname,price
0,1.0,1.0,542,Redondo Beach,2195.0
1,1.5,3.0,1500,Newport News,1250.0
2,2.0,3.0,1650,Raleigh,1395.0
3,1.0,2.0,820,Vacaville,1600.0
4,1.0,1.0,624,Albuquerque,975.0
...,...,...,...,...,...
99487,1.0,1.0,605,Houston,780.0
99488,2.0,2.0,921,Jacksonville,813.0
99489,1.0,1.0,650,San Diego,1325.0
99490,1.0,1.0,701,Huntersville,931.0


In [7]:
# One-hot encode the city names. Rows with a missing value were already removed
# by the earlier dropna(), so dummy_na=True could only add an all-zero column
# labelled NaN; it is therefore omitted.
cityname = pd.get_dummies(df['cityname'])

In [8]:
# Feature matrix: every column of `df` except the raw city name and the target,
# placed side by side with the one-hot city columns.
numeric_features = df.drop(columns=['cityname', 'price'])
X = pd.concat([numeric_features, cityname], axis='columns')

# Force all column labels to strings.
X.columns = X.columns.astype(str)

# Target: the listing price.
y = df.loc[:, 'price']

In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [10]:
# Fit two baseline regressors on the training split, score them on the held-out
# split, and write per-row predictions plus summary metrics to results/.
# LinearRegression and Path are imported in the notebook's first (imports) cell.

# scikit-learn's tree builder permutes features at every split, so ties between
# equally good splits are broken at random unless random_state is fixed.
model1 = DecisionTreeRegressor(random_state=30)
model1.fit(X_train, y_train)
model2 = LinearRegression()
model2.fit(X_train, y_train)

pred_1 = model1.predict(X_test)
pred_2 = model2.predict(X_test)

# One row per test listing, indexed like y_test: the true price, each model's
# prediction, and the signed error (true minus predicted).
results_df = pd.DataFrame(
    {
        'price': y_test,
        'price_model1': pred_1,
        'price_model2': pred_2,
    },
    index=y_test.index,
)
results_df['error_model1'] = results_df['price'] - results_df['price_model1']
results_df['error_model2'] = results_df['price'] - results_df['price_model2']

# Compute each metric once per model; RMSE is derived from the stored MSE
# instead of recomputing it.
predictions = {'model1': pred_1, 'model2': pred_2}
mae = {name: mean_absolute_error(y_test, pred) for name, pred in predictions.items()}
mse = {name: mean_squared_error(y_test, pred) for name, pred in predictions.items()}

# Key order matches the original output columns: mae_*, mse_*, rmse_*.
metrics = {
    **{f'mae_{name}': value for name, value in mae.items()},
    **{f'mse_{name}': value for name, value in mse.items()},
    **{f'rmse_{name}': value ** 0.5 for name, value in mse.items()},
}
metrics_df = pd.DataFrame(metrics, index=[0])

# to_csv does not create missing directories, so make sure results/ exists.
results_dir = Path('results')
results_dir.mkdir(parents=True, exist_ok=True)
results_df.to_csv(results_dir / 'apartments_results.csv')
metrics_df.to_csv(results_dir / 'apartments_metrics.csv')