# Preliminary Prediction

Author: Vincent

Last Update: 2024-04-10

## Import Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score

from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz, export_text, plot_tree
from sklearn.svm import SVR

In [3]:
from scipy.stats import zscore
import warnings

In [4]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings (e.g. convergence
# or deprecation notices) — confirm that is intended.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Load Dataset

In [5]:
# Read the transaction table from a CSV file located in the working directory.
price_csv_path = 'price_transform.csv'
df_price = pd.read_csv(price_csv_path)

In [6]:
# Keep only the rows whose `city` value is 0
# (presumably the code for Busan, judging by the variable name — TODO confirm).
is_city_zero = df_price['city'] == 0
df_price_busan = df_price[is_city_zero]

In [7]:
# Preview the first five rows of the filtered frame.
df_price_busan.head(n=5)

Unnamed: 0,key,apartment_id,city,exclusive_use_area,floor,latitude,longitude,address_by_law,total_parking_capacity_in_site,total_household_count_in_sites,...,front_door_structure_stairway,front_door_structure_-,front_door_structure_mixed,heat_type_central,heat_type_individual,heat_type_-,heat_type_district,heat_fuel_gas,heat_fuel_-,heat_fuel_cogeneration
1491,1491,3583,0,0.196117,-0.852127,-1.152681,1.22318,2614010600,-0.546816,-0.547497,...,1,0,0,0,1,0,0,1,0,0
1492,1492,11211,0,1.115385,0.120889,-1.15188,1.22305,2614010600,-0.595315,-0.820525,...,1,0,0,0,1,0,0,1,0,0
1493,1493,11731,0,-1.101359,0.120889,-1.179418,1.231544,2614012400,-0.74216,-0.785953,...,0,0,0,0,1,0,0,0,1,0
1993,1993,9380,0,-0.504584,-0.852127,-1.177814,1.282234,2620012100,-0.857346,-0.919807,...,1,0,0,0,1,0,0,1,0,0
1994,1994,35014,0,-1.054574,-1.269134,-1.176829,1.277556,2620012100,0.097818,2.547994,...,0,0,0,0,1,0,0,1,0,0


## Train-test split

In [8]:
# Separate the predictors from the target. Besides the target itself, the
# columns that look like identifiers/codes (by name) are removed.
# `drop` already returns a new frame, so no explicit copy is needed, and the
# result gets a real name instead of `_` (which IPython uses for the last output).
TARGET_COLUMN = 'transaction_real_price'
features = df_price_busan.drop(columns=[
    TARGET_COLUMN,
    'apartment_id',
    'room_id',
    'key',
    'address_by_law',
])

# A fixed random_state makes the split, and therefore every score computed
# from it, reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    features,
    df_price_busan[TARGET_COLUMN],
    random_state=42,
)

## No Prediction (Mean Baseline)

In [9]:
# Baseline: predict the training-set mean for every test row and score it
# with R^2. The vectorized Series.mean() avoids iterating the Series in Python.
y_mean = y_train.mean()
r2_score(y_test, np.full(len(y_test), y_mean))

-5.325297225855508e-06

## Raw Prediction

In [10]:
def regression_report(y_test, y_pred):
    """Format four regression metrics as a multi-line text report.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.

    Returns
    -------
    str
        One ``label: value`` line per metric (three decimals each), in the
        order mse, mae, r2, explained variance score, ending with a newline.
    """
    # (label, scoring function) pairs, evaluated in the order they are reported.
    metrics = (
        ("mse", mean_squared_error),
        ("mae", mean_absolute_error),
        ("r2", r2_score),
        ("explained variance score", explained_variance_score),
    )
    report_lines = [
        "%s: %.3f" % (label, score(y_test, y_pred))
        for label, score in metrics
    ]
    return "\n".join(report_lines) + "\n"

In [11]:
def predict_with_regressor(regressor, X_train, X_test, y_train, y_test):
    """Fit one regressor, predict on the test features and return its report.

    Parameters
    ----------
    regressor : estimator
        Object exposing ``fit(X, y)`` (whose return value exposes ``predict``).
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test, y_test
        Test features passed to ``predict`` and the targets to score against.

    Returns
    -------
    str
        The text produced by ``regression_report`` for the test predictions.
    """
    fitted = regressor.fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    return regression_report(y_test, predictions)

In [12]:
def predict_with_regressors(regressors, X_train, X_test, y_train, y_test):
    """Fit and report each regressor in turn, printing the results.

    Parameters
    ----------
    regressors : callable or iterable
        Either a one-argument factory returning an iterable of unfitted
        regressors (it is invoked as ``regressors(0)``; the argument is a
        dummy), or directly an iterable of regressors.
    X_train, y_train
        Training features and targets.
    X_test, y_test
        Test features and targets.

    Returns
    -------
    None
        The reports are printed, not returned.
    """
    # Existing callers pass a factory that expects one (ignored) argument, so
    # the dummy 0 is kept for them; plain iterables are used as given.
    candidates = regressors(0) if callable(regressors) else regressors
    for regressor in candidates:
        print("**" + regressor.__class__.__name__ + "**")
        print(predict_with_regressor(regressor, X_train, X_test, y_train, y_test))

In [13]:
def get_regressors(_):
    """Return a fresh list of unfitted regressors for the full feature set.

    The single argument is ignored; it exists because
    ``predict_with_regressors`` invokes the factory with a dummy value.
    """
    return [
        LinearRegression(),
        BayesianRidge(),
        # Seeded so that repeated fits on the same data give the same tree.
        DecisionTreeRegressor(random_state=42),
    ]

In [14]:
# Evaluate every regressor from get_regressors on the full feature set.
predict_with_regressors(
    regressors=get_regressors,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

**LinearRegression**
mse: 5201173462935936.000
mae: 47120539.298
r2: 0.815
explained variance score: 0.815

**BayesianRidge**
mse: 5201155372050459.000
mae: 47119659.779
r2: 0.815
explained variance score: 0.815

**DecisionTreeRegressor**
mse: 476098822401437.750
mae: 10641131.908
r2: 0.983
explained variance score: 0.983



## Dimensionality Reduction via Regression

Since the decision tree gives the best result, use its five most important features to reduce the dimensionality.

In [15]:
# Fit a standalone tree and keep it in `dtr` so its fitted attributes can be
# inspected afterwards. It is seeded so that refitting on the same split yields
# the same tree, and therefore the same feature-importance ranking.
dtr = DecisionTreeRegressor(random_state=42)
print(predict_with_regressor(dtr, X_train, X_test, y_train, y_test))

mse: 474673966903271.312
mae: 10625862.877
r2: 0.983
explained variance score: 0.983



In [16]:
# Pair each feature name seen by the fitted tree with its importance, then
# show the five rows with the highest importance.
feature_ranking = pd.DataFrame({
    'feature': dtr.feature_names_in_,
    'importance': dtr.feature_importances_,
})
feature_ranking.sort_values(by='importance', ascending=False).head(5)

Unnamed: 0,feature,importance
10,supply_area,0.619021
8,tallest_building_in_sites,0.137947
14,transaction_year,0.100199
4,longitude,0.026322
17,building_age,0.020117


## Further Prediction with Reduced Dimension

In [17]:
# Column subset used for the reduced-dimension experiments.
important_features = [
    'supply_area',
    'tallest_building_in_sites',
    'transaction_year',
    'longitude',
    'building_age',
]
# Apply the same column selection to both halves of the split.
X_train_dim, X_test_dim = (
    split[important_features] for split in (X_train, X_test)
)

In [18]:
def get_dim_regressors(_):
    """Return a fresh list of unfitted regressors for the reduced feature sets.

    The single argument is ignored; it exists because
    ``predict_with_regressors`` invokes the factory with a dummy value.
    """
    return [
        LinearRegression(),
        BayesianRidge(),
        # Seeded so that repeated fits on the same data give the same tree.
        DecisionTreeRegressor(random_state=42),
        KNeighborsRegressor(),
    ]

In [19]:
# Evaluate every regressor from get_dim_regressors on the five-feature subset.
predict_with_regressors(
    regressors=get_dim_regressors,
    X_train=X_train_dim,
    X_test=X_test_dim,
    y_train=y_train,
    y_test=y_test,
)

**LinearRegression**
mse: 5974774662004390.000
mae: 49622903.111
r2: 0.787
explained variance score: 0.787

**BayesianRidge**
mse: 5974772877852267.000
mae: 49622760.895
r2: 0.787
explained variance score: 0.787

**DecisionTreeRegressor**
mse: 617449677325881.875
mae: 14176565.927
r2: 0.978
explained variance score: 0.978

**KNeighborsRegressor**
mse: 884287691215155.000
mae: 16455396.169
r2: 0.968
explained variance score: 0.968



## Extreme Feature Reduction

In [20]:
# Narrow the column subset down to two features.
important_features = [
    'supply_area',
    'tallest_building_in_sites',
]
X_train_2d = X_train.loc[:, important_features]
X_test_2d = X_test.loc[:, important_features]

In [21]:
# Evaluate every regressor from get_dim_regressors on the two-feature subset.
predict_with_regressors(
    regressors=get_dim_regressors,
    X_train=X_train_2d,
    X_test=X_test_2d,
    y_train=y_train,
    y_test=y_test,
)

**LinearRegression**
mse: 8832318263101237.000
mae: 65500211.627
r2: 0.685
explained variance score: 0.685

**BayesianRidge**
mse: 8832317543761653.000
mae: 65500195.808
r2: 0.685
explained variance score: 0.685

**DecisionTreeRegressor**
mse: 3180286838701959.000
mae: 42747100.426
r2: 0.887
explained variance score: 0.887

**KNeighborsRegressor**
mse: 3848770870623668.500
mae: 46082471.238
r2: 0.863
explained variance score: 0.863



In [22]:
# Some regressors cannot handle too many samples in reasonable time, so they
# are fitted and evaluated on at most the first 10,000 rows of each split.

def get_limited_regressors(_):
    """Return a fresh list of unfitted regressors that are run on a row subset.

    The single argument is ignored; it exists because
    ``predict_with_regressors`` invokes the factory with a dummy value.
    """
    return [
        SVR(),
        # Seeded so the network's random initialisation is repeatable.
        MLPRegressor(random_state=42),
    ]

# predict_with_regressors prints each report itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None".
predict_with_regressors(get_limited_regressors,
                        X_train_2d[:10_000],
                        X_test_2d[:10_000],
                        y_train[:10_000],
                        y_test[:10_000])

**SVR**
mse: 30910823246599436.000
mae: 113034075.802
r2: -0.070
explained variance score: 0.000

**MLPRegressor**
mse: 85416863523307776.000
mae: 237800782.117
r2: -1.957
explained variance score: 0.001

None
