# Prelimary Prediction

Author: Vincent

Last Update: 2024-05-04

## Import Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, export_graphviz, export_text, plot_tree

from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [3]:
import warnings
from typing import List

In [4]:
warnings.filterwarnings('ignore')
%matplotlib inline

## Functions

In [5]:
def plt_add_labels(x: List[str],
                   y: List[int]) -> None:
    y_scale = y.mean()/100
    for i in range(len(x)):
        plt.text(i, y[i]+y_scale, y[i], ha = 'center')

def print_missing_val_count(df: pd.DataFrame) -> None:
    # Missing value counts
    df_na_cnt = df.isnull().sum()
    df_record_cnt = df.shape[0]
    
    # Print the count of missing value for each feature
    if df_na_cnt.sum() > 0:
        print("The following columns have missing values:")
        for col, na_cnt in zip(df_na_cnt.index, df_na_cnt.values):
            if na_cnt > 0:
                print(f"{col} has {na_cnt} ({100*na_cnt/df_record_cnt:0.1f}%) missing value.")
    else:
        print("The dataframe does not have missing values.")

def knn_imputation(df: pd.DataFrame,
                   feature_columns: List[str],
                   target_column: str) -> None:
    """
    :param df: Dataframe for missing value imputation
    :param feature_columns: list of feature names (string) for knn imputation
    :param target_column: feature name (string) to be imputed
    :return Nothing
    """
    # Fit KNN model with records have non-null value in target column
    df_clean = df[~df[target_column].isna()]
    knn = KNeighborsRegressor(p=1, n_neighbors=3)
    knn.fit(df_clean[feature_columns], df_clean[target_column])
    # Predict the missing values
    df_miss = df[df[target_column].isna()]
    if df_miss.shape[0] > 0:
        y_pred = knn.predict(df_miss[feature_columns])
        # Impute the missing values
        df.loc[df[target_column].isna(), target_column] = y_pred
        print(f"Missing values in {target_column} has been imputed.")
    else:
        print(f"There is no missing values in {target_column}. No action.")

def one_hot_encoding(df: pd.DataFrame,
                     column:str) -> pd.DataFrame:
    distinct_values = set(df[column])
    
    for value in distinct_values:
        new_column = column + "_" + value
        df[new_column] = df[column].apply(lambda x: 1 if x == value else 0)
    
    df = df.drop(column, axis=1)
    
    return df

def regression_report(y_test: List[float],
                      y_pred: List[float],
                      p: int) -> str:
    n = y_test.shape[0]
    
    root_mse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    
    metrics = f"\
Rooted-MSE: {root_mse:.4}\n\
MAE: {mae:.4}\n\
Adjusted R-Squared: {adj_r2:.4f}"
    
    return metrics

def predict_with_regressor(regressor: "sklearn.regressor",
                           X_train: pd.DataFrame,
                           X_test: pd.DataFrame,
                           y_train: List[float],
                           y_test: List[float]) -> None:
    y_pred = regressor.fit(X_train, y_train).predict(X_test)
    
    print("---------- " + regressor.__class__.__name__ + " ----------")
    print(regression_report(y_test, y_pred, X_train.shape[1]))
    print("")

def predict_with_regressors(regressors, X_train, X_test, y_train, y_test) -> None:
    for regressor in regressors(0):
        predict_with_regressor(regressor, X_train, X_test, y_train, y_test)

## Load Dataset

In [6]:
df_price = pd.read_csv('price_std.csv')
df_school = pd.read_csv('school_std.csv')
df_subway = pd.read_csv('subway_std.csv')

## Train-test split

In [8]:
# X and y dataframe (Without id columns and address by law)
X_all = df_price \
    .drop(['apartment_id',
           'room_id',
           'address_by_law',
           'transaction_real_price'],
          axis=1)

y_all = df_price['transaction_real_price']

In [9]:
# Split the Price data set into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=2024)

## Basic Models

In [10]:
# Linear Regression
predict_with_regressor(
    LinearRegression(), 
    X_train, X_test, y_train, y_test)

---------- LinearRegression ----------
Rooted-MSE: 1.884e+08
MAE: 1.184e+08
Adjusted R-Squared: 0.6601



In [11]:
# Ridge Regression
predict_with_regressor(
    Ridge(alpha=100, random_state=2024), 
    X_train, X_test, y_train, y_test)

---------- Ridge ----------
Rooted-MSE: 1.894e+08
MAE: 1.18e+08
Adjusted R-Squared: 0.6567



In [12]:
# Decision Tree
predict_with_regressor(
    DecisionTreeRegressor(random_state=2024), 
    X_train, X_test, y_train, y_test)

---------- DecisionTreeRegressor ----------
Rooted-MSE: 4.458e+07
MAE: 1.917e+07
Adjusted R-Squared: 0.9810



In [13]:
# Neural Network
predict_with_regressor(
    MLPRegressor(hidden_layer_sizes=(20,10,5,20,5), max_iter=10), 
    X_train, X_test, y_train, y_test)

---------- MLPRegressor ----------
Rooted-MSE: 1.811e+08
MAE: 1.118e+08
Adjusted R-Squared: 0.6859



## Ensemble Model

In [14]:
forest = RandomForestRegressor(n_estimators = 11, random_state=2024)
predict_with_regressor(forest, X_train, X_test, y_train, y_test)

---------- RandomForestRegressor ----------
Rooted-MSE: 3.509e+07
MAE: 1.616e+07
Adjusted R-Squared: 0.9882



In [15]:
gb_reg = GradientBoostingRegressor(n_estimators=11, random_state=2024)
predict_with_regressor(gb_reg, X_train, X_test, y_train, y_test)

---------- GradientBoostingRegressor ----------
Rooted-MSE: 2.151e+08
MAE: 1.357e+08
Adjusted R-Squared: 0.5570



In [16]:
base_regressor = DecisionTreeRegressor(random_state=2024)
bagging = BaggingRegressor(base_regressor, n_estimators=11, random_state=2024, bootstrap_features=True)

predict_with_regressor(bagging, X_train, X_test, y_train, y_test)

---------- BaggingRegressor ----------
Rooted-MSE: 3.866e+07
MAE: 1.979e+07
Adjusted R-Squared: 0.9857



In [17]:
base_regressor = DecisionTreeRegressor(random_state=2024)
adaboost_regressor = AdaBoostRegressor(base_regressor, n_estimators=11, random_state=2024)

predict_with_regressor(adaboost_regressor, X_train, X_test, y_train, y_test)

---------- AdaBoostRegressor ----------
Rooted-MSE: 3.625e+07
MAE: 1.573e+07
Adjusted R-Squared: 0.9874



## Drop Highly Correlated Columns

In [18]:
# Correlation matrix
corr_matrix = X_train.corr().abs()

# Upper triangle of correlation matrix
upper_corr_matrix = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Features with correlation > 0.9
col_to_drop = [col for col in upper_corr_matrix.columns if any(upper_corr_matrix[col] > 0.9)]

print("Columns (left) will be dropped because they are highly correlated to another column (right).")
for col in col_to_drop:
    print(col, ":", upper_corr_matrix[upper_corr_matrix[col] > 0.9].index[0])

Columns (left) will be dropped because they are highly correlated to another column (right).
latitude : city
longitude : city
supply_area : exclusive_use_area
front_door_structure_stairway : front_door_structure_corridor
heat_fuel_gas : heat_type_district
heat_fuel_cogeneration : heat_type_district


In [24]:
# Drop the highly correlated columns
X_train.drop(col_to_drop, axis=1, inplace=True)
X_test.drop(col_to_drop, axis=1, inplace=True)
X_all.drop(col_to_drop, axis=1, inplace=True)

## Feature Importance

In [20]:
# Decision Tree
dtr = DecisionTreeRegressor(max_depth=10, random_state=2024)
dtr.fit(X_train, y_train)

In [21]:
# Feature Importance from Tree Model
feat_import_df = pd.DataFrame({'feature': dtr.feature_names_in_,
                               'importance': dtr.feature_importances_}) \
    .sort_values('importance', ascending=False)

feat_import_df.head(10)

Unnamed: 0,feature,importance
1,exclusive_use_area,0.409497
0,city,0.230578
3,total_parking_capacity_in_site,0.068032
11,transaction_year,0.058225
23,heat_type_district,0.050636
6,tallest_building_in_sites,0.049093
5,apartment_building_count_in_sites,0.0302
13,building_age,0.027536
7,lowest_building_in_sites,0.021884
4,total_household_count_in_sites,0.011557


In [22]:
# Top 5 highest feature importance columns
top_5_cols = feat_import_df["feature"].tolist()[:5]
X_train_5cols = X_train[top_5_cols]
X_test_5cols = X_test[top_5_cols]

print(f"The top 5 highest feature importance columns are {str(top_5_cols)[1:-1]}")
print("Their performance are shown as follows:\n")

regressors = lambda model: [
    LinearRegression(),
    Ridge(alpha=100, random_state=2024),
    DecisionTreeRegressor(max_depth=10, random_state=2024),
    MLPRegressor(hidden_layer_sizes=(20,10,5,10,5), max_iter=20),
    KNeighborsRegressor(p=1)
]

predict_with_regressors(regressors, X_train_5cols, X_test_5cols, y_train, y_test)

The top 5 highest feature importance columns are 'exclusive_use_area', 'city', 'total_parking_capacity_in_site', 'transaction_year', 'heat_type_district'
Their performance are shown as follows:

---------- LinearRegression ----------
Rooted-MSE: 2.097e+08
MAE: 1.328e+08
Adjusted R-Squared: 0.5790

---------- Ridge ----------
Rooted-MSE: 2.097e+08
MAE: 1.328e+08
Adjusted R-Squared: 0.5790

---------- DecisionTreeRegressor ----------
Rooted-MSE: 1.412e+08
MAE: 8.555e+07
Adjusted R-Squared: 0.8090

---------- MLPRegressor ----------
Rooted-MSE: 1.99e+08
MAE: 1.251e+08
Adjusted R-Squared: 0.6207

---------- KNeighborsRegressor ----------
Rooted-MSE: 7.818e+07
MAE: 3.441e+07
Adjusted R-Squared: 0.9415



In [23]:
# Top 2 highest feature importance columns
top_2_cols = feat_import_df["feature"].tolist()[:2]
X_train_2cols = X_train[top_2_cols]
X_test_2cols = X_test[top_2_cols]

print(f"The top 2 highest feature importance columns are {str(top_2_cols)[1:-1]}")
print("Their performance are shown as follows:\n")
predict_with_regressors(regressors, X_train_2cols, X_test_2cols, y_train, y_test)

The top 2 highest feature importance columns are 'exclusive_use_area', 'city'
Their performance are shown as follows:

---------- LinearRegression ----------
Rooted-MSE: 2.356e+08
MAE: 1.535e+08
Adjusted R-Squared: 0.4685

---------- Ridge ----------
Rooted-MSE: 2.356e+08
MAE: 1.535e+08
Adjusted R-Squared: 0.4685

---------- DecisionTreeRegressor ----------
Rooted-MSE: 1.887e+08
MAE: 1.181e+08
Adjusted R-Squared: 0.6589

---------- MLPRegressor ----------
Rooted-MSE: 2.219e+08
MAE: 1.41e+08
Adjusted R-Squared: 0.5285

---------- KNeighborsRegressor ----------
Rooted-MSE: 1.577e+08
MAE: 9.784e+07
Adjusted R-Squared: 0.7617



## Dimensionality reduction

**Principle Component Analysis**

In [25]:
pca = PCA(n_components=5)
pca.fit(X_all)

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [26]:
predict_with_regressors(regressors, X_train_pca, X_test_pca, y_train, y_test)

---------- LinearRegression ----------
Rooted-MSE: 3.032e+08
MAE: 1.994e+08
Adjusted R-Squared: 0.1196

---------- Ridge ----------
Rooted-MSE: 3.032e+08
MAE: 1.994e+08
Adjusted R-Squared: 0.1196

---------- DecisionTreeRegressor ----------
Rooted-MSE: 2.38e+08
MAE: 1.599e+08
Adjusted R-Squared: 0.4575

---------- MLPRegressor ----------
Rooted-MSE: 2.826e+08
MAE: 1.854e+08
Adjusted R-Squared: 0.2350

---------- KNeighborsRegressor ----------
Rooted-MSE: 1.688e+08
MAE: 9.002e+07
Adjusted R-Squared: 0.7271



All the regressors has poor performance compare to the before.  

**Partial Least Square Regression**

In [28]:
# Create an instance of PLS with the desired number of components
pls = PLSRegression(n_components=5)

pls.fit(X_all, y_all)

X_train_pls = pca.transform(X_train)
X_test_pls = pca.transform(X_test)

In [29]:
predict_with_regressors(regressors, X_train_pls, X_test_pls, y_train, y_test)

---------- LinearRegression ----------
Rooted-MSE: 3.032e+08
MAE: 1.994e+08
Adjusted R-Squared: 0.1196

---------- Ridge ----------
Rooted-MSE: 3.032e+08
MAE: 1.994e+08
Adjusted R-Squared: 0.1196

---------- DecisionTreeRegressor ----------
Rooted-MSE: 2.38e+08
MAE: 1.599e+08
Adjusted R-Squared: 0.4575

---------- MLPRegressor ----------
Rooted-MSE: 2.787e+08
MAE: 1.843e+08
Adjusted R-Squared: 0.2563

---------- KNeighborsRegressor ----------
Rooted-MSE: 1.688e+08
MAE: 9.002e+07
Adjusted R-Squared: 0.7271



All the regressors has poor performance compare to the before.  