In [1]:
import pandas as pd
import numpy as np
import h2o
import lime
from lime import lime_tabular

from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.decomposition import PCA  # Principal component
from sklearn.linear_model import LinearRegression
from h2o.automl import H2OAutoML as ml
from h2o.estimators import H2OXGBoostEstimator

from functions.datacleaning import MoscowHousing as mh
from functions.distance import get_distance_coordinates, norm_features, PCA_plot

import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')

In [2]:
# Importing prepared_category_1 datasets
XTrain = pd.read_csv("../prepared_data/train_prepared_category_2.csv")
XTest = pd.read_csv("../prepared_data/test_prepared_category_2.csv")

# Make copies to avoid new imports
X_train = XTrain.copy()
X_test = XTest.copy()
y_train_real = XTrain["price"].copy()          # Prices actual values
y_train = XTrain["price"].copy() 
y_train_zscore = norm_features(y_train) # Prices as z-scores (normalized)

In [35]:
features = ["area_total",
            "rooms",
            "floor",
            "distance",
            "distance_metro",
            "ceiling", 
            "district", 
            "stories",
            "elevator", "elevator_no",
            "material_0.0", "material_1.0", "material_2.0", 
            "material_3.0", "material_4.0", "material_5.0",
            "material_6.0", "material_nan", 
            "heating_0.0", "heating_1.0", "heating_2.0", 
            "heating_3.0", "heating_nan",
            "bathrooms_shared_0.0" ,"bathrooms_shared_1.0",
            "bathrooms_shared_2.0", "bathrooms_shared_3.0",
            "bathrooms_shared_4.0", "bathrooms_shared_nan",
            "bathrooms_private_0.0", "bathrooms_private_1.0",
            "bathrooms_private_2.0", "bathrooms_private_3.0",
            "bathrooms_private_4.0", "bathrooms_private_nan",
            "windows_court_0.0", "windows_court_1.0", 
            "windows_court_nan", "windows_street_0.0",
            "windows_street_1.0", "windows_street_nan",
            "balconies_0.0", "balconies_1.0", "balconies_2.0",
            "balconies_3.0", "balconies_4.0", "balconies_nan",
            "loggias_0.0", "loggias_1.0", "loggias_2.0",
            "loggias_3.0","loggias_4.0","loggias_nan",
            "phones_0.0", "phones_1.0", "phones_2.0", "phones_nan",
            "parking_0.0", "parking_1.0", "parking_2.0", "parking_nan"]

categorical = ["district"]

output = "price"

In [36]:
X = X_train.copy()
y = y_train_zscore.copy()

In [37]:
X_tr, X_te, y_tr, y_te = train_test_split(X[features], y, train_size=0.80, test_size=0.2, random_state=42)

In [38]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18628 entries, 18332 to 15795
Data columns (total 61 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   area_total             18628 non-null  float64
 1   rooms                  18628 non-null  float64
 2   floor                  18628 non-null  float64
 3   distance               18628 non-null  float64
 4   distance_metro         18628 non-null  float64
 5   ceiling                18628 non-null  float64
 6   district               18628 non-null  int64  
 7   stories                18628 non-null  float64
 8   elevator               18628 non-null  int64  
 9   elevator_no            18628 non-null  int64  
 10  material_0.0           18628 non-null  int64  
 11  material_1.0           18628 non-null  int64  
 12  material_2.0           18628 non-null  int64  
 13  material_3.0           18628 non-null  int64  
 14  material_4.0           18628 non-null  int64  
 15

In [42]:
X_tr.head()

Unnamed: 0,area_total,rooms,floor,distance,distance_metro,ceiling,district,stories,elevator,elevator_no,...,loggias_4.0,loggias_nan,phones_0.0,phones_1.0,phones_2.0,phones_nan,parking_0.0,parking_1.0,parking_2.0,parking_nan
18332,0.135483,4.0,6.0,-1.335175,-0.330643,-0.052614,0,12.0,1,0,...,0,1,0,0,1,0,0,1,0,0
18707,-0.674622,1.0,13.0,0.229002,-0.095526,-0.043357,5,14.0,1,0,...,0,0,0,1,0,0,0,1,0,0
21356,-0.390838,2.0,8.0,-0.538059,-0.348848,-0.052614,4,16.0,1,0,...,0,0,0,1,0,0,0,0,0,1
19907,-0.118473,2.0,1.0,5.369689,8.626875,0.0,10,8.0,1,0,...,0,1,0,1,0,0,0,1,0,0
15697,-0.474695,1.0,14.0,-0.408207,-0.357585,0.0,6,22.0,1,0,...,0,1,0,1,0,0,0,1,0,0


In [39]:
lr = LinearRegression()
lr.fit(X_tr, y_tr)

LinearRegression()

In [40]:
print("Test R^2 Score  : ", lr.score(X_te, y_te))
print("Train R^2 Score : ", lr.score(X_tr, y_tr))

Test R^2 Score  :  0.6582851664961271
Train R^2 Score :  0.7321184592260461


In [41]:
explainer = lime_tabular.LimeTabularExplainer(np.array(X_tr,y_tr),
                                              mode="regression",
                                              feature_names=X_tr.columns,
                                              random_state=42)


In [54]:
#exp = explainer.explain_instance(X_tr.values, lr.predict(X_tr.values))

ValueError: could not broadcast input array from shape (18628,61) into shape (18628,)