In [1]:
import pandas as pd
from ydata_profiling import ProfileReport
import plotly_express as px
import plotly.io as pio

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

  from .autonotebook import tqdm as notebook_tqdm


## I. Exploratory Data Analysis

### Loading the data 

In [2]:
data = pd.read_excel("../data/raw/real_estate_data.xlsx", sheet_name="data")
data.head()

Unnamed: 0,id,sub_ms_class,zoning_ms,frontge_lot,area_lot,streetname,alleyname,shape_lot,contour_land,util,configlot,slopeland,neighborhood,c1,c2,type_building,style_house,ovl_quality,ovl_condition,year_constructed,year_remod,roof_style,roofmatl,ext1,ext2,masvnrtype,masvnrarea,exterqual,extercond,foundation,bsmtqual,bsmtcond,bsmtexposure,bsmtfintype1,bsmtfinsf1,bsmtfintype2,bsmtfinsf2,bsmtunfsf,totalbsmtsf,heating,heatingqc,centralair,electrical,1stflrsf,2ndflrsf,lowqualfinsf,grlivarea,bsmtfullbath,bsmthalfbath,fullbath,halfbath,bedroomabvgr,kitchenabvgr,kitchenqual,totrmsabvgrd,functional,fireplaces,fireplacequ,garagetype,garageyrblt,garagefinish,garagecars,garagearea,garagequal,garagecond,paveddrive,wooddecksf,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,128,45,RM,55.0,4388,Pave,,IR1,Bnk,AllPub,Inside,Gtl,OldTown,Feedr,Norm,1Fam,1.5Unf,5,7,1930,1950,Gable,CompShg,WdShing,Wd Sdng,,0.0,TA,Gd,BrkTil,TA,TA,No,LwQ,116,Unf,0,556,672,GasA,Ex,Y,SBrkr,840,0,0,840,0,0,1,0,3,1,TA,5,Typ,1,TA,,,,0,0,,,N,0,0,0,0,0,0,,,,0,6,2007,WD,Normal,87000
1,456,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,7,6,1973,1973,Hip,CompShg,HdBoard,HdBoard,BrkFace,320.0,TA,TA,CBlock,TA,TA,No,ALQ,916,Unf,0,326,1242,GasA,Fa,Y,SBrkr,1242,0,0,1242,0,0,1,1,3,1,TA,6,Typ,1,TA,Attchd,1973.0,Unf,2,528,TA,TA,Y,0,0,0,0,0,0,,,,0,9,2007,WD,Normal,175500
2,1324,30,RL,50.0,5330,Pave,,Reg,HLS,AllPub,Inside,Gtl,BrkSide,Norm,Norm,1Fam,1Story,4,7,1940,1950,Hip,CompShg,VinylSd,VinylSd,,0.0,Fa,TA,CBlock,TA,TA,No,LwQ,280,Unf,0,140,420,GasA,Gd,Y,SBrkr,708,0,0,708,0,0,1,0,2,1,Fa,5,Typ,0,,,,,0,0,,,Y,164,0,0,0,0,0,,,,0,12,2009,WD,Normal,82500
3,218,70,RM,57.0,9906,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,2Story,4,4,1925,1950,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,No,Unf,0,Unf,0,686,686,GasA,Fa,N,SBrkr,810,518,0,1328,0,0,1,0,3,1,TA,8,Typ,0,,Detchd,1940.0,Unf,1,210,TA,TA,Y,0,172,60,0,0,0,,,,0,9,2006,WD,Family,107000
4,1182,120,RM,64.0,5587,Pave,,IR1,HLS,AllPub,Inside,Mod,Crawfor,Norm,Norm,TwnhsE,1Story,8,5,2008,2008,Hip,CompShg,CemntBd,CmentBd,Stone,186.0,Ex,TA,PConc,Ex,TA,Gd,GLQ,1480,Unf,0,120,1600,GasA,Ex,Y,SBrkr,1652,0,0,1652,1,1,2,0,2,1,Gd,5,Typ,1,Gd,Attchd,2008.0,Fin,2,482,TA,TA,Y,162,53,0,153,0,0,,,,0,11,2008,New,Partial,392500


In [3]:
profile = ProfileReport(data, title="Profiling Report of Real Estate Data")
profile.to_file('report_profile.html')

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'Grvl'')
Summarize dataset: 100%|██████████| 991/991 [01:38<00:00, 10.05it/s, Completed]                                 
Generate report structure: 100%|██████████| 1/1 [00:16<00:00, 16.14s/it]
Render HTML: 100%|██████████| 1/1 [00:16<00:00, 16.66s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  4.85it/s]


In [4]:
print(f"We have {data.shape[0]} rows and {data.shape[1]} columns.")

We have 1460 rows and 81 columns.


### Analysizing missing values

In [5]:
columns_with_missing_values = [col for col in data.columns if data[col].isnull().sum() != 0]
number_of_missing_values = [data[col].isnull().sum() for col in data.columns if data[col].isnull().sum() != 0]

In [6]:
columns_with_missing_values_dict = {}
columns_with_missing_values_dict['column_name'] = columns_with_missing_values
columns_with_missing_values_dict['number_of_missing_values'] = number_of_missing_values

In [7]:
columns_with_missing_values_df = pd.DataFrame.from_dict(columns_with_missing_values_dict)
columns_with_missing_values_df['percentage_of_missing_values'] = columns_with_missing_values_df['number_of_missing_values'].apply(lambda x: x / data.shape[0])

Here's the list of the columns that contains missing values : 

In [8]:
columns_with_missing_values_df

Unnamed: 0,column_name,number_of_missing_values,percentage_of_missing_values
0,frontge_lot,259,0.177397
1,alleyname,1369,0.937671
2,masvnrtype,872,0.59726
3,masvnrarea,8,0.005479
4,bsmtqual,37,0.025342
5,bsmtcond,37,0.025342
6,bsmtexposure,38,0.026027
7,bsmtfintype1,37,0.025342
8,bsmtfintype2,38,0.026027
9,electrical,1,0.000685


We will remove all the id column and all the columns for which we have one unique value : 

In [9]:
data = data.drop(columns="id")

In [10]:
columns_with_one_value = [col for col in data.columns if data[col].nunique() == 1]
columns_with_one_value

[]

In [11]:
data.dtypes

sub_ms_class          int64
zoning_ms            object
frontge_lot         float64
area_lot              int64
streetname           object
alleyname            object
shape_lot            object
contour_land         object
util                 object
configlot            object
slopeland            object
neighborhood         object
c1                   object
c2                   object
type_building        object
style_house          object
ovl_quality           int64
ovl_condition         int64
year_constructed      int64
year_remod            int64
roof_style           object
roofmatl             object
ext1                 object
ext2                 object
masvnrtype           object
masvnrarea          float64
exterqual            object
extercond            object
foundation           object
bsmtqual             object
bsmtcond             object
bsmtexposure         object
bsmtfintype1         object
bsmtfinsf1            int64
bsmtfintype2         object
bsmtfinsf2          

In [12]:
data.shape

(1460, 80)

### Analyzing Data types

In [13]:
numeric_columns = data.select_dtypes(include=['number']).columns
categorical_columns = data.select_dtypes(include=['object', 'bool', 'category']).columns
print(f'There are {len(numeric_columns)} numeric columns and they are {numeric_columns}.') 
print(f'There are {len(categorical_columns)} categorical columns and they are {categorical_columns}.')

There are 37 numeric columns and they are Index(['sub_ms_class', 'frontge_lot', 'area_lot', 'ovl_quality',
       'ovl_condition', 'year_constructed', 'year_remod', 'masvnrarea',
       'bsmtfinsf1', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', '1stflrsf',
       '2ndflrsf', 'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath',
       'fullbath', 'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'totrmsabvgrd',
       'fireplaces', 'garageyrblt', 'garagecars', 'garagearea', 'wooddecksf',
       'openporchsf', 'enclosedporch', '3ssnporch', 'screenporch', 'poolarea',
       'miscval', 'mosold', 'yrsold', 'saleprice'],
      dtype='object').
There are 43 categorical columns and they are Index(['zoning_ms', 'streetname', 'alleyname', 'shape_lot', 'contour_land',
       'util', 'configlot', 'slopeland', 'neighborhood', 'c1', 'c2',
       'type_building', 'style_house', 'roof_style', 'roofmatl', 'ext1',
       'ext2', 'masvnrtype', 'exterqual', 'extercond', 'foundation',
       'bsmtqual', 'bs

Of course, there are numeric columns that can be considered as categorical.

In [14]:
for col in data.columns : 
    fig = px.histogram(data, x=col)
    fig.write_image(f'../reports/figures/{col}_histrogram.png')

### Handling Missing Values

In [15]:
columns_with_missing_values_df

Unnamed: 0,column_name,number_of_missing_values,percentage_of_missing_values
0,frontge_lot,259,0.177397
1,alleyname,1369,0.937671
2,masvnrtype,872,0.59726
3,masvnrarea,8,0.005479
4,bsmtqual,37,0.025342
5,bsmtcond,37,0.025342
6,bsmtexposure,38,0.026027
7,bsmtfintype1,37,0.025342
8,bsmtfintype2,38,0.026027
9,electrical,1,0.000685


In [16]:
columns_with_missing_values_df.column_name.to_list()

['frontge_lot',
 'alleyname',
 'masvnrtype',
 'masvnrarea',
 'bsmtqual',
 'bsmtcond',
 'bsmtexposure',
 'bsmtfintype1',
 'bsmtfintype2',
 'electrical',
 'fireplacequ',
 'garagetype',
 'garageyrblt',
 'garagefinish',
 'garagequal',
 'garagecond',
 'poolqc',
 'fence',
 'miscfeature']

In [17]:
data_no_basement = data[data['totalbsmtsf'] == 0] 
basement_columns = ['bsmtqual','bsmtcond','bsmtexposure','bsmtfintype1','bsmtfintype2']
basement_columns_with_missing_values = [col for col in basement_columns if data_no_basement[col].isnull().sum() != 0]
basement_percentage_of_missing_values = [data_no_basement[col].isnull().sum()/data_no_basement.shape[0] for col in basement_columns if data_no_basement[col].isnull().sum() != 0]

In [18]:
basement_columns_with_missing_values, basement_percentage_of_missing_values

(['bsmtqual', 'bsmtcond', 'bsmtexposure', 'bsmtfintype1', 'bsmtfintype2'],
 [1.0, 1.0, 1.0, 1.0, 1.0])

The method to use for imputing missing values depends on the data we have : 
 - When the **frontage_lot** column is empty, it means the frontage is equal to 0, we will fill the missing values with 0. 
 - For the **alleyname** column, the percentage of missing values is higher than 90%, the column will be dropped. 
 - The masonry veneer is an external layer and is not necessarily found in all homes. That means that for the **masvnrtype** column, we will create another category that indicates that there is no masonry veneer for rows where the value is missing. 
 - The **masvnrarea** column is correlated to the last one. When the value is missing, we will fill it with 0.
 - When there is no basement, in other words, when the **totalbsmts** is equal to 0, all the related basement columns should be replaced to indicate that there is no basement.  
 - For the **electrical** column, only one value is missing. It will be replaced with the most frequent value.
 - When there is no fireplace, the **fireplacequ** is missing. We should replace it with another category. 
 - When there is no garage, all the related garage columns are empty and should be replaced to indicate that there is no garage. 
 - When there is no pool, the **poolqc** should indicate that.
 - When there is no fence, the **fence** column will be replaced by another category that says so. 
 - When there is no miscfeature, the **miscfeature** will be replaced by another category that says so. 

In [19]:
data = data.drop(columns='alleyname')
data['frontge_lot'] = data['frontge_lot'].fillna(0)
data['masvnrtype'] = data['masvnrtype'].fillna('No Masonry Veneer')
data['masvnrarea'] = data['masvnrarea'].fillna(0)
data['electrical'] = data['electrical'].fillna(data['electrical'].mode().iloc[0])

In [20]:
data['fireplacequ'] = data['fireplacequ'].fillna('No Fireplace')
data['poolqc'] = data['poolqc'].fillna('No Pool')
data['fence'] = data['fence'].fillna('No Fence')
data['miscfeature'] = data['miscfeature'].fillna('No Additional Feature')

In [21]:
data['bsmtqual'] = data['bsmtqual'].fillna('No Basement')
data['bsmtcond'] = data['bsmtcond'].fillna('No Basement')
data['bsmtexposure'] = data['bsmtexposure'].fillna('No Basement')
data['bsmtfintype1'] = data['bsmtfintype1'].fillna('No Basement')
data['bsmtfintype2'] = data['bsmtfintype2'].fillna('No Basement')

In [22]:
data['garagetype'] = data['garagetype'].fillna('No Garage')
data['garagefinish'] = data['garagefinish'].fillna('No Garage')
data['garagequal'] = data['garagequal'].fillna('No Garage')
data['garagecond'] = data['garagecond'].fillna('No Garage')

In [23]:
data['garageyrblt'] = data['garageyrblt'].fillna(data['yrsold'])
data['garage_age'] = data['yrsold'] - data['garageyrblt']

In [24]:
data['age'] = data['yrsold'] - data['year_remod']
data['orig_age'] = data['yrsold'] - data['year_constructed']

In [25]:
data = data.drop(columns=['garageyrblt', 'yrsold', 'year_remod', 'year_constructed'])

We check if there are still missing values : 

In [26]:
columns_with_missing_values = [col for col in data.columns if data[col].isnull().sum() != 0]
columns_with_missing_values

[]

### Handling Outliers

In [27]:
def cap_outliers_with_iqr_method(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)

In [29]:
new_numeric_columns = data.select_dtypes(include=['number']).columns
for column in new_numeric_columns:
    cap_outliers_with_iqr_method(data, column)

In [30]:
data.to_excel('../data/processed/processed_real_estate_data.xlsx')

## II. Modeling

We don't have a lot of data, so we will choose ML methods rather than DL methods. 

### 1. Regression : Predict the Price of the houses

In [31]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import lightgbm as lgb

In [32]:
categorical_columns = data.select_dtypes(include=['object', 'bool', 'category']).columns
for col in categorical_columns :
    data[col] = data[col].astype('category')

In [33]:
y = data['saleprice']
X = data.drop(columns='saleprice')

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [35]:
model = xgb.XGBRegressor(objective='reg:absoluteerror',
                         enable_categorical= True,
                         n_estimators=500,
                         random_state= 0)

In [36]:
model = model.fit(X_train, y_train)

In [37]:
y_pred = model.predict(X_test)
print(f"The root mean squared error is {root_mean_squared_error(y_pred, y_test)} and the mean squared error is {mean_absolute_error(y_pred, y_test)}")

The root mean squared error is 19816.11300049187 and the mean squared error is 14513.672971960616


In [38]:
fig = px.box(data, x='saleprice')
fig.show()

In [39]:
ax = xgb.plot_importance(model, max_num_features=20, importance_type='weight')
ax.figure.savefig("../reports/figures/feature_importances_regression_model.png")

In [40]:
ft_weights = pd.DataFrame(data={'features' : list(model.get_booster().get_score(importance_type='weight').keys()), 
                                'feature_importances' : list(model.get_booster().get_score(importance_type='weight').values())})

In [41]:
ft_weights_sorted = ft_weights.sort_values(by='feature_importances', ascending=False)
columns_to_keep = ft_weights_sorted.features[:20].to_list()

#### Fine-tuning the hyperparameters of the model 

In [42]:
def optimize_hyperparameters(parameters, model, n_folds, X ,y):
    grid = GridSearchCV(estimator=model, 
                        param_grid=parameters,
                        cv=n_folds)
    grid.fit(X, y)
    return grid.best_score_, grid.best_params_

In [43]:
parameters = {'objective':['reg:absoluteerror'],
              'enable_categorical': [True],
              'random_state': [0],
              'learning_rate': [0.01, 0.05, 0.1], 
              'max_depth': [5, 6, 7],
              'subsample': [0.7, 0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [100, 200, 500, 1000]}

model = xgb.XGBRegressor()
n_folds = 5
best_score, best_params = optimize_hyperparameters(parameters=parameters,
                         model=model, 
                         n_folds=n_folds,
                         X = X_train[columns_to_keep],
                         y = y_train)

In [44]:
print(best_score)
print(best_params)

0.8773609464662189
{'colsample_bytree': 0.7, 'enable_categorical': True, 'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 1000, 'objective': 'reg:absoluteerror', 'random_state': 0, 'subsample': 0.7}


In [45]:
model = xgb.XGBRegressor(**best_params)
model = model.fit(X_train[columns_to_keep], y_train)
y_pred = model.predict(X_test[columns_to_keep])
print(f"The root mean squared error is {root_mean_squared_error(y_pred, y_test)}") 
print(f"The mean absolute error is {mean_absolute_error(y_pred, y_test)}")
print(f"The R-squared error is {r2_score(y_pred, y_test)}")

The root mean squared error is 21291.374068554283
The mean absolute error is 14982.76918343322
The R-squared error is 0.8647410901821739


In [46]:
results = pd.DataFrame(data={'y_pred' : y_pred, 'y_true' : y_test})
results['error_percentage'] = (results['y_pred'] - results['y_true'])/results['y_true']
print(f"The mean percentage error is {results.error_percentage.mean()}")
results.head()

The mean percentage error is 0.03516076702769633


Unnamed: 0,y_pred,y_true,error_percentage
1065,137017.1875,166000.0,-0.174595
465,115412.695312,93000.0,0.240997
735,111928.234375,91000.0,0.229981
298,134394.796875,145000.0,-0.073139
246,243373.84375,227000.0,0.072131


In [47]:
px.box(results, x='error_percentage')

### 2. Clustering : Better understand our portfolio 

In [48]:
from kmodes.kprototypes import KPrototypes

In [49]:
categorical_indices = [data.columns.get_loc(col) for col in categorical_columns]

### Finding the optimal number of clusters

In [50]:
distortions = []
data_matrix = data.to_numpy()
K = range(1,10)
for k in K:
    clustering_model = KPrototypes(n_clusters=k, init='Cao', verbose=2, random_state=42)
    clustering_model.fit_predict(data_matrix, categorical=categorical_indices)
    distortions.append(clustering_model.cost_)

Initialization method and algorithm are deterministic. Setting n_init to 1.


Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 0, ncost: 6610323956766.771
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 0, ncost: 6610323956766.771
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 3, iteration: 1/100, moves: 0, ncost: 6610323956766.771
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 4, iteration: 1/100, moves: 0, ncost: 6610323956766.771
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 5, iteration: 1/100, moves: 0, ncost: 6610323956766.771
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 6, iteration: 1/100, moves: 0, ncost: 6610323956766.771
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 7, iteration: 1/100, moves: 0, ncost: 6610323956766.771
Init: initial

In [51]:
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()


FigureCanvasAgg is non-interactive, and thus cannot be shown



In [52]:
clustering_model = KPrototypes(n_clusters=3, init='Cao', verbose=2, random_state=42)
clusters = clustering_model.fit_predict(data_matrix, categorical=categorical_indices)

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 36, ncost: 1011213226783.741
Run: 1, iteration: 2/100, moves: 42, ncost: 1001940942124.6165
Run: 1, iteration: 3/100, moves: 20, ncost: 999825926116.5201
Run: 1, iteration: 4/100, moves: 6, ncost: 999687630593.9243
Run: 1, iteration: 5/100, moves: 0, ncost: 999687630593.9243
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 182, ncost: 1281755712861.7834
Run: 2, iteration: 2/100, moves: 151, ncost: 1126568093111.5522
Run: 2, iteration: 3/100, moves: 127, ncost: 1035406576807.1702
Run: 2, iteration: 4/100, moves: 75, ncost: 1008184835387.3652
Run: 2, iteration: 5/100, moves: 34, ncost: 1002245276958.366
Run: 2, iteration: 6/100, moves: 14, ncost: 1001044375357.2632
Run: 2, iteration: 7/100, moves: 10, ncost: 1000157859664.5625
Run: 2, iter

In [53]:
data['cluster'] = clusters

In [54]:
data.head()

Unnamed: 0,sub_ms_class,zoning_ms,frontge_lot,area_lot,streetname,shape_lot,contour_land,util,configlot,slopeland,neighborhood,c1,c2,type_building,style_house,ovl_quality,ovl_condition,roof_style,roofmatl,ext1,ext2,masvnrtype,masvnrarea,exterqual,extercond,foundation,bsmtqual,bsmtcond,bsmtexposure,bsmtfintype1,bsmtfinsf1,bsmtfintype2,bsmtfinsf2,bsmtunfsf,totalbsmtsf,heating,heatingqc,centralair,electrical,1stflrsf,2ndflrsf,lowqualfinsf,grlivarea,bsmtfullbath,bsmthalfbath,fullbath,halfbath,bedroomabvgr,kitchenabvgr,kitchenqual,totrmsabvgrd,functional,fireplaces,fireplacequ,garagetype,garagefinish,garagecars,garagearea,garagequal,garagecond,paveddrive,wooddecksf,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,poolqc,fence,miscfeature,miscval,mosold,saletype,salecondition,saleprice,garage_age,age,orig_age,cluster
0,45.0,RM,55.0,4388.0,Pave,IR1,Bnk,AllPub,Inside,Gtl,OldTown,Feedr,Norm,1Fam,1.5Unf,5.0,7.0,Gable,CompShg,WdShing,Wd Sdng,No Masonry Veneer,0.0,TA,Gd,BrkTil,TA,TA,No,LwQ,116.0,Unf,0.0,556.0,672.0,GasA,Ex,Y,SBrkr,840.0,0.0,0.0,840.0,0.0,0.0,1,0,3.0,1.0,TA,5.0,Typ,1.0,TA,No Garage,No Garage,0.0,0.0,No Garage,No Garage,N,0.0,0.0,0.0,0.0,0.0,0.0,No Pool,No Fence,No Additional Feature,0.0,6,WD,Normal,87000.0,0.0,57,77.0,2
1,20.0,RL,80.0,9600.0,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,7.0,6.0,Hip,CompShg,HdBoard,HdBoard,BrkFace,320.0,TA,TA,CBlock,TA,TA,No,ALQ,916.0,Unf,0.0,326.0,1242.0,GasA,Fa,Y,SBrkr,1242.0,0.0,0.0,1242.0,0.0,0.0,1,1,3.0,1.0,TA,6.0,Typ,1.0,TA,Attchd,Unf,2.0,528.0,TA,TA,Y,0.0,0.0,0.0,0.0,0.0,0.0,No Pool,No Fence,No Additional Feature,0.0,9,WD,Normal,175500.0,34.0,34,34.0,0
2,30.0,RL,50.0,5330.0,Pave,Reg,HLS,AllPub,Inside,Gtl,BrkSide,Norm,Norm,1Fam,1Story,4.0,7.0,Hip,CompShg,VinylSd,VinylSd,No Masonry Veneer,0.0,Fa,TA,CBlock,TA,TA,No,LwQ,280.0,Unf,0.0,140.0,420.0,GasA,Gd,Y,SBrkr,708.0,0.0,0.0,708.0,0.0,0.0,1,0,2.0,1.0,Fa,5.0,Typ,0.0,No Fireplace,No Garage,No Garage,0.0,0.0,No Garage,No Garage,Y,164.0,0.0,0.0,0.0,0.0,0.0,No Pool,No Fence,No Additional Feature,0.0,12,WD,Normal,82500.0,0.0,59,69.0,2
3,70.0,RM,57.0,9906.0,Pave,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,2Story,4.0,4.0,Gable,CompShg,MetalSd,MetalSd,No Masonry Veneer,0.0,TA,TA,CBlock,TA,TA,No,Unf,0.0,Unf,0.0,686.0,686.0,GasA,Fa,N,SBrkr,810.0,518.0,0.0,1328.0,0.0,0.0,1,0,3.0,1.0,TA,8.0,Typ,0.0,No Fireplace,Detchd,Unf,1.0,210.0,TA,TA,Y,0.0,170.0,0.0,0.0,0.0,0.0,No Pool,No Fence,No Additional Feature,0.0,9,WD,Family,107000.0,66.0,56,81.0,2
4,120.0,RM,64.0,5587.0,Pave,IR1,HLS,AllPub,Inside,Mod,Crawfor,Norm,Norm,TwnhsE,1Story,8.0,5.0,Hip,CompShg,CemntBd,CmentBd,Stone,186.0,Ex,TA,PConc,Ex,TA,Gd,GLQ,1480.0,Unf,0.0,120.0,1600.0,GasA,Ex,Y,SBrkr,1652.0,0.0,0.0,1652.0,1.0,0.0,2,0,2.0,1.0,Gd,5.0,Typ,1.0,Gd,Attchd,Fin,2.0,482.0,TA,TA,Y,162.0,53.0,0.0,0.0,0.0,0.0,No Pool,No Fence,No Additional Feature,0.0,11,New,Partial,340037.5,0.0,0,0.0,1


In [55]:
fig = px.box(data, x='saleprice', color='cluster')
fig.show()

In [56]:
fig = px.scatter_3d(data, x='saleprice', y='neighborhood', z='area_lot', color='cluster')
fig.show()

We retrain our clustering model without the sale price : 

In [57]:
clustering_model = KPrototypes(n_clusters=3, init='Cao', verbose=2, random_state=42)
clusters = clustering_model.fit_predict(data.drop(columns='saleprice').to_numpy(), categorical=categorical_indices)

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters


Starting iterations...
Run: 1, iteration: 1/100, moves: 500, ncost: 5406728786.133015
Run: 1, iteration: 2/100, moves: 150, ncost: 5176394174.754369
Run: 1, iteration: 3/100, moves: 42, ncost: 5137781737.849581
Run: 1, iteration: 4/100, moves: 22, ncost: 5127076969.024681
Run: 1, iteration: 5/100, moves: 15, ncost: 5122308687.580098
Run: 1, iteration: 6/100, moves: 12, ncost: 5119887870.817211
Run: 1, iteration: 7/100, moves: 3, ncost: 5119707159.056367
Run: 1, iteration: 8/100, moves: 1, ncost: 5119649226.068625
Run: 1, iteration: 9/100, moves: 2, ncost: 5119594750.666111
Run: 1, iteration: 10/100, moves: 0, ncost: 5119594750.666111
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 185, ncost: 5158840290.401474
Run: 2, iteration: 2/100, moves: 34, ncost: 5131808510.9525175
Run: 2, iteration: 3/100, moves: 13, ncost: 5126312800.80481
Run: 2, iteration: 4/100, moves: 14, ncost: 5121826042.134165
Run: 2, iteration: 5/100, mov

In [58]:
data['new_cluster'] = clusters

In [59]:
fig = px.scatter_3d(data, x='neighborhood', y='area_lot', z='frontge_lot', color='new_cluster')
fig.show()

In [60]:
import pickle

with open('../models/clustering_model.pkl', 'wb') as file:
    pickle.dump(clustering_model, file)