In [308]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.templates.default = 'simple_white'

In [309]:
df = pd.read_csv('dataset_rumah123-new.csv')
df.head()

Unnamed: 0,km-tidur,km-mandi,garasi,bangunan,tanah,lokasi,harga
0,22,22,1,218,92,Jakarta,2700000000.0
1,20,14,0,231,92,Jakarta,2700000000.0
2,20,14,7,1100,746,Jakarta,35000000000.0
3,20,14,2,350,300,Jakarta,8500000000.0
4,18,6,1,450,300,Jakarta,16500000000.0


In [310]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5989 entries, 0 to 5988
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   km-tidur  5989 non-null   int64  
 1   km-mandi  5989 non-null   int64  
 2   garasi    5989 non-null   int64  
 3   bangunan  5989 non-null   int64  
 4   tanah     5989 non-null   int64  
 5   lokasi    5989 non-null   object 
 6   harga     5989 non-null   float64
dtypes: float64(1), int64(5), object(1)
memory usage: 327.6+ KB


In [311]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
km-tidur,5989.0,3.278511,1.289664,1.0,2.0,3.0,4.0,22.0
km-mandi,5989.0,2.561196,1.320211,1.0,2.0,2.0,3.0,22.0
garasi,5989.0,0.9667724,1.163374,0.0,0.0,1.0,1.0,30.0
bangunan,5989.0,178.6801,214.8798,1.0,68.0,115.0,200.0,5000.0
tanah,5989.0,185.4084,371.1739,1.0,75.0,105.0,180.0,17000.0
harga,5989.0,4852087000.0,10623380000.0,1000000000.0,1720000000.0,2993723000.0,4881466000.0,345000000000.0


In [312]:
df.isnull().sum()

km-tidur    0
km-mandi    0
garasi      0
bangunan    0
tanah       0
lokasi      0
harga       0
dtype: int64

In [313]:
df.drop_duplicates(inplace=True)

In [314]:
cp_df = df.copy()
for x in cp_df['tanah']:
    if x <= 100:
        cp_df.tanah.replace(x, '<=100', inplace=True)
    elif x > 100 and x <=200:
        cp_df.tanah.replace(x, '<=200', inplace=True)
    elif x > 200 and x <= 300:
        cp_df.tanah.replace(x, '<=300', inplace=True)
    else:
        cp_df.tanah.replace(x, '>400', inplace=True)


In [315]:
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'xy'}, {'type':'domain'}]])

fig.add_trace(go.Histogram(x = cp_df["tanah"],name="Luas Tanah"),row=1, col=1)
fig.add_trace(go.Pie(labels = cp_df["tanah"] ,name="Luas Tanah", textinfo='percent+label', textposition='inside', marker=dict(line=dict(color='white', width=1))),row=1, col=2)

fig.update_layout(showlegend=False, title="Luas Tanah (m²)", title_x=.5)

fig.show()

In [316]:
cp_df['TM'] = cp_df['km-tidur'] + cp_df['km-mandi']
fig = make_subplots(rows=2, cols=2, specs=[[{'type':'xy'}, {'type':'xy'}],[{'type':'xy'}, None]], subplot_titles=("Jumlah Kamar Tidur, Kamar mandi", "Jumlah Kamar Tidur", "Jumlah Kamar Mandi"))

fig.add_trace(go.Histogram(x = cp_df["TM"],name="TM"),row=1, col=1)
fig.add_trace(go.Histogram(x = df["km-tidur"] ,name="Kamar Tidur"),row=2, col=1)
fig.add_trace(go.Histogram(x = df["km-mandi"] ,name="Kamar Mandi"),row=1, col=2)

fig.update_layout(showlegend=False, title_x=.5)

fig.show()

In [317]:
fig = px.scatter(df, x='tanah', y='harga', color="lokasi",  trendline='ols')
fig.show()

In [318]:
fig = px.scatter(df, x="bangunan", y="harga", color="lokasi")
fig.show()

In [319]:
df['km-tidur'] = df['km-tidur'].astype(int)

In [320]:
cp_df2 = df.copy()
for x in cp_df2['lokasi']:
    if x == "Jakarta":
        cp_df2.lokasi.replace(x, 1, inplace=True)
    elif x == "Bogor":
        cp_df2.lokasi.replace(x, 2, inplace=True)
    elif x == "Depok":
        cp_df2.lokasi.replace(x, 3, inplace=True)
    elif x == "Tangerang":
        cp_df2.lokasi.replace(x, 4, inplace=True)
    elif x == "Tangerang Selatan":
        cp_df2.lokasi.replace(x, 5, inplace=True)
    elif x == "Bekasi":
        cp_df2.lokasi.replace(x, 6, inplace=True)

In [321]:
df_corr = cp_df2.corr()

fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x = df_corr.columns,
        y = df_corr.index,
        z = np.array(df_corr),
        text=df_corr.values,
        texttemplate='%{text:.2f}',
        colorscale = ["#1f76b5", "white", "#ff7e0f"],
        showscale=False
    )
)
fig.update_layout(height=500, title="Correlation Matrix", title_x=.5)
fig.show()

In [322]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['lokasi']= le.fit_transform(df['lokasi'])
df['lokasi'].unique()

array([3, 1, 2, 4, 5, 0])

In [323]:
from sklearn.preprocessing import StandardScaler
y = df[["harga"]]
X = df.drop("harga", axis = 1)

scaler = StandardScaler()
X = scaler.fit_transform(X)
y = scaler.fit_transform(y)

In [324]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,y, train_size = 0.8, random_state=2)
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(3672, 6) (918, 6)
(3672, 1) (918, 1)


In [335]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(x_train, np.ravel(y_train, order='C'))

In [336]:
rf_pred = rf.predict(x_test)
rf_pred

array([ 3.85283966e-01, -7.71631203e-02,  2.79569399e+00, -3.69698746e-02,
        2.28787631e-01, -3.03540466e-01, -1.80739375e-01, -3.21456481e-01,
       -3.02325098e-01,  5.33094087e-01, -2.99360431e-01,  4.34739827e-02,
       -2.00917539e-01, -5.59540773e-02, -2.47696781e-01, -1.91953474e-01,
        1.19430089e+00,  3.57332838e-01,  3.99592905e-03,  2.10408904e-01,
       -1.43186139e-01, -2.67545800e-01, -3.02350436e-01,  8.47318782e-01,
       -2.46165003e-01,  6.31130857e-03, -2.96327654e-01, -2.88840457e-01,
       -2.93766381e-01, -1.64024147e-01, -2.27893955e-01,  1.76418408e+00,
       -2.79501083e-01,  1.54004204e-01,  8.11053227e-02, -7.46353588e-02,
       -3.97297633e-02, -1.18852445e-01, -2.98817784e-01, -4.65343139e-02,
       -3.01711636e-01, -1.43780254e-01,  1.67842967e+00, -2.44659109e-01,
       -8.73199739e-02,  5.38682250e-03, -2.28960741e-01, -7.49265789e-02,
       -2.07018073e-01, -2.66052434e-01, -3.06712380e-01, -1.87406002e-01,
        2.07188268e-01, -

In [337]:
from sklearn.metrics import r2_score
r2_score(y_test, rf_pred)

0.1290105114199812

In [338]:
n_estimators = [5,20,50,100, 200, 300, 400] # number of trees in the random forest
max_features = ['auto', 'sqrt'] # number of features in consideration at every split
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
min_samples_split = [2, 6, 10, 15, 20] # minimum sample number to split a node
min_samples_leaf = [1, 3, 4,6] # minimum sample number that can be stored in a leaf node
bootstrap = [True, False] # method used to sample data points

random_grid = {'n_estimators': n_estimators,

'max_features': max_features,

'max_depth': max_depth,

'min_samples_split': min_samples_split,

'min_samples_leaf': min_samples_leaf,

'bootstrap': bootstrap}

In [339]:
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,
               n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1)

In [340]:
rf_random.fit(x_train, np.ravel(y_train, order='C'))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [341]:
print ('Random grid: ', random_grid, '\n')
# print the best parameters
print ('Best Parameters: ', rf_random.best_params_, ' \n')

Random grid:  {'n_estimators': [5, 20, 50, 100, 200, 300, 400], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120], 'min_samples_split': [2, 6, 10, 15, 20], 'min_samples_leaf': [1, 3, 4, 6], 'bootstrap': [True, False]} 

Best Parameters:  {'n_estimators': 300, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': False}  



In [342]:
randmf = RandomForestRegressor(n_estimators = 400, min_samples_split = 10, min_samples_leaf= 4, max_features = 'sqrt', max_depth= 40, bootstrap=False) 
randmf.fit( x_train, y_train) 


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [343]:
rf_pred = randmf.predict(x_test)
rf_pred

array([ 3.60381183e-01, -9.30859822e-02,  1.64856975e+00,  8.68202212e-02,
        2.67028848e-01, -3.07096553e-01, -1.40736486e-01, -3.26521354e-01,
       -2.81380183e-01,  4.90235197e-01, -2.57465386e-01,  6.26199360e-02,
       -2.37132597e-01, -7.45807719e-02, -2.72742304e-01, -1.78575784e-01,
        1.45015838e+00,  3.26388390e-01, -2.53065749e-02,  1.53375826e-01,
       -1.26166650e-01, -2.69347042e-01, -2.85562327e-01,  5.89011154e-01,
       -2.24538975e-01, -3.52274945e-02, -3.01282940e-01, -2.73206610e-01,
       -2.74807047e-01, -2.29552143e-01, -2.46455324e-01,  1.93605100e+00,
       -2.76312899e-01,  2.44084359e-01,  2.54959732e-02, -5.01267682e-02,
       -6.16717497e-02, -1.65428739e-01, -2.74997570e-01, -6.08071817e-02,
       -2.92045295e-01, -1.45483685e-01,  1.50556103e+00, -2.49814355e-01,
       -9.92121968e-02,  4.42990154e-02, -2.49584329e-01, -8.86720287e-02,
       -2.38883171e-01, -2.76865325e-01, -3.18135034e-01, -1.70652705e-01,
        1.72689938e-01, -

In [344]:
from sklearn.metrics import r2_score
r2_score(y_test, rf_pred)

0.7106839735744341