In [241]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.templates.default = 'simple_white'

In [242]:
df = pd.read_csv('dataset_rumah123-new.csv')
df.head()

Unnamed: 0,km-tidur,km-mandi,garasi,bangunan,tanah,lokasi,harga
0,4,4,1,218.0,92.0,Jakarta,2700000000
1,4,3,0,231.0,92.0,Jakarta,2700000000
2,5,11,7,1100.0,746.0,Jakarta,35000000000
3,4,3,2,350.0,300.0,Jakarta,8500000000
4,3,3,1,450.0,300.0,Jakarta,16500000000


In [243]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   km-tidur  6000 non-null   int64  
 1   km-mandi  6000 non-null   int64  
 2   garasi    6000 non-null   int64  
 3   bangunan  6000 non-null   float64
 4   tanah     6000 non-null   float64
 5   lokasi    6000 non-null   object 
 6   harga     6000 non-null   int64  
dtypes: float64(2), int64(4), object(1)
memory usage: 328.2+ KB


In [244]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
km-tidur,6000.0,3.370667,2.742236,1.0,2.0,3.0,4.0,106.0
km-mandi,6000.0,2.636833,2.402201,1.0,2.0,2.0,3.0,99.0
garasi,6000.0,0.9678333,1.177014,0.0,0.0,1.0,1.0,30.0
bangunan,6000.0,181.4045,233.5665,1.0,69.0,115.0,200.0,6000.0
tanah,6000.0,186.0765,371.3632,1.0,75.0,105.0,180.0,17000.0
harga,6000.0,4876058000.0,10647250000.0,1000000000.0,1720000000.0,2993723000.0,4881466000.0,345000000000.0


In [245]:
df.isnull().sum()

km-tidur    0
km-mandi    0
garasi      0
bangunan    0
tanah       0
lokasi      0
harga       0
dtype: int64

In [246]:
fig = go.Figure()
fig.add_trace(go.Histogram(x = df["km-tidur"], name="KT", texttemplate="%{y}"), ).update_xaxes(categoryorder='total descending')
fig.update_layout(title="Kamar Tidur", title_x=.5, )
fig.show()

In [247]:
cp_df = df.copy()
for x in cp_df['tanah']:
    if x <= 100:
        cp_df.tanah.replace(x, '<=100', inplace=True)
    elif x > 100 and x <=200:
        cp_df.tanah.replace(x, '<=200', inplace=True)
    elif x > 200 and x <= 300:
        cp_df.tanah.replace(x, '<=300', inplace=True)
    else:
        cp_df.tanah.replace(x, '>400', inplace=True)


In [248]:
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'xy'}, {'type':'domain'}]])

fig.add_trace(go.Histogram(x = cp_df["tanah"],name="Luas Tanah"),row=1, col=1)
fig.add_trace(go.Pie(labels = cp_df["tanah"] ,name="Luas Tanah", textinfo='percent+label', textposition='inside', marker=dict(line=dict(color='white', width=1))),row=1, col=2)

fig.update_layout(showlegend=False, title="Luas Tanah (m²)", title_x=.5)

fig.show()

In [249]:
cp_df['TM'] = cp_df['km-tidur'] + cp_df['km-mandi']
fig = make_subplots(rows=2, cols=2, specs=[[{'type':'xy'}, {'type':'xy'}],[{'type':'xy'}, None]], subplot_titles=("Jumlah Kamar Tidur, Kamar mandi", "Jumlah Kamar Tidur", "Jumlah Kamar Mandi"))

fig.add_trace(go.Histogram(x = cp_df["TM"],name="TM"),row=1, col=1)
fig.add_trace(go.Histogram(x = df["km-tidur"] ,name="Kamar Tidur"),row=2, col=1)
fig.add_trace(go.Histogram(x = df["km-mandi"] ,name="Kamar Mandi"),row=1, col=2)

fig.update_layout(showlegend=False, title_x=.5)

fig.show()

In [250]:
fig = px.scatter(df, x='tanah', y='harga', color="lokasi",  trendline='ols')
fig.show()

In [251]:
fig = px.scatter(df, x="bangunan", y="harga", color="lokasi")
fig.show()

In [252]:
df['km-tidur'] = df['km-tidur'].astype(int)

In [253]:
cp_df2 = df.copy()
for x in cp_df2['lokasi']:
    if x == "Jakarta":
        cp_df2.lokasi.replace(x, 1, inplace=True)
    elif x == "Bogor":
        cp_df2.lokasi.replace(x, 2, inplace=True)
    elif x == "Depok":
        cp_df2.lokasi.replace(x, 3, inplace=True)
    elif x == "Tangerang":
        cp_df2.lokasi.replace(x, 4, inplace=True)
    elif x == "Tangerang Selatan":
        cp_df2.lokasi.replace(x, 5, inplace=True)
    elif x == "Bekasi":
        cp_df2.lokasi.replace(x, 6, inplace=True)

In [254]:
df_corr = cp_df2.corr()

fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x = df_corr.columns,
        y = df_corr.index,
        z = np.array(df_corr),
        text=df_corr.values,
        texttemplate='%{text:.2f}',
        colorscale = ["#1f76b5", "white", "#ff7e0f"],
        showscale=False
    )
)
fig.update_layout(height=500, title="Correlation Matrix", title_x=.5)
fig.show()

In [255]:
df = pd.get_dummies(df)

In [256]:
df.drop_duplicates(inplace=True)

In [257]:
from sklearn.preprocessing import StandardScaler
y = df[["harga"]]
X = df.drop("harga", axis = 1)

scaler = StandardScaler()
X = scaler.fit_transform(X)
y = scaler.fit_transform(y)

In [258]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,y, train_size = 0.8, random_state=2)
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(3217, 11) (805, 11)
(3217, 1) (805, 1)


In [259]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=10, random_state=0)
rf.fit(x_train, np.ravel(y_train, order='C'))

In [260]:
rf_pred = rf.predict(x_test)
rf_pred

array([-3.09299311e-01, -2.93425799e-01, -1.39611180e-01, -6.82190060e-02,
       -2.81082031e-01,  4.37296766e-01,  1.90089703e-02,  1.22655504e-01,
       -6.59717360e-02, -2.88146509e-01, -3.16080246e-01, -1.38508558e-01,
       -2.21068755e-01, -3.34932915e-01,  1.18621385e+00, -3.36730964e-01,
       -2.16895321e-01, -2.16895321e-01, -2.28529825e-01, -1.97105078e-01,
        1.62419134e+00, -6.82190060e-02, -2.53811388e-01, -2.33334110e-01,
       -3.17891698e-01, -6.82190060e-02, -2.56882980e-01, -1.51659855e-01,
       -1.79671363e-01, -2.45408583e-01,  2.27404660e-01,  3.18371032e-01,
       -2.99508623e-01,  1.04147194e-01, -2.48298275e-01,  2.56772133e+00,
       -3.60434057e-02,  6.37344026e-01, -2.70744522e-01, -1.27836745e-01,
        1.74602015e-01, -1.98640874e-01, -2.66727825e-01, -2.24727543e-01,
       -6.82190060e-02, -1.35279448e-01,  3.22456942e+00, -9.48444299e-02,
       -3.23139211e-01, -2.45768780e-01, -1.23938186e-01, -9.48444299e-02,
       -2.23596723e-01, -

In [261]:
from sklearn.metrics import r2_score
r2_score(y_test, rf_pred)

0.7186167503220015

In [262]:
n_estimators = [5,20,50,100, 200, 300, 400] # number of trees in the random forest
max_features = ['auto', 'sqrt'] # number of features in consideration at every split
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
min_samples_split = [2, 6, 10, 15, 20] # minimum sample number to split a node
min_samples_leaf = [1, 3, 4,6] # minimum sample number that can be stored in a leaf node
bootstrap = [True, False] # method used to sample data points

random_grid = {'n_estimators': n_estimators,

'max_features': max_features,

'max_depth': max_depth,

'min_samples_split': min_samples_split,

'min_samples_leaf': min_samples_leaf,

'bootstrap': bootstrap}

In [263]:
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,
               n_iter = 100, cv = 10, verbose=2, random_state=35, n_jobs = -1)

In [264]:
rf_random.fit(x_train, np.ravel(y_train, order='C'))

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [265]:
print ('Random grid: ', random_grid, '\n')
# print the best parameters
print ('Best Parameters: ', rf_random.best_params_, ' \n')

Random grid:  {'n_estimators': [5, 20, 50, 100, 200, 300, 400], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120], 'min_samples_split': [2, 6, 10, 15, 20], 'min_samples_leaf': [1, 3, 4, 6], 'bootstrap': [True, False]} 

Best Parameters:  {'n_estimators': 300, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}  



In [272]:
randmf = RandomForestRegressor(n_estimators = 400, min_samples_split = 10, min_samples_leaf= 4, max_features = 'sqrt', max_depth= 40, bootstrap=False) 
randmf.fit( x_train, y_train) 


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [273]:
rf_pred = randmf.predict(x_test)
rf_pred

array([-3.02881368e-01, -2.62659339e-01, -3.60702878e-02, -7.40448397e-02,
       -2.61698829e-01,  4.76342827e-01, -2.57222468e-02,  1.90874258e-01,
       -2.31353187e-01, -2.07229419e-01, -2.89928006e-01, -1.58667492e-01,
       -9.12376346e-02, -3.27453456e-01,  1.26101013e-01, -2.98208368e-01,
       -2.20681008e-01, -2.16412376e-01, -2.41572874e-01, -1.57558068e-01,
        1.23850783e+00, -8.48562563e-02, -2.34374360e-01, -2.08656818e-01,
       -3.12388215e-01,  4.61477749e-01, -2.81516726e-01, -2.52309679e-01,
       -1.96547423e-01, -2.71459339e-01,  2.73020768e-01,  2.03693527e-01,
       -2.81117237e-01,  2.06374343e-01, -2.84130902e-01,  2.96893262e+00,
       -1.26127311e-02,  7.68345881e-01, -2.72577509e-01, -1.07321380e-01,
       -6.85706988e-02, -1.83393813e-01, -2.83663358e-01, -2.31464891e-01,
       -1.27852155e-01, -1.31743250e-01,  2.78634375e+00, -1.06774938e-01,
       -2.97805330e-01, -2.74141227e-01, -1.36948709e-01, -1.05362176e-01,
       -1.68886923e-01, -

In [274]:
from sklearn.metrics import r2_score
r2_score(y_test, rf_pred)

0.7314692693384991