In [247]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.templates.default = 'simple_white'

In [248]:
df = pd.read_csv('data_rumah.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,area,bed,bath,location,price
0,0,69.0,3,2,Jakarta,1600000000
1,1,73.0,2,2,Jakarta,1700000000
2,2,60.0,4,4,Jakarta,1000000000
3,3,60.0,2,2,Jakarta,790000000
4,4,400.0,6,5,Jakarta,12000000000


In [249]:
df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

In [250]:
df.head()

Unnamed: 0,area,bed,bath,location,price
0,69.0,3,2,Jakarta,1600000000
1,73.0,2,2,Jakarta,1700000000
2,60.0,4,4,Jakarta,1000000000
3,60.0,2,2,Jakarta,790000000
4,400.0,6,5,Jakarta,12000000000


In [251]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5992 entries, 0 to 5991
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   area      5992 non-null   float64
 1   bed       5992 non-null   int64  
 2   bath      5992 non-null   int64  
 3   location  5992 non-null   object 
 4   price     5992 non-null   int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 234.2+ KB


In [252]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
area,5992.0,160.843,215.2772,1.0,54.0,89.0,180.0,4600.0
bed,5992.0,3.299232,1.429269,1.0,2.0,3.0,4.0,10.0
bath,5992.0,2.5247,1.505697,1.0,1.0,2.0,3.0,10.0
price,5992.0,16590570000.0,584164100000.0,1000000.0,708312500.0,1258000000.0,3100000000.0,42950000000000.0


In [253]:
df.isnull().sum()

area        0
bed         0
bath        0
location    0
price       0
dtype: int64

In [254]:
df['bed'].unique()

array([ 3,  2,  4,  6,  5,  9,  8,  7, 10,  1], dtype=int64)

In [255]:
fig = go.Figure()
fig.add_trace(go.Histogram(x = df["bed"], name="KT", texttemplate="%{y}")).update_xaxes(categoryorder='total descending')
fig.update_layout(title="Kamar Tidur", title_x=.5)
fig.show()

In [256]:
cp_df = df.copy()

In [257]:
for x in cp_df['area']:
    if x <= 100:
        cp_df.area.replace(x, '<=100', inplace=True)
    elif x > 100 and x <=200:
        cp_df.area.replace(x, '<=200', inplace=True)
    elif x > 200 and x <= 300:
        cp_df.area.replace(x, '<=300', inplace=True)
    else:
        cp_df.area.replace(x, '>400', inplace=True)

In [258]:
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'xy'}, {'type':'domain'}]])

fig.add_trace(go.Histogram(x = cp_df["area"],name="Luas Tanah"),row=1, col=1)
fig.add_trace(go.Pie(labels = cp_df["area"] ,name="Luas Tanah", textinfo='percent+label', textposition='inside', marker=dict(line=dict(color='white', width=1))),row=1, col=2)

fig.update_layout(showlegend=False, title="Luas Tanah (m²)", title_x=.5)

fig.show()

In [259]:
cp_df.head()

Unnamed: 0,area,bed,bath,location,price
0,<=100,3,2,Jakarta,1600000000
1,<=100,2,2,Jakarta,1700000000
2,<=100,4,4,Jakarta,1000000000
3,<=100,2,2,Jakarta,790000000
4,>400,6,5,Jakarta,12000000000


In [260]:
cp_df['BB'] = df['bed'] + df['bath']

In [261]:
fig = make_subplots(rows=2, cols=2, specs=[[{'type':'xy'}, {'type':'xy'}], [{'type':'xy'}, None]], subplot_titles=("Jumlah Kamar Mandi dan Kamar Tidur", "Jumlah Kamar Mandi", "Jumlah Kamar Tidur"))

fig.add_trace(go.Histogram(x = cp_df["BB"],name="TMG"),row=1, col=1)
fig.add_trace(go.Histogram(x = df["bath"] ,name="Bath Room"),row=1, col=2)
fig.add_trace(go.Histogram(x = df["bed"] ,name="Bed Room"),row=2, col=1)

fig.update_layout(showlegend=False, title_x=.5)

fig.show()

In [262]:
# # Convert categorical data location to numerical for visualize 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
location_label = le.fit_transform(cp_df['location'])
location_label

array([2, 2, 2, ..., 1, 1, 1])

In [263]:
cp_df.drop("location", axis=1, inplace=True)

In [264]:
cp_df["location"] = location_label

In [265]:
fig = px.scatter(cp_df, x="area", y="price", color="location")
fig.show()

In [266]:
df['location'] = location_label

In [267]:
df['area'] = df['area'].astype(int)

In [268]:
df_corr = df.corr()

fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x = df_corr.columns,
        y = df_corr.index,
        z = np.array(df_corr),
        text=df_corr.values,
        texttemplate='%{text:.2f}',
        colorscale = ["#1f76b5", "white", "#ff7e0f"],
        showscale=False
    )
)
fig.update_layout(height=500, title="Correlation Matrix", title_x=.5)
fig.show()

In [269]:
from sklearn.preprocessing import StandardScaler
X = df.drop(df[['price']], axis=1)
y = df[['price']]

scaler = StandardScaler()
y = scaler.fit_transform(y)

In [270]:
X.head()

Unnamed: 0,area,bed,bath,location
0,69,3,2,2
1,73,2,2,2
2,60,4,4,2
3,60,2,2,2
4,400,6,5,2


In [271]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,y, train_size = 0.9, random_state=10)
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(5392, 4) (600, 4)
(5392, 1) (600, 1)


In [272]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=10, random_state=0)
rf.fit(x_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [273]:
rf_pred = rf.predict(x_test)
rf_pred

array([-2.53389575e-02, -2.74385075e-02, -2.55996435e-02, -1.94930720e-02,
       -2.67176012e-02, -2.28359524e-02,  1.35719605e-03, -2.59508340e-02,
       -2.68086789e-02, -2.63272028e-02, -2.54284037e-02, -2.58978356e-02,
       -2.33650883e-02, -2.70634361e-02, -1.37763127e-02, -2.04992162e-02,
       -2.73614370e-02, -2.16056186e-02, -2.63272028e-02, -2.62909505e-02,
       -2.69530747e-02, -2.70243692e-02, -1.79906609e-02, -2.70727353e-02,
        1.61687673e-02, -2.18716619e-02,  4.99221477e-02, -2.41417392e-02,
       -2.25878446e-02, -2.72976734e-02, -1.54131777e-02, -2.73412453e-02,
       -2.70634361e-02,  5.58268563e-02, -2.71390533e-02, -2.53712273e-02,
       -2.40886896e-02, -2.74730678e-02, -2.62366670e-02, -1.34795363e-02,
        1.00685488e+00, -1.30805906e-02, -2.25829532e-02, -2.62790756e-02,
       -2.66739643e-02, -2.74385075e-02, -2.77132769e-02, -2.57540291e-02,
       -2.24216388e-02, -2.73614370e-02,  5.08141945e-03, -2.60516635e-02,
       -2.63199839e-02, -

In [274]:
from sklearn.metrics import r2_score
r2_score(y_test, rf_pred)

-5.432162807522967