In [5]:
import pandas as pd
import numpy as np
from pandas_datareader import data as pdr
from ta import add_all_ta_features
import yfinance as yf
# Patch pandas_datareader so its Yahoo Finance reader is served by yfinance
# (presumably -- this is the documented purpose of pdr_override; verify it
# still exists in the installed yfinance version).
# NOTE(review): neither `pdr` nor `yf` is referenced again in the cells
# visible here -- confirm this call is still needed.
yf.pdr_override()

# Statistics
from statsmodels.tsa.stattools import adfuller

#Data Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

#Supervised Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold

#Reporting 
import matplotlib.pyplot as plt


In [6]:
# Load the raw Sydney house-sales table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='SydneyHousePrices.csv')
df.head(n=5)

Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType
0,2019-06-19,1,Avalon Beach,2107,1210000,4.0,2,2.0,house
1,2019-06-13,2,Avalon Beach,2107,2250000,4.0,3,4.0,house
2,2019-06-07,3,Whale Beach,2107,2920000,3.0,3,2.0,house
3,2019-05-28,4,Avalon Beach,2107,1530000,3.0,1,2.0,house
4,2019-05-22,5,Whale Beach,2107,8000000,5.0,4,4.0,house


In [7]:
# Collect the distinct suburb names and report how many there are.
suburb_text_unique = df.loc[:, 'suburb'].unique()
print(suburb_text_unique.shape[0])

685


In [8]:
# Collect the distinct property types and report how many there are.
prop_type_unique = df.loc[:, 'propType'].unique()
print(prop_type_unique.shape[0])

8


In [10]:
# Replace each suburb name with an integer code and keep it as a new column.
# NOTE(review): the codes follow the alphabetical order of the names (e.g.
# 'Alfords Point' -> 5, 'Whale Beach' -> 654), so their numeric ordering
# carries no real meaning -- confirm the downstream model can cope with that.
labelencoder = LabelEncoder()
suburb_names = df['suburb']
encoded_suburb = labelencoder.fit_transform(suburb_names)
df['suburb_encoderd'] = encoded_suburb
df.tail()

Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType,suburb_encoderd
199499,2014-06-20,199500,Illawong,2234,1900000,5.0,3,7.0,house,318
199500,2014-05-26,199501,Illawong,2234,980000,4.0,3,2.0,house,318
199501,2014-04-17,199502,Alfords Point,2234,850000,4.0,2,2.0,house,5
199502,2013-09-07,199503,Illawong,2234,640000,3.0,2,2.0,townhouse,318
199503,2011-04-16,199504,Alfords Point,2234,1611000,5.0,4,3.0,house,5


In [11]:
# One Hot Encoding
onehot_decoded = pd.get_dummies(df['propType'], prefix='pt', drop_first=True);
df = df.join(onehot_decoded)
df.tail()

Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType,suburb_encoderd,pt_duplex/semi-detached,pt_house,pt_other,pt_terrace,pt_townhouse,pt_villa,pt_warehouse
199499,2014-06-20,199500,Illawong,2234,1900000,5.0,3,7.0,house,318,0,1,0,0,0,0,0
199500,2014-05-26,199501,Illawong,2234,980000,4.0,3,2.0,house,318,0,1,0,0,0,0,0
199501,2014-04-17,199502,Alfords Point,2234,850000,4.0,2,2.0,house,5,0,1,0,0,0,0,0
199502,2013-09-07,199503,Illawong,2234,640000,3.0,2,2.0,townhouse,318,0,0,0,0,1,0,0
199503,2011-04-16,199504,Alfords Point,2234,1611000,5.0,4,3.0,house,5,0,1,0,0,0,0,0


In [12]:
# Duplicate the sale price under the name TARGET; being the most recently
# added column, it sits last in the frame.
df['TARGET'] = df.loc[:, 'sellPrice']
df

Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType,suburb_encoderd,pt_duplex/semi-detached,pt_house,pt_other,pt_terrace,pt_townhouse,pt_villa,pt_warehouse,TARGET
0,2019-06-19,1,Avalon Beach,2107,1210000,4.0,2,2.0,house,22,0,1,0,0,0,0,0,1210000
1,2019-06-13,2,Avalon Beach,2107,2250000,4.0,3,4.0,house,22,0,1,0,0,0,0,0,2250000
2,2019-06-07,3,Whale Beach,2107,2920000,3.0,3,2.0,house,654,0,1,0,0,0,0,0,2920000
3,2019-05-28,4,Avalon Beach,2107,1530000,3.0,1,2.0,house,22,0,1,0,0,0,0,0,1530000
4,2019-05-22,5,Whale Beach,2107,8000000,5.0,4,4.0,house,654,0,1,0,0,0,0,0,8000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199499,2014-06-20,199500,Illawong,2234,1900000,5.0,3,7.0,house,318,0,1,0,0,0,0,0,1900000
199500,2014-05-26,199501,Illawong,2234,980000,4.0,3,2.0,house,318,0,1,0,0,0,0,0,980000
199501,2014-04-17,199502,Alfords Point,2234,850000,4.0,2,2.0,house,5,0,1,0,0,0,0,0,850000
199502,2013-09-07,199503,Illawong,2234,640000,3.0,2,2.0,townhouse,318,0,0,0,0,1,0,0,640000


In [13]:
# Build the modelling frame: a new DataFrame without the identifier columns,
# the raw text columns (already encoded) and the original price column
# (kept as TARGET). `df` itself is left untouched.
columns_to_remove = ['Date', 'Id', 'suburb', 'propType', 'sellPrice']
df_drop = df.drop(columns=columns_to_remove)
df_drop

Unnamed: 0,postalCode,bed,bath,car,suburb_encoderd,pt_duplex/semi-detached,pt_house,pt_other,pt_terrace,pt_townhouse,pt_villa,pt_warehouse,TARGET
0,2107,4.0,2,2.0,22,0,1,0,0,0,0,0,1210000
1,2107,4.0,3,4.0,22,0,1,0,0,0,0,0,2250000
2,2107,3.0,3,2.0,654,0,1,0,0,0,0,0,2920000
3,2107,3.0,1,2.0,22,0,1,0,0,0,0,0,1530000
4,2107,5.0,4,4.0,654,0,1,0,0,0,0,0,8000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199499,2234,5.0,3,7.0,318,0,1,0,0,0,0,0,1900000
199500,2234,4.0,3,2.0,318,0,1,0,0,0,0,0,980000
199501,2234,4.0,2,2.0,5,0,1,0,0,0,0,0,850000
199502,2234,3.0,2,2.0,318,0,0,0,0,1,0,0,640000


In [15]:
# Does the frame contain any missing value?
is_null = df_drop.isna().to_numpy().any()
# Does the frame contain any positive or negative infinity?
is_inf = df_drop.isin([np.inf, -np.inf]).to_numpy().any()
print(is_null, is_inf)

True False


In [16]:
# Impute every missing value with the mean of its own column, then confirm
# that no nulls remain (the check is the cell's displayed output).
column_means = df_drop.mean()
df_drop = df_drop.fillna(value=column_means)
df_drop.isna().to_numpy().any()

False




## Min Max Scaling



In [17]:
# Rescale every column (features and TARGET alike) to the [0, 1] range.
#
# The scaled values are placed in a brand-new float DataFrame instead of being
# written back with `df_scaling.iloc[:] = ...`: that in-place assignment pushes
# floats into integer columns, which recent pandas versions deprecate and
# intend to turn into an error.
#
# NOTE(review): the scaler is fitted on all rows before the train/test split
# and also covers TARGET, so test-set statistics leak into the scaling and
# predictions need `mms` (all columns) to be mapped back to prices -- confirm
# this is intended.
mms = MinMaxScaler()
df_scaling = pd.DataFrame(
    mms.fit_transform(df_drop),
    columns=df_drop.columns,
    index=df_drop.index,
)
df_scaling.head()

Unnamed: 0,postalCode,bed,bath,car,suburb_encoderd,pt_duplex/semi-detached,pt_house,pt_other,pt_terrace,pt_townhouse,pt_villa,pt_warehouse,TARGET
0,0.037179,0.030612,0.010204,0.025,0.032164,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000563
1,0.037179,0.030612,0.020408,0.075,0.032164,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.001048
2,0.037179,0.020408,0.020408,0.025,0.95614,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.00136
3,0.037179,0.020408,0.0,0.025,0.032164,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000712
4,0.037179,0.040816,0.030612,0.075,0.95614,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.003725




### Train Test Split




In [19]:
# Pick the frame that feeds the train/test split: the Min-Max scaled copy
# when the flag is set, otherwise the unscaled (imputed) one.
is_deep_learning = True
if is_deep_learning:
    df_tts = df_scaling.copy()
else:
    df_tts = df_drop.copy()
df_tts


Unnamed: 0,postalCode,bed,bath,car,suburb_encoderd,pt_duplex/semi-detached,pt_house,pt_other,pt_terrace,pt_townhouse,pt_villa,pt_warehouse,TARGET
0,0.037179,0.030612,0.010204,0.025,0.032164,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000563
1,0.037179,0.030612,0.020408,0.075,0.032164,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.001048
2,0.037179,0.020408,0.020408,0.025,0.956140,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.001360
3,0.037179,0.020408,0.000000,0.025,0.032164,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000712
4,0.037179,0.040816,0.030612,0.075,0.956140,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.003725
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199499,0.081306,0.040816,0.020408,0.150,0.464912,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000885
199500,0.081306,0.030612,0.020408,0.025,0.464912,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000456
199501,0.081306,0.030612,0.010204,0.025,0.007310,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000396
199502,0.081306,0.020408,0.010204,0.025,0.464912,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000298


In [31]:
# Separate the feature matrix from the target vector: every column except the
# last one goes into X, the last column (TARGET) becomes Y.
X = df_tts.iloc[:, :-1].to_numpy()
Y = df_tts.iloc[:, -1].to_numpy()

Y

# Illustration of the split on a toy table:
# |  Feature_1  |  Feature_2  |  Feature_3  |  Label  |
# |-------------|-------------|-------------|---------|
# |     0.1     |     0.2     |     0.3     |    1    |
# |     0.4     |     0.5     |     0.6     |    0    |
# |     0.7     |     0.8     |     0.9     |    1    |
#
# X:
# [[0.1 0.2 0.3]
#  [0.4 0.5 0.6]
#  [0.7 0.8 0.9]]
#
# Y: [1 0 1]

array([0.00056345, 0.00104774, 0.00135973, ..., 0.00039581, 0.00029802,
       0.00075018])

In [33]:
## Train Test Split
# Hold out 10% of the rows for testing. The rows are shuffled first, and the
# fixed random_state makes the split identical on every run as long as the
# same value is used. That consistency and reproducibility makes it easy to
# compare and evaluate the performance of different models.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    shuffle=True,
    random_state=1,
)

print('X_train', X_train.shape)

X_train (179553, 12)
