In [157]:
import pandas as pd
import numpy as np

# --
# Read Data
# --

In [158]:
data = pd.read_csv("datasets/weatherAUS.csv")

# --
# Data Preprocessing
# --

In [159]:
# data = np.concatenate((X, np.reshape(y, (-1, 1))), axis=1)
t = (data.dtypes == "float64")
num_cols = list(t[t].index)

# Ada Berapa Data Null di Column Numeric?
totalNull = 0
for i in num_cols:
    # print(i, data[i].isnull().sum())
    break
    
# Fill NA
for i in num_cols:
    data[i].fillna(data[i].median(), inplace=True)

In [160]:
# Mengubah agar format tanggal bisa diproses dengan memisahkan tahun, bulan, dan tanggalnya
data['Date']= pd.to_datetime(data["Date"])
data['year'] = data.Date.dt.year
data['month'] = data.Date.dt.month
data['day'] = data.Date.dt.day

In [161]:
from sklearn.preprocessing import LabelEncoder

# Mendapatkan kolom yang bertipe categorical
categorical_columns = (data.dtypes == "object")
object_columns = list(categorical_columns[categorical_columns].index)

# Melengkapi data dari kolom categorical yang memiliki nilai null
for i in object_columns:
    data[i].fillna(data[i].mode()[0], inplace=True)

# Label Encoder untuk mengubah kolom categorical menjadi numerik
label_encoder = LabelEncoder()
for i in object_columns:
    data[i] = label_encoder.fit_transform(data[i])

In [162]:
# Mengubah format data frame menjadi numpy
dataNP = data.to_numpy()

# Mengambil index kolom yang akan digunakan sebagai data X
idx = []
for i in range(26):
    # Index ke 0 yaitu date tidak digunakan karena sudah dipecah menjadi kolom day, month, year. Index 22 tidak digunakan karena merupakan target
    if i == 0 or i == 22:
        continue
    idx.append(i)         

# Inisialisasi X dan y
X = dataNP[:, idx]
y = dataNP[:, 22]

In [163]:
from sklearn.model_selection import train_test_split

# Split untuk data training dan testing
xtrain, xtest, ytrain, ytest = (train_test_split(X, y,test_size = 0.4,random_state=1))

print(xtrain.shape)
print(xtest.shape)
print(ytrain.shape)
print(ytest.shape)

(87276, 24)
(58184, 24)
(87276,)
(58184,)


In [164]:
# IQR Detecting Outlier
data = np.concatenate((xtrain, np.reshape(ytrain, (-1, 1))), axis=1)

for out in range(24):
  colData = data[:,out]
  q1,q3 = np.percentile(colData,[25,75])
  iqr = q3-q1
  lb,ub = q1-1.5*iqr, q3+1.5*iqr

  data = data[np.where((colData >= lb) & (colData <= ub))]

xtrain = data[:,0:24]
ytrain = data[:,24]

print(xtrain.shape)
print(ytrain.shape)

(17343, 24)
(17343,)


In [165]:
# Gabungkan Xtrain dan ytrain untuk detecting outlier
data = np.concatenate((xtrain, np.reshape(ytrain, (-1, 1))), axis=1)

for idx in range(0, 24):
  col_data = data[:, idx]
  
  mean = col_data.mean()
  std = col_data.std()
  lb, ub = mean-(3*std), mean+(3*std)

  data = data[np.where((col_data >= lb) & (col_data <= ub))]

# Pisahkan Xtrain dan ytrain yang sebelumnya digabung
xtrain = data[:, 0:24]
ytrain = data[:, 24]

print(xtrain.shape)
print(ytrain.shape)

(16417, 24)
(16417,)


In [166]:
from sklearn.preprocessing import StandardScaler

# Proses Standardization
scaler = StandardScaler()
scaler.fit(xtrain)
xtrain = scaler.transform(xtrain)

In [167]:
from sklearn.preprocessing import MinMaxScaler

# Proses MinMax Scaler
scaler = MinMaxScaler(feature_range=(-1,1))
scaler.fit(xtrain)
xtrain = scaler.transform(xtrain)

Classification

In [168]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

xtrain = xtrain.astype('float')
ytrain = ytrain.astype('float')
xtest = xtest.astype('float')
ytest = ytest.astype('float')

for i in range(2, 11):
    ann = MLPClassifier(hidden_layer_sizes=(i,), max_iter=10000, random_state=1)
    ann.fit(xtrain, ytrain)
    ypred = ann.predict(xtest)
    acc_score = 100 * accuracy_score(ytest, ypred)
    print("Neuron in hidden layer: %g, Accuracy: %.2f %%"%(i,acc_score))
    print("")

Neuron in hidden layer: 2, Accuracy: 78.35 %
Metrics Accuracy Score:  78.35487419221779

Neuron in hidden layer: 3, Accuracy: 78.35 %
Metrics Accuracy Score:  78.35487419221779

Neuron in hidden layer: 4, Accuracy: 78.35 %
Metrics Accuracy Score:  78.35487419221779

Neuron in hidden layer: 5, Accuracy: 31.85 %
Metrics Accuracy Score:  31.85068059947752

Neuron in hidden layer: 6, Accuracy: 78.35 %
Metrics Accuracy Score:  78.35487419221779

Neuron in hidden layer: 7, Accuracy: 78.35 %
Metrics Accuracy Score:  78.35487419221779

Neuron in hidden layer: 8, Accuracy: 65.96 %
Metrics Accuracy Score:  65.96315138182318

Neuron in hidden layer: 9, Accuracy: 25.34 %
Metrics Accuracy Score:  25.33686236766121

Neuron in hidden layer: 10, Accuracy: 21.65 %
Metrics Accuracy Score:  21.645125807782208

