In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import silhouette_score
import math
import random as r
import copy

# DATA EXPLORATION & DATA PREPARATION

In [None]:
# LOAD THE TRAIN & TEST DATA

# Both CSV files are read from the current working directory, train first.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('salju_train.csv', 'salju_test.csv')
)

In [None]:
# SHOW THE FIRST 5 ROWS OF THE TRAIN DATA

df_train.head()

In [None]:
 # SHOW THE NUMBER OF ROWS AND COLUMNS

df_train.shape

In [None]:
# SHOW DATA TYPE INFORMATION FOR EACH COLUMN

df_train.info()

In [None]:
# SHOW SUMMARY STATISTICS (MEAN, MIN & MAX VALUES, ETC.)

df_train.describe()

In [None]:
# SHOW THE NUMBER OF NULL VALUES IN EACH COLUMN

df_train.isnull().sum()

In [None]:
# SHOW THE NUMBER OF ROWS THAT DUPLICATE AN EARLIER ROW

df_train.duplicated().sum()

In [None]:
# FILL IN THE MISSING VALUES

# Numeric columns: fill with the column mean. numeric_only=True is required
# because the frame still contains string columns, and pandas >= 2.0 raises a
# TypeError when asked to average those.
df_train = df_train.fillna(df_train.mean(numeric_only=True))
# Whatever is still missing (the string columns) is filled with the most
# frequent value of its column: mode() returns the modes row-wise, so row 0
# holds the first mode of every column.
df_train = df_train.fillna(df_train.mode().iloc[0])

df_train.head()

In [None]:
# VERIFY THAT NO NULL VALUES REMAIN (every count is expected to be 0 now)

df_train.isnull().sum()

In [None]:
# DROP THE COLUMNS THAT ARE NOT NEEDED

# Positions of the columns to discard; they are resolved to labels and
# removed from df_train in place.
cols = [0, 1, 2]
labels_to_drop = df_train.columns[cols]
df_train.drop(columns=labels_to_drop, inplace=True)

In [None]:
# FIND OUTLIERS

# Positional selection of the 16 columns that are plotted as numeric data.
numeric_positions = [0, 1, 2, 3, 4, 6] + list(range(9, 19))
df_train_num = df_train.iloc[:, numeric_positions]

fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(20, 15))

# One box plot per selected column, one column per subplot.
for ax, column_name in zip(axes.flat, df_train_num.columns):
    sns.boxplot(x=df_train_num[column_name], ax=ax)
plt.show()

In [None]:
# BUILD THE CORRELATION MATRIX

mtrxCor1 = df_train_num.corr()

# Draw the annotated heatmap on an explicitly created 20x15 figure.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(mtrxCor1, annot=True, ax=ax)
plt.show()

In [None]:
# SHOW THE DISTRIBUTION OF EACH COLUMN AS A HISTOGRAM

hist = df_train_num.hist(figsize=(20,15))

In [None]:
# Bar chart of the number of rows per ArahAnginTerkencang category
sns.countplot(x = 'ArahAnginTerkencang', data = df_train)

In [None]:
# Bar chart of the number of rows per ArahAngin9am category
sns.countplot(x = 'ArahAngin9am', data = df_train)

In [None]:
# Bar chart of the number of rows per ArahAngin3pm category
sns.countplot(x = 'ArahAngin3pm', data = df_train)

In [None]:
# Bar chart of the number of rows per BersaljuHariIni category
sns.countplot(x = 'BersaljuHariIni', data = df_train)

In [None]:
# Bar chart of the number of rows per BersaljuBesok category
sns.countplot(x = 'BersaljuBesok', data = df_train)

FEATURE ENGINEERING

In [None]:
# BIN THE "WIND DIRECTION" ATTRIBUTES

# Collapse the intermediate compass points onto N / E / S / W. Values that are
# not listed here are left unchanged by replace().
# ENE and SE map to 'E' like the other easterly points (ESE, NE); giving them
# a separate 'T' label would create a fifth category that the label encoder
# would then treat as its own direction.
WIND_DIRECTION_BINS = {
    'WSW': 'W',
    'ESE': 'E',
    'SSW': 'S',
    'SW': 'W',
    'WNW': 'W',
    'ENE': 'E',
    'SE': 'E',
    'SSE': 'S',
    'NNW': 'N',
    'NW': 'W',
    'NE': 'E',
    'NNE': 'N',
}

# Per-column replacement specs ({column: {old: new}}); all three wind columns
# share the same mapping.
ArahAnginTerkencang = {"ArahAnginTerkencang": WIND_DIRECTION_BINS}
ArahAngin9am = {"ArahAngin9am": WIND_DIRECTION_BINS}
ArahAngin3pm = {"ArahAngin3pm": WIND_DIRECTION_BINS}

# One replace() call handles all three columns. Re-running the cell is a
# no-op because none of the target labels is itself a key of the mapping.
df_train = df_train.replace({**ArahAnginTerkencang, **ArahAngin9am, **ArahAngin3pm})

# Preview only; the full frame is too large to be a useful cell output.
df_train.head()

In [None]:
# ENCODE THE CATEGORICAL VARIABLES

# Every column gets its own freshly fitted LabelEncoder. After the loop,
# `labelencoder` is the encoder that was fitted on the last column.
for column_name in ('ArahAnginTerkencang', 'ArahAngin9am', 'ArahAngin3pm',
                    'BersaljuHariIni', 'BersaljuBesok'):
    labelencoder = LabelEncoder()
    df_train[column_name] = labelencoder.fit_transform(df_train[column_name])

df_train

In [None]:
# MIN-MAX SCALE THE FIRST 21 COLUMNS

scaler = MinMaxScaler()

# Select the columns by position once instead of spelling out 21 indices.
# Assigning through the column labels replaces each column outright, so the
# integer label-encoded columns simply become float columns; an element-wise
# .iloc write would have to upcast them in place, which pandas has deprecated.
scaled_columns = df_train.columns[:21]
df_train[scaled_columns] = scaler.fit_transform(df_train[scaled_columns])

df_train.head()

In [None]:
# SAVE TO CSV

# Written relative to the working directory, like the input CSVs are read, so
# the cell does not depend on one particular machine's E: drive layout.
df_train.to_csv('data_for_clustering.csv', index=False, header=True)

K-MEANS CLUSTERING

In [None]:
# COMPUTE THE EUCLIDEAN DISTANCE

def euclidian_distance(u, v):
    """Return the Euclidean distance between two coordinate sequences.

    The coordinates are paired with zip, so when the sequences differ in
    length the extra trailing values of the longer one are ignored.
    """
    squared_diffs = [(a - b) ** 2 for a, b in zip(u, v)]
    return sum(squared_diffs) ** 0.5

In [None]:
def kmeans(n_neighbour, n_feat, centroids, data=None):
    """Run k-means until the centroids stop changing and return them.

    Parameters
    ----------
    n_neighbour : int
        Number of clusters (k).
    n_feat : int
        Number of leading values in each row that are features. The value at
        index ``n_feat`` of every row is overwritten with the row's 1-based
        cluster label.
    centroids : list of list of float
        Initial centroids, one ``n_feat``-long list per cluster. The list
        passed in is not modified.
    data : list of rows, optional
        Rows to cluster; each must be writable at index ``n_feat``. Defaults
        to the notebook-level variable ``X`` so that existing three-argument
        calls keep working.

    Returns
    -------
    list of list of float
        The final centroids; index ``c`` belongs to cluster label ``c + 1``.

    Notes
    -----
    A cluster that receives no points keeps its previous centroid. Without
    that rule, duplicate initial centroids (which leave one cluster empty on
    the first pass) would abort the run with a KeyError.
    """
    if data is None:
        data = X  # fall back to the notebook-level data set

    # Repeat assignment + update until an update leaves the centroids unchanged.
    while True:
        # Assignment step: label each row with its nearest centroid and
        # collect the feature vectors per cluster.
        members = [[] for _ in range(n_neighbour)]
        for row in data:
            features = row[:n_feat]
            distances = [
                euclidian_distance(features, centroids[c])
                for c in range(n_neighbour)
            ]
            nearest = int(np.argmin(distances))  # ties go to the lowest index
            row[n_feat] = nearest + 1
            members[nearest].append(features)

        # Update step: each centroid becomes the mean of its members; an empty
        # cluster keeps its previous position.
        new_centroids = [
            np.mean(points, axis=0).tolist() if points else list(centroids[c])
            for c, points in enumerate(members)
        ]

        if centroids == new_centroids:
            return centroids

        # Rebind the local name only, so the caller's list stays untouched.
        centroids = new_centroids

EKSPERIMEN 1 (Menggunakan Fitur SuhuMin dan Hujan)

In [None]:
# SILHOUETTE SCORE FOR SEVERAL VALUES OF K (features: SuhuMin, Hujan)

kn = [2,3,4,5,7]
sil = []
n_feat = 2
# Fixed seed so the random initial centroids, and therefore the scores, are
# the same on every run.
rng = np.random.default_rng(42)

for k in kn:
    # assign() builds a new frame, so the label column is never written into
    # a slice of df_train. kmeans reads this notebook-level X and stores each
    # row's cluster label at index n_feat.
    X = df_train[['SuhuMin','Hujan']].assign(cluster=0).values.tolist()

    # Initial centroids: k distinct points, drawn without replacement from the
    # unique feature rows. This covers the whole data set (randint(0, len(X)-1)
    # can never return the last row) and rules out duplicate centroids, which
    # would leave a cluster empty on the first assignment.
    candidates = np.unique(np.asarray(X)[:, :n_feat], axis=0)
    centroids = candidates[rng.choice(len(candidates), size=k, replace=False)].tolist()

    km = kmeans(k, n_feat, centroids)
    xy = pd.DataFrame(data=X)
    # Columns 0..n_feat-1 are the features, column n_feat the cluster label.
    sc = silhouette_score(xy.iloc[:, :n_feat], xy[n_feat], metric = 'euclidean')
    sil.append(sc)

In [None]:
# Line plot of the silhouette score obtained for each tested K.
fig, ax = plt.subplots()
ax.plot(kn, sil, 'bx-')
ax.set_xlabel('Nilai K')
ax.set_ylabel('Silhoutte Score')
ax.set_title('Silhoutte Score untuk tiap K')
plt.show()

In [None]:
# K = 2

k = 2
n_feat = 2
# Fixed seed so the random initial centroids are the same on every run.
rng = np.random.default_rng(42)

# assign() builds a new frame, so the label column is never written into a
# slice of df_train. kmeans reads this notebook-level X and stores each row's
# cluster label at index n_feat.
X = df_train[['SuhuMin','Hujan']].assign(cluster=0).values.tolist()

# Initial centroids: k distinct points, drawn without replacement from the
# unique feature rows. This covers the whole data set (randint(0, len(X)-1)
# can never return the last row) and rules out duplicate centroids, which
# would leave a cluster empty on the first assignment.
candidates = np.unique(np.asarray(X)[:, :n_feat], axis=0)
centroids = candidates[rng.choice(len(candidates), size=k, replace=False)].tolist()

km = kmeans(k, n_feat, centroids)

In [None]:
# PLOT THE CLUSTERING RESULT

X = np.array(X)
xy = pd.DataFrame(data=X)
# Points coloured by the cluster label stored in column 2.
output = plt.scatter(X[:,0], X[:,1], s = 100, c = xy[2], marker = 'o', alpha = 1)
# km holds the converged centroids returned by kmeans. `centroids` is only
# the random starting guess: kmeans never updates the caller's list, so
# plotting it would show the initial positions instead of the result.
centers = np.array(km)
plt.scatter(centers[:,0], centers[:,1], c='red', s=200, alpha=1, marker='o')
plt.title('Hasil Clustering K-Means')
plt.colorbar(output)
plt.show()

In [None]:
# SILHOUETTE SCORE FOR K = 2

# Columns 0..n_feat-1 of xy hold the features and column n_feat the cluster
# label. The :n_feat slice passes every feature column; an index list of
# [0, n_feat-1] would pass only the first and last one, which is the full
# feature set only when n_feat == 2.
score = silhouette_score(xy.iloc[:, :n_feat], xy[n_feat], metric = 'euclidean')
score

In [None]:
# # K = 3

# X = df_train[['SuhuMin','Hujan']]
# X['cluster'] = 0
# X = X.values.tolist()
# k = 3
# n_feat = 2
# centroids = []
# for i in range(k):
#     rand = np.random.randint(0, len(X)-1)
#     centroids.append(X[rand][:n_feat])
# km = kmeans(k, n_feat, centroids)

In [None]:
# X = np.array(X)
# xy = pd.DataFrame(data=X)
# output = plt.scatter(X[:,0], X[:,1], s = 100, c = xy[2], marker = 'o', alpha = 1, )
# centers = np.array(centroids)
# plt.scatter(centers[:,0], centers[:,1], c='red', s=200, alpha=1 , marker='o');
# plt.title('Hasil Clustering K-Means')
# plt.colorbar (output)
# plt.show()

In [None]:
# # SILHOUETTE SCORE UNTUK K = 3

# score = silhouette_score(xy.iloc[:,[0,n_feat-1]], xy[n_feat], metric = 'euclidean')
# score

In [None]:
# # K = 4

# X = df_train[['SuhuMin','Hujan']]
# X['cluster'] = 0
# X = X.values.tolist()
# k = 4
# n_feat = 2
# centroids = []
# for i in range(k):
#     rand = np.random.randint(0, len(X)-1)
#     centroids.append(X[rand][:n_feat])
# km = kmeans(k, n_feat, centroids)

In [None]:
# X = np.array(X)
# xy = pd.DataFrame(data=X)
# output = plt.scatter(X[:,0], X[:,1], s = 100, c = xy[2], marker = 'o', alpha = 1, )
# centers = np.array(centroids)
# plt.scatter(centers[:,0], centers[:,1], c='red', s=200, alpha=1 , marker='o');
# plt.title('Hasil Clustering K-Means')
# plt.colorbar (output)
# plt.show()

In [None]:
# # SILHOUETTE SCORE UNTUK K = 4

# score = silhouette_score(xy.iloc[:,[0,n_feat-1]], xy[n_feat], metric = 'euclidean')
# score

EKSPERIMEN 2 (Dengan Fitur SinarMatahari dan Penguapan)

In [None]:
# SILHOUETTE SCORE FOR SEVERAL VALUES OF K (features: SinarMatahari, Penguapan)

kn = [2,3,4,5,7]
sil = []
n_feat = 2
# Fixed seed so the random initial centroids, and therefore the scores, are
# the same on every run.
rng = np.random.default_rng(42)

for k in kn:
    # assign() builds a new frame, so the label column is never written into
    # a slice of df_train. kmeans reads this notebook-level X and stores each
    # row's cluster label at index n_feat.
    X = df_train[['SinarMatahari','Penguapan']].assign(cluster=0).values.tolist()

    # Initial centroids: k distinct points, drawn without replacement from the
    # unique feature rows. This covers the whole data set (randint(0, len(X)-1)
    # can never return the last row) and rules out duplicate centroids, which
    # would leave a cluster empty on the first assignment.
    candidates = np.unique(np.asarray(X)[:, :n_feat], axis=0)
    centroids = candidates[rng.choice(len(candidates), size=k, replace=False)].tolist()

    km = kmeans(k, n_feat, centroids)
    xy = pd.DataFrame(data=X)
    # Columns 0..n_feat-1 are the features, column n_feat the cluster label.
    sc = silhouette_score(xy.iloc[:, :n_feat], xy[n_feat], metric = 'euclidean')
    sil.append(sc)

In [None]:
# Line plot of the silhouette score obtained for each tested K.
fig, ax = plt.subplots()
ax.plot(kn, sil, 'bx-')
ax.set_xlabel('Nilai K')
ax.set_ylabel('Silhoutte Score')
ax.set_title('Silhoutte Score untuk tiap K')
plt.show()

In [None]:
# K = 2

k = 2
n_feat = 2
# Fixed seed so the random initial centroids are the same on every run.
rng = np.random.default_rng(42)

# assign() builds a new frame, so the label column is never written into a
# slice of df_train. kmeans reads this notebook-level X and stores each row's
# cluster label at index n_feat.
X = df_train[['SinarMatahari','Penguapan']].assign(cluster=0).values.tolist()

# Initial centroids: k distinct points, drawn without replacement from the
# unique feature rows. This covers the whole data set (randint(0, len(X)-1)
# can never return the last row) and rules out duplicate centroids, which
# would leave a cluster empty on the first assignment.
candidates = np.unique(np.asarray(X)[:, :n_feat], axis=0)
centroids = candidates[rng.choice(len(candidates), size=k, replace=False)].tolist()

km = kmeans(k, n_feat, centroids)

In [None]:
# PLOT THE CLUSTERING RESULT

X = np.array(X)
xy = pd.DataFrame(data=X)
# Points coloured by the cluster label stored in column 2.
output = plt.scatter(X[:,0], X[:,1], s = 100, c = xy[2], marker = 'o', alpha = 1)
# km holds the converged centroids returned by kmeans. `centroids` is only
# the random starting guess: kmeans never updates the caller's list, so
# plotting it would show the initial positions instead of the result.
centers = np.array(km)
plt.scatter(centers[:,0], centers[:,1], c='red', s=200, alpha=1, marker='o')
plt.title('Hasil Clustering K-Means')
plt.colorbar(output)
plt.show()

In [None]:
# SILHOUETTE SCORE FOR K = 2

# Columns 0..n_feat-1 of xy hold the features and column n_feat the cluster
# label. The :n_feat slice passes every feature column; an index list of
# [0, n_feat-1] would pass only the first and last one, which is the full
# feature set only when n_feat == 2.
score = silhouette_score(xy.iloc[:, :n_feat], xy[n_feat], metric = 'euclidean')
score

In [None]:
# # K = 3

# X = df_train[['SinarMatahari','Penguapan']]
# X['cluster'] = 0
# X = X.values.tolist()
# k = 3
# n_feat = 2
# centroids = []
# for i in range(k):
#     rand = np.random.randint(0, len(X)-1)
#     centroids.append(X[rand][:n_feat])
# km = kmeans(k, n_feat, centroids)

In [None]:
# X = np.array(X)
# xy = pd.DataFrame(data=X)
# output = plt.scatter(X[:,0], X[:,1], s = 100, c = xy[2], marker = 'o', alpha = 1, )
# centers = np.array(centroids)
# plt.scatter(centers[:,0], centers[:,1], c='red', s=200, alpha=1 , marker='o');
# plt.title('Hasil Clustering K-Means')
# plt.colorbar (output)
# plt.show()

In [None]:
# # SILHOUETTE SCORE UNTUK K = 3

# score = silhouette_score(xy.iloc[:,[0,n_feat-1]], xy[n_feat], metric = 'euclidean')
# score

In [None]:
# # K = 4

# X = df_train[['SinarMatahari','Penguapan']]
# X['cluster'] = 0
# X = X.values.tolist()
# k = 4
# n_feat = 2
# centroids = []
# for i in range(k):
#     rand = np.random.randint(0, len(X)-1)
#     centroids.append(X[rand][:n_feat])
# km = kmeans(k, n_feat, centroids)

In [None]:
# X = np.array(X)
# xy = pd.DataFrame(data=X)
# output = plt.scatter(X[:,0], X[:,1], s = 100, c = xy[2], marker = 'o', alpha = 1, )
# centers = np.array(centroids)
# plt.scatter(centers[:,0], centers[:,1], c='red', s=200, alpha=1 , marker='o');
# plt.title('Hasil Clustering K-Means')
# plt.colorbar (output)
# plt.show()

In [None]:
# # SILHOUETTE SCORE UNTUK K = 4

# score = silhouette_score(xy.iloc[:,[0,n_feat-1]], xy[n_feat], metric = 'euclidean')
# score