In [1]:
import pandas as pd
import numpy as np
import seaborn as sns                       #visualisation
import matplotlib.pyplot as plt             #visualisation
import os
import time
import pickle
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from xgboost import plot_importance
%matplotlib inline
sns.set(color_codes=True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

PM10: Merupakan singkatan dari Particulate Matter 10 (Mikro Meter). PM10 mengacu pada partikel padat atau cair di udara dengan ukuran 10 mikrometer atau kurang. Partikel ini dapat berasal dari berbagai sumber seperti debu, asap kendaraan, industri, dan lainnya.

PM25: Merupakan singkatan dari Particulate Matter 2.5 (Mikro Meter). PM2.5 adalah partikel padat atau cair di udara dengan ukuran 2.5 mikrometer atau lebih kecil. Partikel ini lebih kecil dari PM10 dan dapat lebih dalam menembus paru-paru manusia, sehingga berpotensi lebih berbahaya bagi kesehatan.

SO2: Merupakan singkatan dari sulfur dioksida. SO2 adalah gas beracun yang dihasilkan dari pembakaran bahan bakar fosil, seperti batu bara dan minyak bumi. Gas ini dapat menyebabkan iritasi pada saluran pernapasan dan berdampak negatif pada kualitas udara.

CO: Merupakan singkatan dari karbon monoksida. CO adalah gas tak berwarna dan tidak berbau yang dihasilkan dari pembakaran tidak sempurna bahan bakar fosil. Pajanan yang tinggi terhadap karbon monoksida dapat menyebabkan masalah kesehatan serius.

O3: Merupakan singkatan dari ozon. Ozon adalah gas beracun yang terbentuk ketika polutan lain, seperti nitrogen dioksida dan senyawa organik teruap, bereaksi dengan sinar matahari. Tingkat ozon yang tinggi di permukaan bumi dapat menyebabkan iritasi paru-paru dan masalah pernapasan.

NO2: Merupakan singkatan dari nitrogen dioksida. NO2 adalah gas beracun yang dihasilkan dari aktivitas manusia, terutama dari pembakaran bahan bakar kendaraan dan industri. Pajanan yang tinggi terhadap nitrogen dioksida dapat menyebabkan masalah pernapasan dan berkontribusi pada pembentukan hujan asam.

Max: Kolom ini mungkin merujuk pada nilai tertinggi dari parameter yang tercatat dalam kolom tersebut pada periode tertentu, seperti dalam rentang waktu harian atau bulanan.

In [2]:
def read_csv_files_from_folder(folder_path):
    all_dataframes = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            all_dataframes[filename] = df
    return all_dataframes

# Ganti 'folder_path' dengan jalur folder yang berisi file-file CSV Anda
folder_path = 'tubes AI'
dataframes = read_csv_files_from_folder(folder_path)

# Sekarang, 'dataframes' berisi semua data yang terbaca dari file-file CSV di folder tersebut.
# Anda dapat mengakses masing-masing DataFrame dengan menggunakan nama file sebagai kunci:
# misalnya: dataframes['nama_file.csv']

In [3]:
# Menggabungkan semua DataFrames dalam satu DataFrame tunggal
train_data = pd.concat(dataframes.values(), ignore_index=True)

# Ubah kolom 'tanggal' menjadi tipe data datetime
train_data['tanggal'] = pd.to_datetime(train_data['tanggal'])

# Mengurutkan DataFrame berdasarkan kolom 'tanggal'
train_data.sort_values(by='tanggal',ignore_index=True, inplace = True)

In [4]:
train_data # Sebelum di apa2in

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,max,critical,categori,location,no3
0,2021-02-01,73,126,38,26,46,34.0,126,PM25,TIDAK SEHAT,DKI5,
1,2021-02-02,53,70,40,14,55,25.0,70,PM25,SEDANG,DKI3,
2,2021-02-03,32,53,40,11,42,19.0,53,PM25,SEDANG,DKI3,
3,2021-02-04,36,59,40,14,47,24.0,59,PM25,SEDANG,DKI5,
4,2021-02-05,29,51,40,14,45,35.0,51,PM25,SEDANG,DKI3,
5,2021-02-06,34,53,40,8,57,15.0,57,O3,SEDANG,DKI2,
6,2021-02-07,33,55,40,10,57,13.0,57,O3,SEDANG,DKI2,
7,2021-02-08,26,44,39,10,54,17.0,54,O3,SEDANG,DKI2,
8,2021-02-09,33,57,40,13,47,22.0,57,PM25,SEDANG,DKI4,
9,2021-02-10,50,64,40,13,49,16.0,64,PM25,SEDANG,DKI3,


# Preposcessing Data

In [5]:
# Checking Tipe Data
train_data.dtypes

tanggal     datetime64[ns]
pm10                 int64
pm25                 int64
so2                  int64
co                   int64
o3                   int64
no2                float64
max                  int64
critical            object
categori            object
location            object
no3                float64
dtype: object

In [6]:
train_data[['critical', 'location','categori']] = train_data[['critical', 'location','categori']].astype('category')
train_data.dtypes
# Ubah tipe data menjadi kategori

tanggal     datetime64[ns]
pm10                 int64
pm25                 int64
so2                  int64
co                   int64
o3                   int64
no2                float64
max                  int64
critical          category
categori          category
location          category
no3                float64
dtype: object

In [7]:
train_data = train_data[['tanggal','pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'critical', 'location', 'categori']]
train_data

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,critical,location,categori
0,2021-02-01,73,126,38,26,46,34.0,PM25,DKI5,TIDAK SEHAT
1,2021-02-02,53,70,40,14,55,25.0,PM25,DKI3,SEDANG
2,2021-02-03,32,53,40,11,42,19.0,PM25,DKI3,SEDANG
3,2021-02-04,36,59,40,14,47,24.0,PM25,DKI5,SEDANG
4,2021-02-05,29,51,40,14,45,35.0,PM25,DKI3,SEDANG
5,2021-02-06,34,53,40,8,57,15.0,O3,DKI2,SEDANG
6,2021-02-07,33,55,40,10,57,13.0,O3,DKI2,SEDANG
7,2021-02-08,26,44,39,10,54,17.0,O3,DKI2,SEDANG
8,2021-02-09,33,57,40,13,47,22.0,PM25,DKI4,SEDANG
9,2021-02-10,50,64,40,13,49,16.0,PM25,DKI3,SEDANG


In [8]:
# Checking Duplicated
train_data.duplicated().sum()

0

In [9]:
# Count missing values per column
missing_values = train_data.isnull().sum()
missing_values

# Count missing values with the percentage
missing_percentage = train_data.isnull().mean() * 100
missing_percentage

# Membuat DataFrame dari hasil
missing_df = pd.DataFrame({'Missing Count': missing_values, 'Missing Percentage': missing_percentage})
missing_df

Unnamed: 0,Missing Count,Missing Percentage
tanggal,0,0.0
pm10,0,0.0
pm25,0,0.0
so2,0,0.0
co,0,0.0
o3,0,0.0
no2,31,10.231023
critical,0,0.0
location,0,0.0
categori,0,0.0


In [10]:
# Hapus missing value dan Diurutkan dengan kolom "tanggalnya"
train_data = train_data.dropna().reset_index(drop=True).sort_values(by='tanggal',ignore_index=True)
train_data

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,critical,location,categori
0,2021-02-01,73,126,38,26,46,34.0,PM25,DKI5,TIDAK SEHAT
1,2021-02-02,53,70,40,14,55,25.0,PM25,DKI3,SEDANG
2,2021-02-03,32,53,40,11,42,19.0,PM25,DKI3,SEDANG
3,2021-02-04,36,59,40,14,47,24.0,PM25,DKI5,SEDANG
4,2021-02-05,29,51,40,14,45,35.0,PM25,DKI3,SEDANG
5,2021-02-06,34,53,40,8,57,15.0,O3,DKI2,SEDANG
6,2021-02-07,33,55,40,10,57,13.0,O3,DKI2,SEDANG
7,2021-02-08,26,44,39,10,54,17.0,O3,DKI2,SEDANG
8,2021-02-09,33,57,40,13,47,22.0,PM25,DKI4,SEDANG
9,2021-02-10,50,64,40,13,49,16.0,PM25,DKI3,SEDANG


In [11]:
train_data.isnull().sum()

tanggal     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
critical    0
location    0
categori    0
dtype: int64

# ambil kolom yang dibutuhkan

In [12]:
train_data = train_data[['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'critical', 'location', 'categori']]

In [13]:
train_data #Tabel Bersihnya

Unnamed: 0,pm10,pm25,so2,co,o3,no2,critical,location,categori
0,73,126,38,26,46,34.0,PM25,DKI5,TIDAK SEHAT
1,53,70,40,14,55,25.0,PM25,DKI3,SEDANG
2,32,53,40,11,42,19.0,PM25,DKI3,SEDANG
3,36,59,40,14,47,24.0,PM25,DKI5,SEDANG
4,29,51,40,14,45,35.0,PM25,DKI3,SEDANG
5,34,53,40,8,57,15.0,O3,DKI2,SEDANG
6,33,55,40,10,57,13.0,O3,DKI2,SEDANG
7,26,44,39,10,54,17.0,O3,DKI2,SEDANG
8,33,57,40,13,47,22.0,PM25,DKI4,SEDANG
9,50,64,40,13,49,16.0,PM25,DKI3,SEDANG


In [14]:
train_data['categori'].value_counts()
#Imbalance Class

SEDANG         153
TIDAK SEHAT    116
BAIK             3
Name: categori, dtype: int64

In [15]:
# Replace 'A' with 'X' and 'B' with 'Y'
replace_dict = {'SEDANG': 'BAIK'}
train_data['categori'] = train_data['categori'].replace(replace_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['categori'] = train_data['categori'].replace(replace_dict)


In [16]:
train_data

Unnamed: 0,pm10,pm25,so2,co,o3,no2,critical,location,categori
0,73,126,38,26,46,34.0,PM25,DKI5,TIDAK SEHAT
1,53,70,40,14,55,25.0,PM25,DKI3,BAIK
2,32,53,40,11,42,19.0,PM25,DKI3,BAIK
3,36,59,40,14,47,24.0,PM25,DKI5,BAIK
4,29,51,40,14,45,35.0,PM25,DKI3,BAIK
5,34,53,40,8,57,15.0,O3,DKI2,BAIK
6,33,55,40,10,57,13.0,O3,DKI2,BAIK
7,26,44,39,10,54,17.0,O3,DKI2,BAIK
8,33,57,40,13,47,22.0,PM25,DKI4,BAIK
9,50,64,40,13,49,16.0,PM25,DKI3,BAIK


In [17]:
column_type_dict = dict(train_data.dtypes)
# column_type_dict
categorical_features = []
numerical_features = []
for key, value in column_type_dict.items():
    if str(value) == "category":
        categorical_features.append(str(key))
    else:
        numerical_features.append(str(key))

categorical_features, numerical_features

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

for col in categorical_features :
    train_data[col] = encoder.fit_transform(train_data[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[col] = encoder.fit_transform(train_data[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[col] = encoder.fit_transform(train_data[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[col] = encoder.fit_transform(train_data[col])


In [18]:
train_data['categori'].value_counts()

0    156
1    116
Name: categori, dtype: int64

In [19]:
train_data

Unnamed: 0,pm10,pm25,so2,co,o3,no2,critical,location,categori
0,73,126,38,26,46,34.0,2,3,1
1,53,70,40,14,55,25.0,2,1,0
2,32,53,40,11,42,19.0,2,1,0
3,36,59,40,14,47,24.0,2,3,0
4,29,51,40,14,45,35.0,2,1,0
5,34,53,40,8,57,15.0,0,0,0
6,33,55,40,10,57,13.0,0,0,0
7,26,44,39,10,54,17.0,0,0,0
8,33,57,40,13,47,22.0,2,2,0
9,50,64,40,13,49,16.0,2,1,0


# Testing

In [20]:
file_path = 'tubes AI\Desember\indeks-standar-pencemar-udara-di-provinsi-dki-jakarta-bulan-desember-tahun-2021.csv'
testing_data = pd.read_csv(file_path)

In [21]:
testing_data

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
0,2021-12-01,63,100,43,13,41,30,100,PM25,SEDANG,DKI4
1,2021-12-02,35,56,42,7,40,14,56,PM25,SEDANG,DKI4
2,2021-12-03,54,71,43,9,48,17,71,PM25,SEDANG,DKI4
3,2021-12-04,50,65,45,13,43,16,65,PM25,SEDANG,DKI3
4,2021-12-05,53,80,44,17,39,29,80,PM25,SEDANG,DKI4
5,2021-12-06,62,91,55,23,45,57,91,PM25,SEDANG,DKI3
6,2021-12-07,179,58,44,12,43,21,179,PM10,TIDAK SEHAT,DKI4
7,2021-12-08,49,76,45,17,51,30,76,PM25,SEDANG,DKI4
8,2021-12-09,46,63,46,11,51,20,63,PM25,SEDANG,DKI4
9,2021-12-10,51,73,47,14,41,30,73,PM25,SEDANG,DKI4


In [22]:
testing_data[['critical', 'location','categori']] = testing_data[['critical', 'location','categori']].astype('category')

In [23]:
testing_data = testing_data[['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'critical', 'location', 'categori']]

In [24]:
testing_data

Unnamed: 0,pm10,pm25,so2,co,o3,no2,critical,location,categori
0,63,100,43,13,41,30,PM25,DKI4,SEDANG
1,35,56,42,7,40,14,PM25,DKI4,SEDANG
2,54,71,43,9,48,17,PM25,DKI4,SEDANG
3,50,65,45,13,43,16,PM25,DKI3,SEDANG
4,53,80,44,17,39,29,PM25,DKI4,SEDANG
5,62,91,55,23,45,57,PM25,DKI3,SEDANG
6,179,58,44,12,43,21,PM10,DKI4,TIDAK SEHAT
7,49,76,45,17,51,30,PM25,DKI4,SEDANG
8,46,63,46,11,51,20,PM25,DKI4,SEDANG
9,51,73,47,14,41,30,PM25,DKI4,SEDANG


In [25]:
column_type_dict = dict(testing_data.dtypes)
# column_type_dict
categorical_features = []
numerical_features = []
for key, value in column_type_dict.items():
    if str(value) == "category":
        categorical_features.append(str(key))
    else:
        numerical_features.append(str(key))

categorical_features, numerical_features

(['critical', 'location', 'categori'],
 ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2'])

In [26]:
for col in categorical_features :
    testing_data[col] = encoder.fit_transform(testing_data[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing_data[col] = encoder.fit_transform(testing_data[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing_data[col] = encoder.fit_transform(testing_data[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing_data[col] = encoder.fit_transform(testing_data[col])


In [27]:
X_test = testing_data[['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'critical', 'location']]
y_test = testing_data['categori']

In [28]:
X = train_data[['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'critical', 'location']]
y = train_data['categori']

In [29]:
from imblearn.over_sampling import SMOTE

# Assuming you have your feature matrix 'X' and target vector 'y'
# X should be a 2D array, and y should be a 1D array or pandas Series

# Create the SMOTE object with a reduced number of neighbors (e.g., 3)
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=2)

# Fit and apply SMOTE to the data
X_resampled, y_resampled = smote.fit_resample(X, y)

In [30]:
y_resampled.value_counts()

1    156
0    156
Name: categori, dtype: int64

In [31]:
from collections import Counter

# Get the value counts
Counter(y_resampled)

Counter({1: 156, 0: 156})

In [32]:

# Get the value counts
Counter(y_test)

Counter({0: 25, 1: 6})

In [33]:
X_resampled.head()

Unnamed: 0,pm10,pm25,so2,co,o3,no2,critical,location
0,73,126,38,26,46,34.0,2,3
1,53,70,40,14,55,25.0,2,1
2,32,53,40,11,42,19.0,2,1
3,36,59,40,14,47,24.0,2,3
4,29,51,40,14,45,35.0,2,1


In [34]:
from sklearn.model_selection import GridSearchCV
# Import MLPClassifer 
from sklearn.neural_network import MLPClassifier

In [35]:
# param_grid = {
#     'hidden_layer_sizes': [(30,),(40,), (50,), (60,),(70,)],
#     'max_iter': [10, 15, 20, 25, 30, 35, 40],
#     'alpha': [1e-4, 1e-5],
#     'learning_rate_init': [0.1, 0.2, 0.3],
#     'verbose':[10,20],
#     'random_state' : [7],
#     'solver' : ["sgd"]
# }

param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (150,), (50, 50), (100, 50), (100, 100)],
    'activation': ['relu', 'logistic', 'tanh'],  # Activation functions to try
    'solver': ['adam', 'sgd'],  # Optimizers to try
    'alpha': [0.0001, 0.0005, 0.001],  # L2 regularization parameter
    'learning_rate_init': [0.01, 0.001],  # Initial learning rate
    'max_iter': [100, 200, 300],  # Maximum number of iterations
    'batch_size': [16, 32, 64],  # Batch size for gradient descent
    'random_state': [42],
    'early_stopping': [True],  # To enable early stopping based on validation performance
    'validation_fraction': [0.1],  # Fraction of training data to set aside as validation set
    'verbose': [False],  # Set to True for more verbose output during training
}

In [36]:
# Create the MLPClassifier object
clf = MLPClassifier(
    solver="sgd",
    verbose=18,
    random_state=7,
)

# Create the GridSearchCV object
grid_search = GridSearchCV(clf, param_grid, cv=25, n_jobs=-1)

In [37]:
# Measure the start time
start_time = time.time()

# Fit the model to the training data
grid_search.fit(X, y)

# Measure the end time
end_time = time.time()

# Calculate the total running time
running_time = end_time - start_time

In [38]:
print("Total Running Time: {:.2f} seconds".format(running_time))

Total Running Time: 816.19 seconds


In [39]:
# best_params = grid_search.best_params

validation test

In [40]:
# Make prediction on test dataset
ypred=grid_search.predict(X)

# Import accuracy score 
from sklearn.metrics import accuracy_score

# Calcuate accuracy
accuracy_score(y,ypred)

0.8823529411764706

hasil akurasi terhadap data test(data desember)

In [41]:
# Make prediction on test dataset
y_test_pred = grid_search.predict(X_test)
print(y_test_pred)
# # Import accuracy score 
# from sklearn.metrics import accuracy_score

# Calcuate accuracy
accuracy_score(y_test,y_test_pred)
print(f"hasil akurasi {accuracy_score(y_test,y_test_pred)}")

[1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0]
hasil akurasi 0.9032258064516129


In [42]:
y_test

0     0
1     0
2     0
3     0
4     0
5     0
6     1
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    1
15    1
16    0
17    0
18    1
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    1
27    0
28    0
29    1
30    0
Name: categori, dtype: int32

In [43]:
X_test['category'] = y_test
g= []
for i in X_test['category']:
    if i == 0:
        g.append("Baik")
    else:
        g.append("Tidak Sehat")
X_test['category prediksi'] = y_test_pred
c= []
for i in X_test['category prediksi']:
    if i == 0:
        c.append("Baik")
    else:
        c.append("Tidak Sehat")
X_test['category']=g
X_test['category prediksi']=c
hasil = X_test
hasil

Unnamed: 0,pm10,pm25,so2,co,o3,no2,critical,location,category,category prediksi
0,63,100,43,13,41,30,1,3,Baik,Tidak Sehat
1,35,56,42,7,40,14,1,3,Baik,Baik
2,54,71,43,9,48,17,1,3,Baik,Baik
3,50,65,45,13,43,16,1,2,Baik,Baik
4,53,80,44,17,39,29,1,3,Baik,Baik
5,62,91,55,23,45,57,1,2,Baik,Baik
6,179,58,44,12,43,21,0,3,Tidak Sehat,Tidak Sehat
7,49,76,45,17,51,30,1,3,Baik,Baik
8,46,63,46,11,51,20,1,3,Baik,Baik
9,51,73,47,14,41,30,1,3,Baik,Baik


In [44]:
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'activation': 'tanh', 'alpha': 0.0005, 'batch_size': 64, 'early_stopping': True, 'hidden_layer_sizes': (50,), 'learning_rate_init': 0.001, 'max_iter': 100, 'random_state': 42, 'solver': 'adam', 'validation_fraction': 0.1, 'verbose': False}
