<a href="https://colab.research.google.com/github/txc-000/data-mining/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **NAMA   : FAIDZUL UMAM**
### **NIM    : A11.2023.14913**
### **MATKUL : PENAMBANGAN DATA**
### **KLMPK  : A11.4501**

# **1. Import Library**


In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
# Colab-only step: upload the Excel workbook into the runtime.
# The return value is kept in `uploaded`; the cell output shows the file
# being saved under its original name in the working directory.
from google.colab import files
uploaded = files.upload()

Saving 13-09-2023.xlsx to 13-09-2023.xlsx


# **2. Baca Dataset**


In [3]:
# Load the uploaded workbook into the DataFrame used by every later cell.
excel_path = "13-09-2023.xlsx"
df = pd.read_excel(excel_path)

In [4]:
# Show the first rows of the raw table together with its (rows, columns) shape.
raw_preview = df.head()
raw_shape = df.shape
print("===== Data Awal =====")
print(raw_preview)
print("\nJumlah baris dan kolom:", raw_shape)


===== Data Awal =====
   Kode Daerah BPS           Kab/Kota      Komoditas      13/09/23
0             3301       Kab. Cilacap  Kedelai Lokal  18000.000000
1             3302      Kab. Banyumas  Kedelai Lokal  20000.000000
2             3303   Kab. Purbalingga  Kedelai Lokal  14500.000000
3             3304  Kab. Banjarnegara  Kedelai Lokal  12666.666667
4             3305       Kab. Kebumen  Kedelai Lokal  18000.000000

Jumlah baris dan kolom: (19, 4)


# **3. Cleaning**


In [5]:
# Replace the original headers with short snake_case names.
# The assignment is positional, so the list must match the column order of the sheet.
new_column_names = ['kode_daerah', 'kab_kota', 'komoditas', 'harga']
df.columns = new_column_names

In [6]:
# Remove exact duplicate rows BEFORE imputing, so that repeated records do not
# get extra weight in the mean used to fill missing prices. The explicit
# .copy() detaches the result from the original frame so the column
# assignment below writes to an independent DataFrame.
df = df.drop_duplicates().copy()

# Fill missing values in 'harga' with the mean of the remaining prices.
imputer = SimpleImputer(strategy='mean')
df[['harga']] = imputer.fit_transform(df[['harga']])


In [7]:
# Outlier removal on 'harga' with the 1.5 * IQR rule.
harga_quartiles = df['harga'].quantile([0.25, 0.75])
Q1 = harga_quartiles.loc[0.25]
Q3 = harga_quartiles.loc[0.75]
IQR = Q3 - Q1

# Lower / upper bounds; rows outside [batas_bawah, batas_atas] are discarded.
batas_bawah = Q1 - 1.5 * IQR
batas_atas = Q3 + 1.5 * IQR

# between() is inclusive on both ends, matching the >= / <= comparison.
df = df[df['harga'].between(batas_bawah, batas_atas)]

# **4. Transformasi**


In [15]:
from sklearn.preprocessing import OneHotEncoder


In [16]:
# One-hot encode the two text columns into a dense array.
categorical_cols = ['kab_kota', 'komoditas']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df[categorical_cols])
# Names of the generated indicator columns, prefixed with the source column name.
encoded_cols = encoder.get_feature_names_out(categorical_cols)


In [17]:
# Numeric columns carried over unchanged; the index is reset so the rows line
# up positionally with the freshly built indicator frame.
numeric_part = df[['kode_daerah', 'harga']].reset_index(drop=True)

# Wrap the encoded array in a DataFrame with readable column names.
df_encoded = pd.DataFrame(encoded, columns=encoded_cols)

# Indicator columns first, numeric columns last.
df = pd.concat([df_encoded, numeric_part], axis=1)

In [18]:
# Show the first rows after one-hot encoding.
encoded_preview = df.head()
print("\nData setelah encoding:", encoded_preview, sep="\n")


Data setelah encoding:
   kab_kota_Kab. Banjarnegara  kab_kota_Kab. Blora  kab_kota_Kab. Brebes  \
0                         0.0                  0.0                   0.0   
1                         0.0                  0.0                   0.0   
2                         1.0                  0.0                   0.0   
3                         0.0                  0.0                   0.0   
4                         0.0                  0.0                   0.0   

   kab_kota_Kab. Cilacap  kab_kota_Kab. Demak  kab_kota_Kab. Grobogan  \
0                    1.0                  0.0                     0.0   
1                    0.0                  0.0                     0.0   
2                    0.0                  0.0                     0.0   
3                    0.0                  0.0                     0.0   
4                    0.0                  0.0                     0.0   

   kab_kota_Kab. Jepara  kab_kota_Kab. Kebumen  kab_kota_Kab. Klaten  \
0       

# **5. Reduksi**


In [19]:
# Keep only the one-hot indicator columns plus the price; every other column
# (e.g. 'kode_daerah') is dropped.
encoded_feature_cols = [col for col in df.columns if 'kab_kota' in col or 'komoditas' in col]

# .copy() makes the reduced frame independent of the frame it was sliced from,
# so the column assignments in later cells do not trigger
# SettingWithCopyWarning.
df = df[encoded_feature_cols + ['harga']].copy()


# **6. Diskritisasi**


In [20]:
# Discretise the price into three quantile-based bins, labelled from the
# lowest to the highest bin.
price_labels = ['Rendah', 'Sedang', 'Tinggi']
df['kategori_harga'] = pd.qcut(df['harga'], q=3, labels=price_labels)


# **7. Scaling**


In [21]:
# Standardise 'harga'. The scaler is fitted on all rows of df, i.e. before
# the train/test split performed later in the notebook.
scaler = StandardScaler()
harga_scaled = scaler.fit_transform(df[['harga']])
df[['harga']] = harga_scaled

# **8. Split**


In [22]:
# Target: the price category.
y = df['kategori_harga']

# Features: everything except the target AND 'harga'. 'kategori_harga' was
# produced by binning 'harga', so leaving 'harga' in X would leak the label
# into the features and make any evaluation on this split meaningless.
X = df.drop(columns=['kategori_harga', 'harga'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# **9. Simpan**


In [23]:
# Persist the preprocessed table as CSV (without the index column) and confirm.
output_csv = "data_kedelai_preprocessed.csv"
df.to_csv(path_or_buf=output_csv, index=False)
print("File hasil preprocessing disimpan sebagai data_kedelai_preprocessed.csv")


File hasil preprocessing disimpan sebagai data_kedelai_preprocessed.csv


# **Hasil Akhir**


In [24]:
# Show the first rows of the fully preprocessed table.
final_preview = df.head()
print("\n===== Data Hasil Preprocessing =====", final_preview, sep="\n")


===== Data Hasil Preprocessing =====
   kab_kota_Kab. Banjarnegara  kab_kota_Kab. Blora  kab_kota_Kab. Brebes  \
0                         0.0                  0.0                   0.0   
1                         0.0                  0.0                   0.0   
2                         1.0                  0.0                   0.0   
3                         0.0                  0.0                   0.0   
4                         0.0                  0.0                   0.0   

   kab_kota_Kab. Cilacap  kab_kota_Kab. Demak  kab_kota_Kab. Grobogan  \
0                    1.0                  0.0                     0.0   
1                    0.0                  0.0                     0.0   
2                    0.0                  0.0                     0.0   
3                    0.0                  0.0                     0.0   
4                    0.0                  0.0                     0.0   

   kab_kota_Kab. Jepara  kab_kota_Kab. Kebumen  kab_kota_Kab. Klat

In [25]:
# Colab-only step: offer the CSV written earlier in this notebook as a
# download from the runtime.
from google.colab import files
files.download("data_kedelai_preprocessed.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>