# **LOAD DATA**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive at /content/drive so files stored there can be read by path.
# The google.colab module is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the Munich detector CSV on the mounted Drive.
DATASET_PATH = "/content/drive/MyDrive/Dataset_DSA/munich.csv"

# Load the raw data and preview the first rows.
df = pd.read_csv(DATASET_PATH)
display(df.head())

Unnamed: 0,day,interval,detid,flow,occ,error,city,speed
0,2017-02-14,600,4118014,57.3,0.005,,munich,
1,2017-02-14,900,4118014,10.0,0.0,1.0,munich,
2,2017-02-14,1200,4118014,12.3,0.0,1.0,munich,
3,2017-02-14,1500,4118014,19.8,0.0,1.0,munich,
4,2017-02-14,1800,4118014,24.6,0.0,1.0,munich,


# **Extra Trees - Alur Lalu Lintas**
> * Untuk prediksi `flow` berdasarkan `interval`
> * Drop kolom berikut untuk memprediksi
> > * `day`
> > * `detid`
> > * `speed`
> > * `city`
> > * `occ`
>
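> Setelah itu baris dengan nilai `error` = 1 (kesalahan pengukuran) dihapus, sedangkan baris dengan `error` kosong (NaN) tetap dipertahankan. Outlier pada `flow` kemudian dibuang dengan metode IQR dan kolom `error` di-drop, sehingga diperoleh dataset baru yang bersih dan tidak ada kesalahan pengukuran.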

In [None]:
import numpy as np

# Keep only 'interval', 'flow' and 'error'; all other columns are dropped for the flow model.
df_cleaned = df.drop(columns=['day', 'detid', 'speed', 'city', 'occ'])

# Remove rows flagged with error == 1; rows whose 'error' is NaN are retained.
is_error_row = df_cleaned['error'].eq(1)
df_cleaned_2 = df_cleaned.loc[~is_error_row]

# IQR-based outlier removal on 'flow' (fences at 1.5 * IQR beyond the quartiles).
Q1_flow, Q3_flow = df_cleaned_2['flow'].quantile([0.25, 0.75])
IQR_flow = Q3_flow - Q1_flow
lower_bound_flow = Q1_flow - 1.5 * IQR_flow
upper_bound_flow = Q3_flow + 1.5 * IQR_flow

# between() is inclusive on both ends, so values exactly on a fence are kept.
flow_within_fences = df_cleaned_2['flow'].between(lower_bound_flow, upper_bound_flow)
df_cleaned_2 = df_cleaned_2.loc[flow_within_fences]

display(df_cleaned_2.head())

# The 'error' flag is no longer needed once the flagged rows are gone.
df_cleaned_flow = df_cleaned_2.drop(columns='error')

# Preview the final flow dataset.
print("\nDrop Error and Outliers from 'flow' column")
display(df_cleaned_flow.head())

Unnamed: 0,interval,flow,error
0,600,57.3,
7,2700,0.0,
10,3600,0.0,
12,4200,0.0,
13,4500,0.0,



Drop Error and Outliers from 'flow' column


Unnamed: 0,interval,flow
0,600,57.3
7,2700,0.0
10,3600,0.0
12,4200,0.0
13,4500,0.0


In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is 'flow'.
X = df_cleaned_flow.drop(columns='flow')
y = df_cleaned_flow['flow']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Report the size of each partition.
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}:", split.shape)

Shape of X_train: (61840, 1)
Shape of X_test: (26504, 1)
Shape of y_train: (61840,)
Shape of y_test: (26504,)


In [None]:
from sklearn.ensemble import ExtraTreesRegressor
import joblib

# 1. Feature and target over the full cleaned flow data.
#    The 'error' column has already been removed from df_cleaned_flow.
X = df_cleaned_flow[['interval']]
y_flow = df_cleaned_flow['flow']

# 2. Train Extra Trees (flow) on the training split only.
#    Fitting on the full data would include every X_test row, so the
#    evaluation that follows would be scored on data the model has already seen.
print("Melatih Extra Trees Flow...")
et_flow = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)
et_flow.fit(X_train, y_train)



Melatih Extra Trees Flow...


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict flow for the held-out test rows.
y_pred = et_flow.predict(X_test)

# Score the predictions with MAE, MSE and R^2 (in that order).
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report the scores; R^2 is additionally shown as a percentage.
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")
print(f"Akurasi R-squared (R2) model adalah: {r2*100:.2f}%")

Mean Absolute Error (MAE): 106.13
Mean Squared Error (MSE): 21324.35
R-squared (R2): 0.38
Akurasi R-squared (R2) model adalah: 37.90%


# **Extra Trees - Okupansi Jalan**
> * Untuk prediksi `occ` berdasarkan `interval`
> * Drop kolom berikut untuk memprediksi
> > * `day`
> > * `detid`
> > * `speed`
> > * `city`
> > * `flow`
>
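> Setelah itu baris dengan nilai `error` = 1 (kesalahan pengukuran) dihapus, sedangkan baris dengan `error` kosong (NaN) tetap dipertahankan. Outlier pada `occ` kemudian dibuang dengan metode IQR dan kolom `error` di-drop, sehingga diperoleh dataset baru yang bersih dan tidak ada kesalahan pengukuran.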

In [None]:
# Keep only 'interval', 'occ' and 'error'; all other columns are dropped for the occupancy model.
df_cleaned = df.drop(columns=['day', 'detid', 'speed', 'city', 'flow'])

# Remove rows flagged with error == 1; rows whose 'error' is NaN are retained.
is_error_row = df_cleaned['error'].eq(1)
df_cleaned_2 = df_cleaned.loc[~is_error_row]
display(df_cleaned_2.head())

# IQR-based outlier removal on 'occ' (fences at 1.5 * IQR beyond the quartiles).
Q1_occ, Q3_occ = df_cleaned_2['occ'].quantile([0.25, 0.75])
IQR_occ = Q3_occ - Q1_occ
lower_bound_occ = Q1_occ - 1.5 * IQR_occ
upper_bound_occ = Q3_occ + 1.5 * IQR_occ

# between() is inclusive on both ends, so values exactly on a fence are kept.
occ_within_fences = df_cleaned_2['occ'].between(lower_bound_occ, upper_bound_occ)
df_cleaned_2 = df_cleaned_2.loc[occ_within_fences]

# The 'error' flag is no longer needed once the flagged rows are gone.
df_cleaned_occ = df_cleaned_2.drop(columns='error')

# Preview the final occupancy dataset.
print("\nDrop Error and Outliers from 'occ' column")
display(df_cleaned_occ.head())

Unnamed: 0,interval,occ,error
0,600,0.005,
7,2700,0.0,
10,3600,0.0,
12,4200,0.0,
13,4500,0.0,



Drop Error and Outliers from 'occ' column


Unnamed: 0,interval,occ
0,600,0.005
7,2700,0.0
10,3600,0.0
12,4200,0.0
13,4500,0.0


In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is 'occ'.
X = df_cleaned_occ.drop(columns='occ')
y = df_cleaned_occ['occ']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Report the size of each partition.
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}:", split.shape)

Shape of X_train: (60749, 1)
Shape of X_test: (26036, 1)
Shape of y_train: (60749,)
Shape of y_test: (26036,)


In [None]:
# 3. Train Extra Trees (occupancy).
# Feature and target over the full cleaned occupancy data.
X = df_cleaned_occ[['interval']]
y_occ = df_cleaned_occ['occ']
print("Melatih Extra Trees Occupancy...")
et_occ = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)
# Fit on the training split only: fitting on the full data would include every
# X_test row, so the evaluation that follows would be scored on seen data.
et_occ.fit(X_train, y_train)

Melatih Extra Trees Occupancy...


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict occupancy for the held-out test rows.
y_pred = et_occ.predict(X_test)

# Score the predictions with MAE, MSE and R^2 (in that order).
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report the scores; R^2 is additionally shown as a percentage.
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")
print(f"Akurasi R-squared (R2) model adalah: {r2*100:.2f}%")

Mean Absolute Error (MAE): 0.04
Mean Squared Error (MSE): 0.00
R-squared (R2): 0.32
Akurasi R-squared (R2) model adalah: 31.75%


# **Klasifikasi Kemacetan**

> Menggunakan teori fuzzifikasi untuk klasifikasi dengan rules tertentu berdasarkan grafik lineplot dari,
> * `interval` vs `flow`
> * `interval` vs `occ`

In [None]:
!pip install scikit-fuzzy

Collecting scikit-fuzzy
  Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl (920 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/920.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.2/920.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m920.8/920.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-fuzzy
Successfully installed scikit-fuzzy-0.5.0


In [None]:
import skfuzzy as fuzz
import numpy as np
from skfuzzy import control as ctrl
import skfuzzy as fuzz
from skfuzzy import control as ctrl

def _first_scalar(value):
  """Return ``value`` as a float, taking the first element of an array-like.

  Accepts a plain number as well as arrays/lists (e.g. the one-element array
  returned by ``model.predict``). An empty array-like raises ``IndexError``.
  """
  return float(np.asarray(value, dtype=float).ravel()[0])


def _build_congestion_system():
  """Build the fuzzy control system that maps (flow, occ) to a congestion score."""
  # Universes of discourse.
  # Flow: 0..1000 (the original author notes the observed maximum is around 750).
  flow = np.arange(0, 1001, 1)
  # Occupancy: 0..1, treated as a ratio.
  occ = np.arange(0, 1.01, 0.01)
  # Congestion severity: 0..100.
  congestion = np.arange(0, 101, 1)

  # Antecedent/Consequent objects hold the universes and their membership functions.
  flow_antecedent = ctrl.Antecedent(flow, 'flow')
  occ_antecedent = ctrl.Antecedent(occ, 'occ')
  congestion_consequent = ctrl.Consequent(congestion, 'congestion')

  # Triangular membership functions for 'flow'.
  flow_antecedent['low'] = fuzz.trimf(flow, [0, 0, 300])
  flow_antecedent['medium'] = fuzz.trimf(flow, [200, 450, 700])
  flow_antecedent['high'] = fuzz.trimf(flow, [600, 1000, 1000])

  # Triangular membership functions for 'occ'.
  occ_antecedent['low'] = fuzz.trimf(occ, [0, 0, 0.3])
  occ_antecedent['medium'] = fuzz.trimf(occ, [0.2, 0.5, 0.8])
  occ_antecedent['high'] = fuzz.trimf(occ, [0.7, 1, 1])

  # Triangular membership functions for 'congestion'.
  congestion_consequent['Lancar'] = fuzz.trimf(congestion, [0, 0, 40])
  congestion_consequent['Padat Merayap'] = fuzz.trimf(congestion, [30, 60, 90])
  congestion_consequent['Macet'] = fuzz.trimf(congestion, [80, 100, 100])

  # Rule base as (flow term, occ term, resulting congestion term).
  # Every flow/occ combination is covered exactly once.
  rule_table = [
      ('high', 'low', 'Lancar'),
      ('medium', 'low', 'Lancar'),
      ('low', 'low', 'Lancar'),
      ('high', 'medium', 'Padat Merayap'),
      ('medium', 'medium', 'Padat Merayap'),
      ('low', 'medium', 'Macet'),
      ('high', 'high', 'Padat Merayap'),
      ('medium', 'high', 'Padat Merayap'),
      ('low', 'high', 'Macet'),
  ]
  rules = [
      ctrl.Rule(
          flow_antecedent[flow_term] & occ_antecedent[occ_term],
          congestion_consequent[congestion_term],
      )
      for flow_term, occ_term, congestion_term in rule_table
  ]

  return ctrl.ControlSystem(rules)


def fuzzydefining(predicted_flow, predicted_occ):
  """Classify congestion from a predicted flow and occupancy and print the result.

  Args:
    predicted_flow: Predicted flow, as a number or an array-like whose first
      element is used. The fuzzy universe for flow spans 0..1000.
    predicted_occ: Predicted occupancy, as a number or an array-like whose
      first element is used. The fuzzy universe for occupancy spans 0..1.

  Returns:
    None. The flow and occupancy percentages, the status label
    ("Lancar", "Padat Merayap" or "Macet") and the defuzzified congestion
    level are printed.
  """
  # Normalise both inputs once so the simulation and the printout agree and
  # plain scalars work as well as one-element arrays.
  flow_value = _first_scalar(predicted_flow)
  occ_value = _first_scalar(predicted_occ)

  congestion_simulation = ctrl.ControlSystemSimulation(_build_congestion_system())
  congestion_simulation.input['flow'] = flow_value
  congestion_simulation.input['occ'] = occ_value

  # Evaluate the rules and defuzzify.
  congestion_simulation.compute()
  congestion_level = congestion_simulation.output['congestion']

  # Map the numeric congestion level to a categorical label.
  if congestion_level <= 30:
      congestion_status = "Lancar"
  elif congestion_level <= 70:
      congestion_status = "Padat Merayap"
  else:
      congestion_status = "Macet"

  # Flow is shown relative to the 0..1000 universe (value / 10 == percent of 1000).
  print(f"Persentase Arus Lalu Lintas : {flow_value/10:.2f}%")
  # Occupancy is a 0..1 ratio, so it is scaled by 100 to be shown as a percentage.
  print(f"Persentase Okupansi Jalan   : {occ_value*100:.2f}%")
  print(f"Status Jalanan: {congestion_status}")
  print(f"Dengan Level sebesar : {congestion_level:.2f}")

# **Input dari User**

> Input agar user mendapatkan prediksi apakah pada jam tertentu terdapat kemacetan

In [None]:
import pandas as pd
import numpy as np

# --- Read the time of day from the user and predict flow and occupancy ---
print("--- Prediksi Input Dari User ---")
interval_input = float(input("Interval dalam detik (0 - 86400): "))

# The prompt promises a value within one day in seconds; reject anything else
# (including NaN) instead of silently passing it to the models.
if not 0 <= interval_input <= 86400:
    raise ValueError(
        f"Interval harus di antara 0 dan 86400 detik, diterima: {interval_input}"
    )

# Placeholder kept from the original cell; neither model takes 'error' as input.
error_input = np.nan

# Single-row frame with the same 'interval' column the models were trained on.
interval_user_input = pd.DataFrame([{
    'interval': interval_input,
}])

# Predict with both models and classify the congestion level.
predicted_flow = et_flow.predict(interval_user_input)
predicted_occ = et_occ.predict(interval_user_input)
fuzzydefining(predicted_flow, predicted_occ)

--- Prediksi Input Dari User ---
Interval dalam detik (0 - 86400): 25500
Persentase Arus Lalu Lintas : 27.70%
Persentase Okupansi Jalan   : 0.09%
Status Jalanan: Lancar
Dengan Level sebesar : 17.11


In [None]:
# Persist both trained models to the current working directory.
for model_filename, trained_model in (
    ('model_flow_et.pkl', et_flow),
    ('model_occ_et.pkl', et_occ),
):
    joblib.dump(trained_model, model_filename)

print("✅ Model Extra Trees berhasil disimpan! Silakan download 2 file .pkl tersebut.")

✅ Model Extra Trees berhasil disimpan! Silakan download 2 file .pkl tersebut.
