<a href="https://colab.research.google.com/github/yasinnerten/graduation-thesis/blob/main/Metro_Istanbul_Usage_Prediciton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
!pip install sktime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import tensorflow as tf

print('done')

done


In [3]:
# Load the data
data = pd.read_csv("/content/hourly_transportation_202409.csv")
data

Unnamed: 0,transition_date,transition_hour,transport_type_id,road_type,line,transfer_type,number_of_passage,number_of_passenger,product_kind,transaction_type_desc,town,line_name,station_poi_desc_cd
0,2024-09-01,0,2,RAYLI,KIRAZLI-BASAKSEHIR/METROKENT,Normal,3.0,3.0,TAM,Tam Kontur,BASAKSEHIR,M3,IKITELLI SANAYI
1,2024-09-01,0,2,RAYLI,TOPKAPI-HABIBLER,Normal,1.0,1.0,UCRETSIZ,Ucretsiz,SULTANGAZI,T4,HACI SUKRU
2,2024-09-01,0,2,RAYLI,HALKALI - GEBZE,Normal,1.0,1.0,TAM,Tam Kontur,FATIH,MARMARAY,YENIKAPI-1
3,2024-09-01,0,3,DENİZ,ADALAR,Normal,1.0,1.0,,Tam Kontur,ADALAR,MT-ADA,BUYUKADA
4,2024-09-01,0,2,RAYLI,HALKALI - GEBZE,Normal,1.0,1.0,,Tam Kontur,PENDIK,MARMARAY,PENDIK
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2848444,2024-09-09,6,2,RAYLI,HALKALI - GEBZE,Normal,29.0,29.0,TAM,Tam Abonman,PENDIK,MARMARAY,KAYNARCA
2848445,2024-09-09,6,1,OTOYOL,USKUDAR-SENEVLER,Normal,2.0,2.0,UCRETSIZ,Ucretsiz,BEYKOZ,15ŞN,
2848446,2024-09-09,6,2,RAYLI,YENIKAPI - HACIOSMAN,Normal,6.0,6.0,UCRETSIZ,Ucretsiz,FATIH,M2,HALIC GUNEY
2848447,2024-09-09,6,2,RAYLI,YENIKAPI - HAVALIMANI,Normal,1.0,1.0,UCRETSIZ,Ucretsiz,FATIH,M1,EMNIYET


In [4]:
# Hyperparameter settings
epochs = 15
batch_size = 32
learning_rate = 0.001

In [None]:
filtered_data = data[data['transport_type_id'] == 2].drop(columns=['road_type'], errors='ignore')
data = filtered_data.drop(columns=['transport_type_id'], errors='ignore')
data

In [None]:
# Tarih sırasına göre veriyi son iki haftaya (14 güne) indirecek şekilde filtreleme
last_two_weeks_data = data.sort_values(by='transition_date', ascending=False).head(14 * 24)  # 14 gün x 24 saat

# Yeni veriyi CSV olarak kaydetme
last_two_weeks_data.to_csv("last_two_weeks_data.csv", index=False)

print("Son iki haftalık veri 'last_two_weeks_data.csv' olarak kaydedildi.")

In [17]:
last_two_weeks_data

Unnamed: 0,transition_date,transition_hour,line,transfer_type,number_of_passage,number_of_passenger,product_kind,transaction_type_desc,town,line_name,station_poi_desc_cd
0,2024-09-01,0,KIRAZLI-BASAKSEHIR/METROKENT,Normal,3,3,TAM,Tam Kontur,BASAKSEHIR,M3,IKITELLI SANAYI
1,2024-09-01,0,TOPKAPI-HABIBLER,Normal,1,1,UCRETSIZ,Ucretsiz,SULTANGAZI,T4,HACI SUKRU
2,2024-09-01,0,HALKALI - GEBZE,Normal,1,1,TAM,Tam Kontur,FATIH,MARMARAY,YENIKAPI-1
4,2024-09-01,0,HALKALI - GEBZE,Normal,1,1,,Tam Kontur,PENDIK,MARMARAY,PENDIK
8,2024-09-01,0,HALKALI - GEBZE,Normal,1,1,INDIRIMLI1,Indirimli Abonman,TUZLA,MARMARAY,AYDINTEPE
...,...,...,...,...,...,...,...,...,...,...,...
3958324,2024-09-30,0,USKUDAR-CEKMEKOY,Normal,4,4,INDIRIMLI1,Indirimli Abonman,USKUDAR,M5,ALTUNIZADE 1
3958344,2024-09-30,0,YENIKAPI - HACIOSMAN,Normal,4,4,INDIRIMLI1,Indirimli Abonman,SISLI,M2,ITU GUNEY
3958374,2024-09-30,0,KABATAS-MAHMUTBEY,Aktarma,1,1,UCRETSIZ,Tam Aktarma,,M7,ALIBEYKOY DOGU
3958381,2024-09-30,0,YENIKAPI - HACIOSMAN,Normal,2,1,TAM,Tam Kontur,FATIH,M2,HALIC GUNEY


In [18]:
# Tarih formatı ve eksik değerleri doldurma
data['transition_date'] = pd.to_datetime(data['transition_date'])
data.fillna(method='ffill', inplace=True)

# Feature engineering: create date-related features
data['hour'] = data['transition_date'].dt.hour
data['day_of_week'] = data['transition_date'].dt.dayofweek

# Define features and target
categorical_features = ['hour', 'day_of_week']
numerical_features = ['number_of_passage']
X = data[categorical_features + numerical_features]
y = data['number_of_passenger']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)  # Use drop='first' to avoid redundancy
    ]
)

# Apply preprocessing
X_processed = preprocessor.fit_transform(X)

# Split data into station-based sequences
stations = data['station_poi_desc_cd'].unique()
X_stationwise = []
y_stationwise = []

  data.fillna(method='ffill', inplace=True)


In [None]:
for station in stations:
    station_data = data[data['station_poi_desc_cd'] == station].copy()
    station_features = preprocessor.transform(station_data[categorical_features + numerical_features])

    station_series_X = []
    station_series_y = []

    # 168 saatlik ardışık verileri giriş ve çıkış olarak tanımlama
    for i in range(len(station_features) - 168 * 2):
        station_series_X.append(station_features[i:i + 168])
        station_series_y.append(station_features[i + 168:i + 168 * 2, 0])  # Sadece yolcu sayısını hedef olarak kullanıyoruz

    X_stationwise.extend(station_series_X)
    y_stationwise.extend(station_series_y)

# Numpy dizisine çevirme
X, y = np.array(X_stationwise), np.array(y_stationwise)

In [None]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Define LSTM model with Bidirectional LSTM layers
input_layer = Input(shape=(X_train.shape[1], X_train.shape[2]))
x = LSTM(64, return_sequences=True)(input_layer)
x = Dropout(0.2)(x)
x = LSTM(32)(x)
x = Dropout(0.2)(x)
output_layer = Dense(168, activation='linear')(x)  # Predicting 168 hourly values

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')

# Early stopping to prevent overfitting
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [None]:
# Train the model
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, callbacks=[early_stopping])


In [None]:
# Evaluate the model on test data with RMSE
y_test_pred = model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test.flatten(), y_test_pred.flatten()))
print("RMSE on Test Set:", test_rmse)

In [None]:
# Plotting function for actual vs. predicted values with error visualization
def plot_results(actual, predicted, title):
    plt.figure(figsize=(12, 6))
    for i in range(actual.shape[0]):
        plt.plot(range(i * 168, (i + 1) * 168), actual[i], label=f"Actual Day {i + 1}", marker='o')
        plt.plot(range(i * 168, (i + 1) * 168), predicted[i], label=f"Predicted Day {i + 1}", marker='x')
    plt.title(title)
    plt.xlabel("Hours")
    plt.ylabel("Number of Passengers")
    plt.legend()
    plt.show()

In [None]:
# Visualize predictions on test set
plot_results(y_test, y_test_pred, "Model Evaluation on Test Set (Hourly Predictions)")

In [None]:
#EOF

In [None]:
#checkpoint = ModelCheckpoint('/kaggle/working/model_checkpoint.h5.keras', save_best_only=True, monitor='val_loss', mode='min')

In [None]:
#model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_val, y_val), callbacks=[checkpoint])

In [None]:
# Load the best model
#model.load_weights('/kaggle/working/model_checkpoint.h5.keras')

In [None]:
# Make predictions on the test set
#predictions = model.predict(X_test)