Gerekli Kütüphanelerin Eklenmesi

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix


Büyük Veri Setinin Parçalar Halinde Okunması ve İlk Parçanın İncelenmesi

In [2]:
import os

# Location of the merged monthly CSV. The original absolute path stays as the
# default so existing runs are unchanged; set MERGED_MONTHS_CSV to point the
# notebook at the file on any other machine.
file_path = os.environ.get(
    "MERGED_MONTHS_CSV",
    "/Users/harunnacar/Desktop/bu/merged_all_months.csv",
)

# Number of rows per chunk. With `chunksize` set, read_csv returns an iterator
# of DataFrames instead of loading the whole file at once.
chunksize = 200000
chunks = pd.read_csv(file_path, chunksize=chunksize)

# Only the first chunk is materialised here; the remaining rows stay unread
# behind `chunks`. Everything downstream that uses `df` therefore works on the
# first `chunksize` rows of the file, not on the full data set.
df = next(chunks)
df.head()


Unnamed: 0,date,hour,route_code,stop_code,passenger_count,vehicle_type,district,is_outlier,is_peak_hour,day_of_week,is_weekend,mean_passenger_by_route_hour,month
0,2024-08-01,0,MESCIDI SELAM-ARNAVUTKOY-ISTANBUL HAVA LIMANI,OTOYOL,1,1,ATASEHIR,0,0,3,0,1.782609,bilinmeyen_ay
1,2024-08-01,0,MECIDIYEKOY-ISTANBUL HAVALIMANI,OTOYOL,1,1,ATASEHIR,0,0,3,0,1.2,bilinmeyen_ay
2,2024-08-01,0,KOC UNV. RUMELIFENER KAMPUSU - TAKSIM,OTOYOL,3,1,ATASEHIR,0,0,3,0,1.666667,bilinmeyen_ay
3,2024-08-01,0,HACI OSMAN METRO - RUMELI FENERI,OTOYOL,1,1,ATASEHIR,0,0,3,0,1.538462,bilinmeyen_ay
4,2024-08-01,0,USKUDAR-GUZELTEPE-UMRANIYE DEVLET HASTANESI,OTOYOL,3,1,BAKIRKOY,0,0,3,0,4.258065,bilinmeyen_ay


DataFrame Sütun İsimlerinin Görüntülenmesi

In [3]:
# Show the column labels of the loaded chunk.
df.columns


Index(['date', 'hour', 'route_code', 'stop_code', 'passenger_count',
       'vehicle_type', 'district', 'is_outlier', 'is_peak_hour', 'day_of_week',
       'is_weekend', 'mean_passenger_by_route_hour', 'month'],
      dtype='object')

Özellik Seçimi ve Eksik Veri Temizliği

In [4]:
# Column the classifiers will predict.
target = 'vehicle_type'

# Input columns, in the order they are handed to the models.
features = [
    'hour', 'route_code', 'district', 'day_of_week',
    'is_weekend', 'is_peak_hour', 'mean_passenger_by_route_hour',
]

# Keep the inputs plus the label, then discard every row that has a missing
# value in any of those columns.
selected_columns = [*features, target]
df = df[selected_columns].dropna()

df.head()


Unnamed: 0,hour,route_code,district,day_of_week,is_weekend,is_peak_hour,mean_passenger_by_route_hour,vehicle_type
0,0,MESCIDI SELAM-ARNAVUTKOY-ISTANBUL HAVA LIMANI,ATASEHIR,3,0,0,1.782609,1
1,0,MECIDIYEKOY-ISTANBUL HAVALIMANI,ATASEHIR,3,0,0,1.2,1
2,0,KOC UNV. RUMELIFENER KAMPUSU - TAKSIM,ATASEHIR,3,0,0,1.666667,1
3,0,HACI OSMAN METRO - RUMELI FENERI,ATASEHIR,3,0,0,1.538462,1
4,0,USKUDAR-GUZELTEPE-UMRANIYE DEVLET HASTANESI,BAKIRKOY,3,0,0,4.258065,1


Kategorik Verilerin Sayısal Kodlanması (Label Encoding)

In [5]:
from sklearn.preprocessing import LabelEncoder

# Text columns that are replaced by integer codes.
categorical_columns = ['route_code', 'district']

# One fitted encoder per column. Reusing a single encoder would overwrite the
# route_code mapping as soon as it is refit on district, leaving no way to
# decode route codes or to apply the same codes to further chunks.
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# After the loop `le` is the encoder fitted on 'district', which is the same
# state the name had when a single shared encoder was used.


Eğitim ve Test Kümelerine Ayırma

In [6]:
from sklearn.model_selection import train_test_split

# Label vector and feature matrix (every column except the label).
y = df['vehicle_type']
X = df.drop(columns=['vehicle_type'])

# Hold out 30% of the rows for testing. `stratify=y` keeps the class
# proportions equal in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)


KNN Modeli Kurulumu, Eğitimi ve Değerlendirilmesi

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Two-stage pipeline: standardise the features, then classify with a
# 7-nearest-neighbours model.
knn_steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=7)),
]
knn_model = Pipeline(knn_steps)

# Fit on the training split and predict the held-out split
# (Pipeline.fit returns the pipeline itself, so the calls can be chained).
y_pred_knn = knn_model.fit(X_train, y_train).predict(X_test)

# Headline scores: overall accuracy and the unweighted mean of per-class F1.
knn_acc = accuracy_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn, average='macro')

print("KNN Accuracy:", knn_acc)
print("KNN Macro F1:", knn_f1)

# Per-class detail: confusion matrix followed by the classification report.
print("\nConfusion Matrix (KNN):", confusion_matrix(y_test, y_pred_knn), sep="\n")
print("\nClassification Report (KNN):", classification_report(y_test, y_pred_knn), sep="\n")


KNN Accuracy: 0.9390003672665354
KNN Macro F1: 0.5979163871471697

Confusion Matrix (KNN):
[[54495   111   289]
 [ 1526   289   176]
 [ 1448   104  1464]]

Classification Report (KNN):
              precision    recall  f1-score   support

           1       0.95      0.99      0.97     54895
           2       0.57      0.15      0.23      1991
           3       0.76      0.49      0.59      3016

    accuracy                           0.94     59902
   macro avg       0.76      0.54      0.60     59902
weighted avg       0.93      0.94      0.93     59902



Gradient Boosting Modeli Eğitimi ve Değerlendirilmesi

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees: 300 shallow trees (depth 3) with a small learning
# rate. `random_state` seeds the per-tree randomness (including the feature
# permutation used to break ties between equally good splits) so repeated runs
# give the same model; 42 matches the seed used for the train/test split.
gbc = GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=300,
    max_depth=3,
    random_state=42,
)

# Train on the training split, then predict the held-out split.
gbc.fit(X_train, y_train)

y_pred_gbc = gbc.predict(X_test)

# Headline scores: overall accuracy and the unweighted mean of per-class F1.
print("\nGradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gbc))
print("Gradient Boosting Macro F1:", f1_score(y_test, y_pred_gbc, average='macro'))

# Per-class detail: confusion matrix followed by the classification report.
print("\nConfusion Matrix (GBC):")
print(confusion_matrix(y_test, y_pred_gbc))

print("\nClassification Report (GBC):")
print(classification_report(y_test, y_pred_gbc))



Gradient Boosting Accuracy: 0.9932389569630397
Gradient Boosting Macro F1: 0.969880993654101

Confusion Matrix (GBC):
[[54879     0    16]
 [  155  1830     6]
 [  221     7  2788]]

Classification Report (GBC):
              precision    recall  f1-score   support

           1       0.99      1.00      1.00     54895
           2       1.00      0.92      0.96      1991
           3       0.99      0.92      0.96      3016

    accuracy                           0.99     59902
   macro avg       0.99      0.95      0.97     59902
weighted avg       0.99      0.99      0.99     59902

