In [1]:
# 1. Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 2. Read the training CSV and print a quick exploratory overview of it
df_train = pd.read_csv("train_data_with_traffic_class.csv")
print(f"Total number of rows in training dataset: {len(df_train)}\n")

# Each (heading, frame) pair is printed the same way: the heading on its own line,
# then the frame followed by a blank line.
for heading, frame in (
    ("First few rows of the training dataset:", df_train.head()),
    ("Training dataset description:", df_train.describe()),
):
    print(heading)
    print(frame, "\n")

# DataFrame.info() writes its report to stdout by itself, so it is not wrapped in print().
print("Training dataset info:")
df_train.info()

# 3. Define Features and Target Variable for Training with Consistent Preprocessing
# Columns excluded from the model inputs. This is the same list that step 9 drops from
# the test data, so the training and test feature sets match:
#   - Traffic_Class is the target.
#   - Total_Traffic, Departures and Arrivals: in the displayed sample rows Total_Traffic
#     equals Departures + Arrivals and the class tracks Total_Traffic, so they look like
#     direct proxies for the target (NOTE(review): confirm how Traffic_Class was derived).
#   - Month_sin and Month_cos are also removed from the test features in step 9.
# Training on any of these would make feature selection pick columns that the test-time
# reindex can only fill with 0, which invalidates the final evaluation.
NON_FEATURE_COLUMNS = [
    "Total_Traffic",
    "Departures",
    "Arrivals",
    "Traffic_Class",
    "Month_sin",
    "Month_cos",
]
X_train_raw = df_train.drop(columns=NON_FEATURE_COLUMNS)
y_train_full = df_train['Traffic_Class']

# One-hot encode categorical variables for training data
# (drop_first=True leaves one Country level out as the implicit baseline).
X_train_full = pd.get_dummies(X_train_raw, columns=["Country"], drop_first=True)

# 4. Hold out 30% of the encoded training data for validation (70%/30% split)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.3,     # share of rows that goes to the validation split
    random_state=42,   # fixed seed so the split is the same on every run
)

# 5. Feature Selection on the Training Set
# A default Decision Tree is fitted on the training split only (the validation split is
# not used here) and its feature importances decide which columns are kept.
base_model = DecisionTreeClassifier(random_state=42)
base_model.fit(X_train, y_train)
importances = base_model.feature_importances_

# Keep the features whose importance is strictly greater than the threshold.
threshold = 0.1
selected_features = X_train.columns[importances > threshold]
print(f"\nSelected features (importance > {threshold}): {selected_features.tolist()}")

# Stop with an actionable message when the threshold filters out every feature;
# otherwise the next model fit would fail on a frame with zero columns.
if len(selected_features) == 0:
    raise ValueError(
        f"No feature has importance > {threshold}; lower the threshold "
        f"(highest importance is {importances.max():.4f})."
    )

# Filter training and validation sets to keep only the selected features
X_train_selected = X_train[selected_features]
X_val_selected = X_val[selected_features]

# 6. Fit a default-parameter Decision Tree on the selected features and report its scores
untuned_model = DecisionTreeClassifier(random_state=42)
untuned_model.fit(X_train_selected, y_train)

# Predict the validation split once; every validation metric below reuses this array.
val_pred_untuned = untuned_model.predict(X_val_selected)

train_accuracy_untuned = untuned_model.score(X_train_selected, y_train)
val_accuracy_untuned = accuracy_score(y_val, val_pred_untuned)
cm_untuned = confusion_matrix(y_val, val_pred_untuned)
classification_report_untuned = classification_report(y_val, val_pred_untuned)

# 10-fold cross-validation on the training split.
cv_scores_untuned = cross_val_score(estimator=untuned_model, X=X_train_selected, y=y_train, cv=10)

# One print call with a newline separator emits the same lines as separate prints.
print(
    "\nUntuned Model Evaluation:",
    f"Training Accuracy: {train_accuracy_untuned:.4f}",
    f"Validation Accuracy: {val_accuracy_untuned:.4f}",
    f"10-Fold CV Accuracy: Mean = {cv_scores_untuned.mean():.4f}, Std = {cv_scores_untuned.std():.4f}",
    f"\nConfusion Matrix:\n{cm_untuned}",
    f"\nClassification Report:\n{classification_report_untuned}",
    sep="\n",
)

# 7. Hyperparameter Tuning (exhaustive grid search with 5-fold CV on the training split)
# 3 depths x 2 criteria x 2 splitters x 3 leaf-weight fractions = 36 candidate settings.
param_grid = dict(
    max_depth=[6, 8, 10],
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    min_weight_fraction_leaf=[0.0, 0.1, 0.2],
)

# The search starts from the untuned estimator's configuration (random_state=42).
grid_search = GridSearchCV(estimator=untuned_model, param_grid=param_grid, cv=5)
grid_search.fit(X_train_selected, y_train)

print(
    "\nGridSearchCV Results:",
    f"Best Parameters: {grid_search.best_params_}",
    f"Best Cross-Validation Score: {grid_search.best_score_:.4f}",
    sep="\n",
)

# 8. Evaluate the Tuned Model on the Training and Validation Sets
# best_estimator_ only exists because GridSearchCV (default refit=True) has already
# refitted the winning configuration on the full X_train_selected / y_train, so it is
# used directly; fitting it again on the same data would only repeat that work.
tuned_model = grid_search.best_estimator_

# Predict the validation split once and reuse it for every validation metric.
val_pred_tuned = tuned_model.predict(X_val_selected)

train_accuracy_tuned = tuned_model.score(X_train_selected, y_train)
val_accuracy_tuned = accuracy_score(y_val, val_pred_tuned)
cv_scores_tuned = cross_val_score(estimator=tuned_model, X=X_train_selected, y=y_train, cv=10)
cm_tuned = confusion_matrix(y_val, val_pred_tuned)
classification_report_tuned = classification_report(y_val, val_pred_tuned)

print("\nTuned Model Evaluation:")
print(f"Training Accuracy: {train_accuracy_tuned:.4f}")
print(f"Validation Accuracy: {val_accuracy_tuned:.4f}")
print(f"10-Fold CV Accuracy: Mean = {cv_scores_tuned.mean():.4f}, Std = {cv_scores_tuned.std():.4f}")
print(f"\nConfusion Matrix:\n{cm_tuned}")
print(f"\nClassification Report:\n{classification_report_tuned}")

# 9. Load and Preprocess the New Testing Dataset (2023-2025)
df_test = pd.read_csv("test_data_with_traffic_class.csv")

# Define test features and target variable as specified
# Drop columns that are not needed for prediction
X_test_raw = df_test.drop(columns=["Total_Traffic", "Departures", "Arrivals", "Traffic_Class", "Month_sin", "Month_cos"])
y_test = df_test["Traffic_Class"]

# One-hot encode Country WITHOUT drop_first. With drop_first=True the level removed is
# the first one present in *this* file; if the test data lacks the training baseline
# country, a different level would be dropped here and then wrongly zero-filled by the
# alignment below. Keeping every level and letting reindex discard the training baseline
# (and any country unseen in training) gives the training encoding in all cases.
X_test_encoded = pd.get_dummies(X_test_raw, columns=["Country"])

# Surface selected features that the test data cannot provide. reindex would fill them
# with 0 without any notice, which is harmless for a Country level that simply does not
# occur in the test data but makes the evaluation meaningless for any other feature.
missing_selected = [col for col in selected_features if col not in X_test_encoded.columns]
if missing_selected:
    print(f"WARNING: selected features absent from the test data, filled with 0: {missing_selected}")

# Align test features with the training set features
# (Missing columns will be filled with 0; extra columns are dropped)
X_test_aligned = X_test_encoded.reindex(columns=X_train_full.columns, fill_value=0)

# Apply the same feature selection from training
X_test_selected = X_test_aligned[selected_features]

# Score the tuned model on the held-out test dataset.
# The test predictions are computed once and shared by all three metrics.
test_pred_final = tuned_model.predict(X_test_selected)

test_accuracy_final = accuracy_score(y_test, test_pred_final)
cm_final = confusion_matrix(y_test, test_pred_final)
classification_report_final = classification_report(y_test, test_pred_final)

# One print call with a newline separator emits the same lines as separate prints.
print(
    "\nFinal Test Dataset Evaluation (2023-2025):",
    f"Testing Accuracy: {test_accuracy_final:.4f}",
    f"\nConfusion Matrix:\n{cm_final}",
    f"\nClassification Report:\n{classification_report_final}",
    sep="\n",
)

Total number of rows in training dataset: 4965

First few rows of the training dataset:
   Month  Year         Country  Departures  Arrivals  Total Holidays  \
0      1  2000  United Kingdom     46677.0   45630.0        1.015023   
1      1  2000         Germany     23172.0   24251.0        1.015023   
2      1  2000          France     11098.0   11469.0        1.015023   
3      1  2000          Europe    118255.0  119735.0        1.015023   
4      1  2000         Oceania    139244.0  133358.0        1.015023   

   Inflation  Month_sin  Month_cos  Total_Traffic  Traffic_Class  
0  -0.600846        0.5   0.866025        92307.0              0  
1  -0.407217        0.5   0.866025        47423.0              0  
2  -0.278131        0.5   0.866025        22567.0              0  
3  -0.278131        0.5   0.866025       237990.0              1  
4  -0.278131        0.5   0.866025       272602.0              1   

Training dataset description:
             Month         Year    Departures

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
