*   **Training Accuracy:** Measures how well the model performs on the training data, indicating how well it has learned from the training set.
*   **Testing Accuracy:** Evaluates how well the model generalizes to new, unseen data from the test set.
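*   **10-Fold Cross-Validation Accuracy (mean and standard deviation):** Assesses the model’s performance across ten different train/validation splits of the training data, giving a more robust estimate that depends less on any single split; the standard deviation shows how much the accuracy varies between folds.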
*   **Confusion Matrix:** Provides a detailed breakdown of the model’s predictions, showing the counts of true positives, true negatives, false positives, and false negatives.
*   **Classification Report:** Gives detailed metrics like precision, recall, and F1-score for each class, helping to evaluate the model’s performance on individual classes, especially in imbalanced datasets.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from scipy.stats import uniform, randint

# Read the raw dataset from disk
df = pd.read_csv("dataset.csv")

# Region-related columns are excluded from the analysis. errors='ignore'
# makes the drop a no-op for any listed column that is not in the file.
columns_to_drop = [
    "Region_East Asia",
    "Region_Europe",
    "Region_Middle East",
    "Region_North America",
    "Region_Oceania",
    "Region_South Asia",
    "Region_South East Asia",
]
df = df.drop(columns=columns_to_drop, errors='ignore')

# Report the dataset size and preview the first rows
row_count = len(df)
preview = df.head()
print(f"Total number of rows in dataset: {row_count}")
print(f"\n{preview}")

# Separate the target column ('Country') from the remaining feature columns
y = df['Country']
X = df.drop(columns=['Country'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Initialize Decision Tree Classifier.
# random_state is fixed (matching the seed used for the train/test split and
# the randomized search) so that the reported metrics are reproducible from
# run to run; without it the tree can differ between runs.
model = DecisionTreeClassifier(random_state=42)

# Evaluate the performance of the un-tuned model (before tuning)
print("\nEvaluating the Untuned Model...")

# Train the untuned model on the full training set
model.fit(X_train, y_train)

# Predict on the held-out test set once and reuse the predictions for every
# test-set metric below instead of re-running predict() for each one.
y_pred_untuned = model.predict(X_test)

# Training and testing accuracy for untuned model
train_accuracy_untuned = model.score(X_train, y_train)
test_accuracy_untuned = accuracy_score(y_test, y_pred_untuned)

# 10-Fold Cross-Validation for untuned model (run on the training set only,
# so the test set stays unseen)
cv_scores_untuned = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)

# Confusion Matrix for untuned model
cm_untuned = confusion_matrix(y_test, y_pred_untuned)

# Classification Report for untuned model
classification_report_untuned = classification_report(y_test, y_pred_untuned)

# Display results for untuned model
print(f"\nTraining Accuracy (Untuned): {train_accuracy_untuned:.4f}")
print(f"Testing Accuracy (Untuned): {test_accuracy_untuned:.4f}")
print(f"10-Fold Cross Validation Accuracy (Untuned):\n"
      f"Mean CV Accuracy: {cv_scores_untuned.mean():.4f}\n"
      f"Standard Deviation: {cv_scores_untuned.std():.4f}")
print(f"\nConfusion Matrix (Untuned):\n{cm_untuned}")
print(f"\nClassification Report (Untuned):\n{classification_report_untuned}")

# Candidate values tried exhaustively by the grid search
# (3 depths x 2 criteria x 2 splitters x 3 leaf fractions = 36 combinations)
param_grid = {
    'max_depth': [6, 8, 10],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2],
}

print("\nPerforming GridSearchCV for Decision Tree Classifier...")

# Build the 5-fold grid search around the decision tree and fit it on the
# training data; fit() returns the fitted search object itself.
grid = GridSearchCV(model, param_grid, cv=5).fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validation score
grid_best_params = grid.best_params_
grid_best_score = grid.best_score_
print(f"\nGridSearchCV Best Parameters: {grid_best_params}")
print(f"Best Cross-validation Score: {grid_best_score:.4f}")

# Sampling distributions for the randomized search:
#   - max_depth: integers drawn from 5..14 (randint's upper bound is exclusive)
#   - min_weight_fraction_leaf: floats drawn uniformly from [0.0, 0.3]
#     (scipy's uniform takes loc and scale, i.e. the interval [loc, loc + scale])
param_dist = {
    'max_depth': randint(low=5, high=15),
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_weight_fraction_leaf': uniform(loc=0.0, scale=0.3),
}

print("\nPerforming RandomizedSearchCV for Decision Tree Classifier...")

# Sample 20 parameter settings, score each with 5-fold cross-validation, and
# fit on the training data; the seed fixes which settings are sampled.
random_search = RandomizedSearchCV(
    model,
    param_dist,
    n_iter=20,
    cv=5,
    random_state=42,
).fit(X_train, y_train)

# Report the best sampled parameter setting and its mean cross-validation score
random_best_params = random_search.best_params_
random_best_score = random_search.best_score_
print(f"\nRandomizedSearchCV Best Parameters: {random_best_params}")
print(f"Best Cross-validation Score: {random_best_score:.4f}")

# Select the best model: pick whichever search achieved the higher mean
# cross-validation score rather than hard-coding one of them. Both searches
# use 5-fold CV on the same training data, so their scores are comparable.
# Ties favour GridSearchCV.
best_search = grid if grid.best_score_ >= random_search.best_score_ else random_search
tuned_model = best_search.best_estimator_

# No explicit fit() is needed here: with the default refit=True the search has
# already refit best_estimator_ on the full training set using the best
# parameters. Fitting again would only repeat that work (and, for an unseeded
# estimator, could yield a different tree than the one the search selected).

print("\nEvaluating the Final Tuned Model...")

# Predict on the held-out test set once and reuse the predictions for every
# test-set metric below.
y_pred_tuned = tuned_model.predict(X_test)

# Training and testing accuracy for tuned model
train_accuracy_tuned = tuned_model.score(X_train, y_train)
test_accuracy_tuned = accuracy_score(y_test, y_pred_tuned)

# 10-Fold Cross-Validation for tuned model (training set only)
cv_scores_tuned = cross_val_score(estimator=tuned_model, X=X_train, y=y_train, cv=10)

# Confusion Matrix for tuned model
cm_tuned = confusion_matrix(y_test, y_pred_tuned)

# Classification Report for tuned model
classification_report_tuned = classification_report(y_test, y_pred_tuned)

# Display results for tuned model
print(f"\nTraining Accuracy (Tuned): {train_accuracy_tuned:.4f}")
print(f"Testing Accuracy (Tuned): {test_accuracy_tuned:.4f}")
print(f"10-Fold Cross Validation Accuracy (Tuned):\n"
      f"Mean CV Accuracy: {cv_scores_tuned.mean():.4f}\n"
      f"Standard Deviation: {cv_scores_tuned.std():.4f}")
print(f"\nConfusion Matrix (Tuned):\n{cm_tuned}")
print(f"\nClassification Report (Tuned):\n{classification_report_tuned}")

Total number of rows in dataset: 3672

   Month  Year         Country  Departures  Arrivals  Quarter  \
0      1  2000   North America     29012.0   26225.0        1   
1      1  2000  United Kingdom     46677.0   45630.0        1   
2      1  2000         Germany     23172.0   24251.0        1   
3      1  2000          France     11098.0   11469.0        1   
4      1  2000          Europe    118255.0  119735.0        1   

   Years_Since_2000  Departures_Growth  Arrivals_Growth  Is_Holiday_Month  
0                 0                0.0              0.0                 1  
1                 0                0.0              0.0                 1  
2                 0                0.0              0.0                 1  
3                 0                0.0              0.0                 1  
4                 0                0.0              0.0                 1  

Evaluating the Untuned Model...

Training Accuracy (Untuned): 1.0000
Testing Accuracy (Untuned): 0.5535
10-Fold C