**1. Import Libraries**

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

**2. Load and Display Dataset (Initial Exploratory Data Analysis)**

In [4]:
# Read the CSV into a DataFrame
df = pd.read_csv("final_dataset_with_inflation_by_country.csv")

# Report the row count, then a preview and the summary statistics
print(f"Total number of rows in dataset: {len(df)}\n")
for heading, table in (
    ("First few rows of the dataset:", df.head()),
    ("Dataset description:", df.describe()),
):
    print(heading)
    print(table, "\n")

# df.info() writes its summary directly to stdout, so it is not wrapped in print()
print("Dataset info:")
df.info()

Total number of rows in dataset: 3671

First few rows of the dataset:
   Month  Year         Country  Departures  Arrivals  Quarter  \
0      1  2000  United Kingdom     46677.0   45630.0        1   
1      1  2000         Germany     23172.0   24251.0        1   
2      1  2000          France     11098.0   11469.0        1   
3      1  2000          Europe    118255.0  119735.0        1   
4      1  2000         Oceania    139244.0  133358.0        1   

   Years_Since_2000  Departures_Growth  Arrivals_Growth  Total Holidays  \
0                 0                0.0              0.0               2   
1                 0                0.0              0.0               2   
2                 0                0.0              0.0               2   
3                 0                0.0              0.0               2   
4                 0                0.0              0.0               2   

   Holiday Ratio  Inflation  
0       0.064516        0.8  
1       0.064516        1.4 

**3. Define Features and Target Variable**

In [5]:
# Target (y) is the 'Country' label; features (X) are every remaining column
y = df['Country']
X = df.drop(columns=['Country'])

**4. Train-Test Split**

In [6]:
# Hold out 30% of the rows for testing (70% for training); the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

**5. Feature Selection on the Training Set**

In [7]:
# Fit a default-parameter Decision Tree on the training split and read its feature importances
base_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
importances = base_model.feature_importances_

# Keep only the features whose importance is strictly above the threshold
threshold = 0.1
keep_mask = importances > threshold
selected_features = X.columns[keep_mask]
print(f"\nSelected features (importance > {threshold}): {selected_features.tolist()}")

# Restrict both the training and the test split to the selected columns
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]


Selected features (importance > 0.1): ['Departures', 'Arrivals', 'Years_Since_2000', 'Inflation']


**6. Train and Evaluate the Untuned Model**

In [8]:
# Initialize the Decision Tree Classifier with default hyperparameters (seeded for repeatability)
untuned_model = DecisionTreeClassifier(random_state=42)

# Train the untuned model on the selected features only
untuned_model.fit(X_train_selected, y_train)

# Predict the test set once and reuse the predictions for every test-set metric
# (previously the same predictions were recomputed for each metric)
y_pred_untuned = untuned_model.predict(X_test_selected)

# Evaluate the model
train_accuracy_untuned = untuned_model.score(X_train_selected, y_train)
test_accuracy_untuned = accuracy_score(y_test, y_pred_untuned)
# Cross-validation runs on the training split only, so the test set stays unseen
cv_scores_untuned = cross_val_score(estimator=untuned_model, X=X_train_selected, y=y_train, cv=10)
cm_untuned = confusion_matrix(y_test, y_pred_untuned)
classification_report_untuned = classification_report(y_test, y_pred_untuned)

# Display results
print("\nUntuned Model Evaluation:")
print(f"Training Accuracy: {train_accuracy_untuned:.4f}")
print(f"Testing Accuracy: {test_accuracy_untuned:.4f}")
print(f"10-Fold CV Accuracy: Mean = {cv_scores_untuned.mean():.4f}, Std = {cv_scores_untuned.std():.4f}")
print(f"\nConfusion Matrix:\n{cm_untuned}")
print(f"\nClassification Report:\n{classification_report_untuned}")


Untuned Model Evaluation:
Training Accuracy: 1.0000
Testing Accuracy: 0.7441
10-Fold CV Accuracy: Mean = 0.7493, Std = 0.0265

Confusion Matrix:
[[36  0  0  2  0  0  5  0  0  0  0  5  0 13  0  3  0  0]
 [ 0 52  0  0  0  0  0  0  5  4  0  0  0  0  0  0  0  0]
 [ 0  0 49  0  0  0  0  0  4  7  0  0  0  0  0  0  2  0]
 [ 1  0  0 41  0 15  6  2  0  0  0  0  3  4  0  1  0  1]
 [ 2  0  0  0 49  0  6  1  0  0  1  2  0  2  0  0  0  0]
 [ 0  0  0  7  0 45  0  0  1  0  0  0  2  0  0  0  0  0]
 [ 2  0  0  5  3  2 28  2  0  1  0  2  0 10  0  1  0  0]
 [ 0  0  0  1  0  0  4 42  0  0  0  2  0  0  0  2  0  0]
 [ 0 10  6  0  0  1  0  0 47  2  0  0  1  0  0  0  1  0]
 [ 0  3  7  0  0  0  0  0  7 38  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 57  1  0  0  2  0  0  0]
 [ 7  0  0  0  3  0  1  2  0  0  0 33  0 11  0  2  0  0]
 [ 0  0  0  2  0  3  0  0  0  2  0  0 47  0  0  0  0  4]
 [ 6  0  0  4  0  0  8  1  0  0  0  6  0 33  0  3  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  0  0 57  0  0  0]

**7. Hyperparameter Tuning (Using Cross-Validation on the Training Set)**

In [9]:
# Candidate hyperparameter values explored by the grid search
param_grid = dict(
    max_depth=[6, 8, 10],
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    min_weight_fraction_leaf=[0.0, 0.1, 0.2],
)

# Build the 5-fold grid search around the untuned estimator and run it on the training split
grid_search = GridSearchCV(
    estimator=untuned_model,
    param_grid=param_grid,
    cv=5,
).fit(X_train_selected, y_train)

# Report the winning parameter combination and its mean cross-validation score
print("\nGridSearchCV Results:")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_:.4f}")


GridSearchCV Results:
Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_weight_fraction_leaf': 0.0, 'splitter': 'best'}
Best Cross-Validation Score: 0.7045


**8. Train and Evaluate the Tuned Model**

In [10]:
# Retrieve the best estimator. GridSearchCV refits it on the whole training set
# with the best parameters (refit=True is the default), so it is already trained
# and a second call to fit() would only repeat that work.
tuned_model = grid_search.best_estimator_

# Predict the test set once and reuse the predictions for every test-set metric
y_pred_tuned = tuned_model.predict(X_test_selected)

# Evaluate the tuned model
train_accuracy_tuned = tuned_model.score(X_train_selected, y_train)
test_accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
# Cross-validation runs on the training split only, so the test set stays unseen
cv_scores_tuned = cross_val_score(estimator=tuned_model, X=X_train_selected, y=y_train, cv=10)
cm_tuned = confusion_matrix(y_test, y_pred_tuned)
classification_report_tuned = classification_report(y_test, y_pred_tuned)

# Display results
print("\nTuned Model Evaluation:")
print(f"Training Accuracy: {train_accuracy_tuned:.4f}")
print(f"Testing Accuracy: {test_accuracy_tuned:.4f}")
print(f"10-Fold CV Accuracy: Mean = {cv_scores_tuned.mean():.4f}, Std = {cv_scores_tuned.std():.4f}")
print(f"\nConfusion Matrix:\n{cm_tuned}")
print(f"\nClassification Report:\n{classification_report_tuned}")


Tuned Model Evaluation:
Training Accuracy: 0.8571
Testing Accuracy: 0.7305
10-Fold CV Accuracy: Mean = 0.7049, Std = 0.0248

Confusion Matrix:
[[24  0  0  4  1  0  3  1  0  0  0 11  0 18  0  2  0  0]
 [ 0 59  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0]
 [ 0  0 55  0  0  0  0  0  1  6  0  0  0  0  0  0  0  0]
 [ 2  0  0 31  0 14 10  2  0  0  0  0  9  4  0  1  1  0]
 [ 1  0  0  0 54  0  2  1  0  0  0  3  0  1  0  1  0  0]
 [ 0  0  0 13  0 41  0  0  0  0  0  0  0  0  0  0  0  1]
 [ 2  0  0  3  2  1 25  5  0  2  0  1  0 13  0  2  0  0]
 [ 1  0  0  1  3  0  3 38  0  0  0  3  0  1  0  1  0  0]
 [ 0 11  2  0  0  0  0  0 47  6  0  0  2  0  0  0  0  0]
 [ 0  2  7  0  0  0  0  0  9 36  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  0 58  1  0  0  1  0  0  0]
 [ 6  0  0  0  2  0  1  2  0  0  0 44  0  3  0  1  0  0]
 [ 3  0  0  2  0  2  4  0  0  0  0  0 43  0  0  0  0  4]
 [10  0  0  3  1  0  6  1  0  0  0  5  1 35  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 58  0  0  0]
 