In [None]:
import pandas as pd

In [None]:
# Load the destinations table from CSV and preview its first rows.
df = pd.read_csv("../data/destinations.csv")
df.head()

In [None]:
# Report the table's dimensions: one row per destination, one column per attribute.
n_destinations, n_attributes = df.shape
print("Number of destinations: ", n_destinations)
print("Number of attributes: ", n_attributes)


In [None]:
# Keep the city names aside, then remove the naming columns (city, country,
# continent) because they are not used as training features.
name_columns = ["city", "country", "continent"]
city_names = df["city"].copy()
destinations = df.drop(columns=name_columns)
destinations

In [None]:
# List the distinct climate values to see which categories need encoding.
destinations["climate"].unique()

In [None]:
# One-hot encode `climate`: one indicator column per distinct value, named
# "climate_<value>".
climate_dummies = pd.get_dummies(destinations["climate"], prefix="climate")

# Swap the original column for its indicator columns (appended on the right).
without_climate = destinations.drop(columns=["climate"])
destinations = pd.concat([without_climate, climate_dummies], axis=1)

In [None]:
# Preview the table after the climate column was replaced by indicator columns.
destinations.head()

In [None]:
# Inspect the distinct safety ratings before rescaling them.
destinations["safety_rating"].unique()

In [None]:
# Min-Max normalize safety_rating into the [0, 1] range.
min_s = destinations["safety_rating"].min()
max_s = destinations["safety_rating"].max()
span_s = max_s - min_s

# A constant column has zero span; dividing by it would turn every value into
# NaN (0/0). Divide by 1 instead so such a column becomes all zeros while
# missing values stay missing.
destinations["safety_rating"] = (destinations["safety_rating"] - min_s) / (span_s if span_s != 0 else 1)
destinations["safety_rating"]

In [None]:
# Inspect the distinct popularity values before rescaling them.
destinations["popularity"].unique()

In [None]:
# Min-Max normalize popularity into the [0, 1] range.
min_s = destinations["popularity"].min()
max_s = destinations["popularity"].max()
span_s = max_s - min_s

# A constant column has zero span; dividing by it would turn every value into
# NaN (0/0). Divide by 1 instead so such a column becomes all zeros while
# missing values stay missing.
destinations["popularity"] = (destinations["popularity"] - min_s) / (span_s if span_s != 0 else 1)
destinations["popularity"]

# Training the Neural Network with training data

In [None]:
# Load the training table from CSV and show its (rows, columns) shape.
df_training = pd.read_csv("../data/training_data.csv")
df_training.shape

In [None]:
# Preview the first rows of the training table.
df_training.head()

In [None]:
# Total number of missing cells across all columns: the first sum() counts
# per column, the second adds those counts up. A result of 0 means no NaNs.
df_training.isnull().sum().sum()

In [None]:
# Separate the model inputs (X) from the target (y).
# Identifier and name columns are excluded from the features, as is
# compatibility_score (presumably because the label is derived from it — confirm).
drop_columns = [
    'user_id',
    'destination_id',
    'dest_city',
    'dest_country',
    'dest_continent',
    'compatibility_score',
]

y = df_training['label']
X = df_training.drop(columns=[*drop_columns, 'label'])

In [None]:
# Number of distinct values per column, smallest first, to spot the
# low-cardinality (category-like) columns.
unique_counts = (
    df_training.nunique()
    .rename_axis('Column_Name')
    .reset_index(name='Num_Unique')
    .sort_values(by=['Num_Unique'])
)
unique_counts

In [None]:
# Cast every column with fewer than 29 distinct values to the pandas
# `category` dtype.
# NOTE(review): X and y were already derived from df_training in an earlier
# cell, so this cast does not appear to reach the features fed to the model —
# confirm whether it is still needed.
low_cardinality_cols = [
    name for name in df_training.columns if df_training[name].nunique() < 29
]
for col in low_cardinality_cols:
    df_training[col] = df_training[col].astype('category')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Text (object-dtype) feature columns are the ones that need integer codes.
categorical_cols = list(X.select_dtypes(include=['object']).columns)

print(f"Categorical columns: {categorical_cols}")

# One encoder per column, kept in a dict so the same value-to-code mapping
# can be reused later.
label_encoders = {name: LabelEncoder() for name in categorical_cols}

for name, encoder in label_encoders.items():
    X[name] = encoder.fit_transform(X[name])

In [None]:
from sklearn.preprocessing import StandardScaler

# Continuous features that will be standardised.
numerical_cols = [
    'user_safety_importance',
    'user_popularity_pref',
    'dest_cost',
    'dest_popularity',
    'dest_safety',
]

# Summary statistics of the raw values, for comparison with the scaled ones.
print("Numerical columns before scaling:")
pre_scaling_summary = X[numerical_cols].describe()
print(pre_scaling_summary)

# Only instantiate the scaler here; it is fitted after the data is split.
scaler = StandardScaler()

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage stratified split: 70% for training, then the held-out 30% is
# halved into validation and test (15% of the full data each).
split_seed = 42

X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y,
    test_size=0.30,
    stratify=y,
    random_state=split_seed,
)

X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp,
    test_size=0.50,
    stratify=y_tmp,
    random_state=split_seed,
)

In [None]:
# Show the training features as a raw NumPy array.
X_train.values

In [None]:
# Show the validation features as a raw NumPy array.
X_val.values

In [None]:
# Positions of the columns to standardise inside the feature matrix.
num_cols_indices = [X_train.columns.get_loc(col) for col in numerical_cols]

# Materialise each split as an independent float64 array.
# `.values` is unsafe here for three reasons:
#   * if every column is an integer the array is integer-typed, and the scaled
#     floats assigned below are silently truncated;
#   * it may return a view of the DataFrame, so in-place scaling would also
#     mutate X_train / X_val / X_test;
#   * under pandas Copy-on-Write that view is read-only and assignment raises.
# An explicit dtype also fails fast if a non-numeric column is still present.
X_train_array = X_train.to_numpy(dtype="float64", copy=True)
X_val_array = X_val.to_numpy(dtype="float64", copy=True)
X_test_array = X_test.to_numpy(dtype="float64", copy=True)

# Fit scaling only on training data so no validation/test statistics leak in
scaler.fit(X_train_array[:, num_cols_indices])

# Apply the training-set scaling to all sets
for split_array in (X_train_array, X_val_array, X_test_array):
    split_array[:, num_cols_indices] = scaler.transform(split_array[:, num_cols_indices])


In [None]:
# Show the first five training rows of the standardised columns.
print("Numerical columns after scaling:")
scaled_preview = pd.DataFrame(
    X_train_array[:5, num_cols_indices],
    columns=numerical_cols,
)
print(scaled_preview)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def build_model(input_dim):
    """Build the (uncompiled) feed-forward binary classifier.

    The network is three ReLU hidden layers of 128, 64 and 32 units. The first
    two are each followed by batch normalisation and 30% dropout, the third by
    20% dropout only. A single sigmoid unit produces the output.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A `keras.Sequential` model.
    """
    # (units, use batch normalisation, dropout rate) for each hidden layer.
    hidden_spec = [
        (128, True, 0.3),
        (64, True, 0.3),
        (32, False, 0.2),
    ]

    stack = [layers.Input(shape=(input_dim,))]
    for units, use_batch_norm, drop_rate in hidden_spec:
        stack.append(layers.Dense(units, activation='relu'))
        if use_batch_norm:
            stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(drop_rate))

    stack.append(layers.Dense(1, activation='sigmoid'))

    return keras.Sequential(stack)

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation,
# dropout masks and batch shuffling are repeatable across
# "Restart Kernel -> Run All". The data split is already seeded with 42.
# Some GPU kernels may still be non-deterministic.
keras.utils.set_random_seed(42)

# The input width is the number of feature columns in the training matrix.
model = build_model(X_train_array.shape[1])

model.summary()

In [None]:
# Metrics reported next to the loss. Their order fixes the order of the
# values returned by model.evaluate().
tracked_metrics = [
    'accuracy',
    keras.metrics.Precision(),
    keras.metrics.Recall(),
]

# Adam with binary cross-entropy, matching the single sigmoid output unit.
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=tracked_metrics,
)

In [None]:
# All three callbacks watch the validation loss.

# Stop after 15 epochs without improvement and roll back to the best weights.
stop_early = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1,
)

# Halve the learning rate after 5 stagnant epochs, never going below 1e-6.
shrink_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-6,
    verbose=1,
)

# Save the model to disk whenever the validation loss reaches a new best.
save_best = keras.callbacks.ModelCheckpoint(
    'best_model.keras',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

callbacks = [stop_early, shrink_lr, save_best]

In [None]:
print("Starting training...\n")

# Up to 100 epochs in mini-batches of 64; the callbacks may stop training
# earlier. The validation split is used only for monitoring.
fit_options = dict(
    epochs=100,
    batch_size=64,
    callbacks=callbacks,
    verbose=1,
)
validation_set = (X_val_array, y_val.values)

history = model.fit(
    X_train_array,
    y_train.values,
    validation_data=validation_set,
    **fit_options,
)

print("\n Training complete")

# Visualizing the fitness function

In [None]:
import matplotlib.pyplot as plt

# Side-by-side training curves: loss on the left, accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# (history key, y-axis label, panel title, training legend, validation legend)
panels = [
    ('loss', 'Binary Crossentropy Loss', 'Loss Over Time (Fitness Function)',
     'Training Loss', 'Validation Loss'),
    ('accuracy', 'Accuracy', 'Accuracy Over Time',
     'Training Accuracy', 'Validation Accuracy'),
]

for ax, (metric, y_label, title, train_label, val_label) in zip(axes, panels):
    # Keras stores the validation series under the same key prefixed "val_".
    ax.plot(history.history[metric], label=train_label)
    ax.plot(history.history[f'val_{metric}'], label=val_label)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Evaluate on the held-out test set
test_results = model.evaluate(X_test_array, y_test.values, verbose=0)

# Values come back in compile() order: loss, accuracy, precision, recall.
test_loss, test_accuracy, test_precision, test_recall = test_results[:4]

banner = "=" * 60
print(banner)
print("TEST SET PERFORMANCE")
print(banner)
print(f"Loss: {test_loss:.4f}")
print(f"Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(banner)