In [None]:
import pandas as pd

In [None]:
# Load the travel-preferences CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/travel_preferences.csv")
# Preview the first rows as the cell output.
df.head()

In [None]:
# List every column name so the unused ones can be identified and dropped.
print(df.columns)

In [None]:
# Drop the ten Rating_* / Rec_* columns and the remaining columns this notebook never reads.
unused_cols = (
    [f'Rating_{i}' for i in range(10)]
    + [f'Rec_{i}' for i in range(10)]
    + ['where_to_go', 'where_to_go_exactly', 'Model', 'Retrieval', 'DynaMatch']
)
df = df.drop(columns=unused_cols)

In [None]:
# Preview the frame after the unused columns were dropped.
df.head()

In [None]:
# Check for missing values: per-column count and percentage, showing only
# columns that actually contain missing cells, worst first.
print("Missing values po kolonama:")
missing = df.isna().sum()
missing_pct = (missing / len(df) * 100).round(2)
missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})
has_missing = missing_df['Missing Count'] > 0
print(missing_df.loc[has_missing].sort_values('Missing Count', ascending=False))

In [None]:
# Remove every row that has a missing value in any remaining column.
# Note: `df` is rebound in place, so this cell discards the unfiltered frame.
df = df.dropna()

In [None]:
# Per-column count of missing values after the dropna above; every count should be 0.
df.isnull().sum()

In [None]:
# Number of rows left after removing rows with missing values.
df.shape[0]
# We can see that we lost rows, but we still have plenty of instances

In [None]:
import ast

def to_list(x):
    """Convert one cell of a list-like column into a real Python list.

    Parameters
    ----------
    x : list, str, or missing value
        Either an already-parsed list, a string holding a Python literal
        (presumably text such as "['Paris', 'Rome']" read from the CSV —
        confirm against the data), an empty string, or NaN/None.

    Returns
    -------
    list
        `x` itself when it is already a list, `[]` for missing/empty input,
        otherwise the result of `ast.literal_eval(x)`.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from `ast.literal_eval` when the string is not a valid literal.
    """
    # Check for an existing list FIRST: pd.isna() on a list returns an array, and
    # the truth value of an array with 2+ elements is ambiguous (ValueError).
    # With the old order, re-running the parsing cell crashed on parsed data.
    if isinstance(x, list):
        return x
    if pd.isna(x) or x == '' or x == '[]':
        return []
    # literal_eval only evaluates Python literals, never arbitrary expressions.
    return ast.literal_eval(x)

# Parse every list-like column from its stored representation into real Python lists.
for column_name in ('yes_swipes', 'no_swipes', 'maybe_swipes',
                    'age_ranges', 'budget_ranges',
                    'interests', 'sceneries'):
    df[column_name] = df[column_name].map(to_list)


In [None]:
# Preview the frame after the list-like columns were parsed.
df.head()

In [None]:
# Sanity check: every cell of the parsed columns must be a real Python list.
# `list_cols` is defined here because no other cell defines it, so the original
# assertion raised NameError on a fresh kernel (Restart & Run All).
list_cols = ['yes_swipes', 'no_swipes', 'maybe_swipes',
             'age_ranges', 'budget_ranges',
             'interests', 'sceneries']

# Series.map is used instead of DataFrame.applymap, which newer pandas deprecates.
not_list_cols = [
    c for c in list_cols
    if not df[c].map(lambda cell: isinstance(cell, list)).all()
]
assert not not_list_cols, f"columns containing non-list cells: {not_list_cols}"


In [None]:
def traveller_type(age_ranges):
    """Classify a travel party by the number of entries in `age_ranges`.

    Parameters
    ----------
    age_ranges : sized collection
        The selected age ranges (presumably one entry per traveller — confirm
        against the data source).

    Returns
    -------
    str
        'solo' for zero or one entry, 'couple' for exactly two,
        'friends' for three or more.
    """
    party_size = len(age_ranges)
    if party_size > 2:
        return 'friends'
    return 'couple' if party_size == 2 else 'solo'

# Derive a categorical `traveller_type` column from the length of each row's `age_ranges` list.
df['traveller_type'] = df['age_ranges'].apply(traveller_type)


In [None]:
# Preview the frame including the new `traveller_type` column.
df.head()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode `interests`: one 0/1 indicator column per distinct value,
# named `interest_<value>` and aligned to df's index so the concat matches rows.
mlb_interests = MultiLabelBinarizer()
interest_matrix = mlb_interests.fit_transform(df['interests'])
interest_names = [f"interest_{c}" for c in mlb_interests.classes_]
interest_df = pd.DataFrame(interest_matrix, columns=interest_names, index=df.index)

df = pd.concat([df, interest_df], axis=1)


In [None]:
# The list-valued `interests` column is replaced by its `interest_*` indicator columns.
df = df.drop(columns='interests')

In [None]:
# Multi-hot encode `budget_ranges`: one 0/1 indicator column per distinct value,
# named `budget_<value>` and aligned to df's index so the concat matches rows.
mlb_budget = MultiLabelBinarizer()
budget_matrix = mlb_budget.fit_transform(df['budget_ranges'])
budget_names = [f"budget_{c}" for c in mlb_budget.classes_]
budget_df = pd.DataFrame(budget_matrix, columns=budget_names, index=df.index)

df = pd.concat([df, budget_df], axis=1)


In [None]:
# Multi-hot encode `sceneries`: one 0/1 indicator column per distinct value,
# named `scene_<value>` and aligned to df's index so the concat matches rows.
mlb_scene = MultiLabelBinarizer()
scene_matrix = mlb_scene.fit_transform(df['sceneries'])
scene_names = [f"scene_{c}" for c in mlb_scene.classes_]
scene_df = pd.DataFrame(scene_matrix, columns=scene_names, index=df.index)

df = pd.concat([df, scene_df], axis=1)


In [None]:
# Multi-hot encode `age_ranges`: one 0/1 indicator column per distinct value,
# named `age_<value>` and aligned to df's index so the concat matches rows.
mlb_age = MultiLabelBinarizer()
age_matrix = mlb_age.fit_transform(df['age_ranges'])
age_names = [f"age_{c}" for c in mlb_age.classes_]
age_df = pd.DataFrame(age_matrix, columns=age_names, index=df.index)

df = pd.concat([df, age_df], axis=1)


In [None]:
# Preview the frame with all indicator columns appended.
df.head()

In [None]:
# Number of columns after adding the indicator columns.
df.shape[1]

In [None]:
# Drop the list-valued source columns; their information now lives in the
# `scene_*`, `budget_*` and `age_*` indicator columns.
df = df.drop(columns=['sceneries', 'budget_ranges', 'age_ranges'])

In [None]:
# Show the final column set; the hard-coded feature list in the next cell must match these names.
df.columns

In [None]:
# Instead of one (user, [cities]) row per user, we want one (user, city) row per swipe
import pandas as pd

# Per-user feature columns copied onto every expanded swipe row, grouped by origin.
# NOTE(review): the budget bands skip 250-299 — confirm these names match the
# indicator columns actually produced; a missing name raises KeyError when the
# rows are expanded, and any indicator column not listed here is left out.
profile_cols = ['season', 'activity_level', 'safety_conscious', 'popularity', 'traveller_type']
interest_values = ['Adventure', 'Beach', 'Cuisine', 'Culture',
                   'History', 'Nature', 'Nightlife', 'Shopping']
budget_values = ['0-49', '50-99', '100-249', '300+']
scene_values = ['Desert', 'Jungle', 'Lake', 'Mountain',
                'Plains', 'Rural', 'Sea', 'Urban']
age_values = ['0-19', '20-39', '40-59', '60+']

feature_cols = (
    profile_cols
    + [f'interest_{v}' for v in interest_values]
    + [f'budget_{v}' for v in budget_values]
    + [f'scene_{v}' for v in scene_values]
    + [f'age_{v}' for v in age_values]
)

# Expand each user into one row per swiped city: label 1 for cities in
# `yes_swipes`, 0 for cities in `no_swipes` (`maybe_swipes` is not used).
# Each user's yes-rows are emitted before their no-rows, and the user's
# feature values are repeated on every one of their rows.
swipe_labels = (('yes_swipes', 1), ('no_swipes', 0))
rows = []

for _, user in df.iterrows():
    shared_features = user[feature_cols].values.tolist()
    for swipe_col, label in swipe_labels:
        rows.extend(
            [user['id'], city, label] + shared_features
            for city in user[swipe_col]
        )

df_users = pd.DataFrame(
    rows,
    columns=['user_id', 'destination', 'label'] + feature_cols
)



In [None]:
# Number of (user, destination) rows produced by the expansion.
df_users.shape[0]

In [None]:
# Preview the first 15 expanded rows.
df_users.head(15)

# Training the neural network

In [None]:
# Total number of missing cells across the whole expanded frame (a single scalar).
df_users.isnull().sum().sum()

In [None]:
# Features: everything except the target and the two identifier columns.
# Note that `destination` is excluded, so the model only sees user attributes.
X = df_users.drop(columns=['label', 'user_id', 'destination'])
# Target: 1 for a yes-swipe, 0 for a no-swipe.
y = df_users['label'].astype(int)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


In [None]:
# Columns to one-hot encode; every other column of X is passed through as-is.
categorical_cols = ['season', 'activity_level', 'safety_conscious',
                    'popularity', 'traveller_type']

numeric_cols = []
for column_name in X.columns:
    if column_name not in categorical_cols:
        numeric_cols.append(column_name)



In [None]:
# One-hot encode the categorical columns and pass the remaining indicator
# columns through unchanged. Categories unseen during fit encode as all zeros.
#
# sparse_threshold=0 forces a dense ndarray. With the default (0.3) the result
# switches between a scipy sparse matrix and a dense array depending on the
# density of the data, and this matrix is fed straight into the Keras model.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ],
    sparse_threshold=0
)

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GroupShuffleSplit

# Every swipe row of a user carries identical feature values, so a row-level
# split places the same user in train, validation and test, and the held-out
# metrics end up measured on users the model has already seen.
# Split by user instead: ~70% of users train, ~15% validation, ~15% test.
# GroupShuffleSplit is not stratified; the class balance of each split is
# reported further down via y_*.mean().
groups = df_users.loc[X.index, 'user_id']

train_pos, tmp_pos = next(
    GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
    .split(X, y, groups=groups)
)
X_train, y_train = X.iloc[train_pos], y.iloc[train_pos]
X_tmp, y_tmp = X.iloc[tmp_pos], y.iloc[tmp_pos]
groups_tmp = groups.iloc[tmp_pos]

val_pos, test_pos = next(
    GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
    .split(X_tmp, y_tmp, groups=groups_tmp)
)
X_val, y_val = X_tmp.iloc[val_pos], y_tmp.iloc[val_pos]
X_test, y_test = X_tmp.iloc[test_pos], y_tmp.iloc[test_pos]

# Guard: no user may appear in more than one split.
train_users = set(groups.iloc[train_pos])
val_users = set(groups_tmp.iloc[val_pos])
test_users = set(groups_tmp.iloc[test_pos])
assert train_users.isdisjoint(val_users)
assert train_users.isdisjoint(test_users)
assert val_users.isdisjoint(test_users)


In [None]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer for validation and test so no information from them leaks into the fit.
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc   = preprocessor.transform(X_val)
X_test_proc  = preprocessor.transform(X_test)


In [None]:
# Display the transformed training matrix to inspect its type and contents.
X_train_proc

In [None]:
# Shape of each transformed split, in train / validation / test order.
for split_matrix in (X_train_proc, X_val_proc, X_test_proc):
    print(split_matrix.shape)

# Share of positive labels per split; the three values should be close to each other.
y_train.mean(), y_val.mean(), y_test.mean()


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def build_model(input_dim):
    """Build the (uncompiled) feed-forward binary classifier.

    Architecture: Input(input_dim) -> [Dense(128, relu) -> BatchNorm -> Dropout(0.3)]
    -> [Dense(64, relu) -> BatchNorm -> Dropout(0.3)] -> Dense(32, relu)
    -> Dropout(0.2) -> Dense(1, sigmoid).

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.

    Returns
    -------
    keras.Sequential
        The model; the caller is responsible for compiling it.
    """
    net = keras.Sequential()
    net.add(layers.Input(shape=(input_dim,)))

    # Two hidden blocks that share the same Dense -> BatchNorm -> Dropout pattern.
    for units in (128, 64):
        net.add(layers.Dense(units, activation='relu'))
        net.add(layers.BatchNormalization())
        net.add(layers.Dropout(0.3))

    # Narrower final hidden layer with lighter dropout and no batch norm.
    net.add(layers.Dense(32, activation='relu'))
    net.add(layers.Dropout(0.2))

    # Single sigmoid unit: output is a probability-like score in (0, 1).
    net.add(layers.Dense(1, activation='sigmoid'))
    return net

# Seed Python, NumPy and the Keras backend before any weights are created so
# that weight initialisation (and the other seeded random ops) repeat across
# Restart & Run All. 42 matches the random_state used for the data split.
keras.utils.set_random_seed(42)

# The input width is the number of columns produced by the preprocessor.
model = build_model(X_train_proc.shape[1])

model.summary()

In [None]:
# Binary classification setup: Adam at lr=1e-3, cross-entropy loss, and
# AUC + accuracy tracked under the names 'auc' and 'accuracy'
# (the early-stopping callback monitors 'val_auc').
adam_optimizer = keras.optimizers.Adam(learning_rate=1e-3)
tracked_metrics = [
    keras.metrics.AUC(name='auc'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
]

model.compile(
    optimizer=adam_optimizer,
    loss='binary_crossentropy',
    metrics=tracked_metrics
)


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# 'balanced' weights are inversely proportional to class frequency in y_train;
# Keras expects them as a {class_label: weight} dict.
class_labels = [0, 1]
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array(class_labels),
    y=y_train
)

class_weights = dict(zip(class_labels, balanced_weights))


In [None]:
# Inspect the training target before it is converted to a NumPy array:
# class counts, the distinct label values, and the container type.
print(y_train.value_counts())
print(y_train.unique())
print(type(y_train))


In [None]:
# Convert the targets to plain NumPy arrays for Keras.
# np.asarray is a no-op on an ndarray, so this cell can be re-run safely;
# Series.to_numpy() raised AttributeError on a second run because y_train
# had already been rebound to an ndarray.
y_train = np.asarray(y_train)
y_val = np.asarray(y_val)

In [None]:
# Stop once validation AUC has not improved for 3 consecutive epochs and
# roll the model back to the weights of the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=3,
    restore_best_weights=True
)

callbacks = [early_stopping]


In [None]:
# Train for at most 20 epochs (early stopping may end sooner) in batches of
# 1024 rows, weighting classes by `class_weights` and validating on the
# held-out validation split after every epoch.
validation_pair = (X_val_proc, y_val)

history = model.fit(
    x=X_train_proc,
    y=y_train,
    batch_size=1024,
    epochs=20,
    verbose=1,
    callbacks=callbacks,
    validation_data=validation_pair,
    class_weight=class_weights
)

print("\n Training complete")

# Visualizing the training and validation curves (loss and AUC)

In [None]:
import matplotlib.pyplot as plt

# One figure per tracked quantity, each overlaying the training curve and the
# validation curve recorded in `history` across epochs.
curve_specs = [
    ('loss', 'Binary Crossentropy Loss', 'Training vs Validation Loss'),
    ('auc', 'AUC', 'Training vs Validation AUC'),
]

for metric_key, y_label, figure_title in curve_specs:
    val_key = f'val_{metric_key}'
    plt.figure(figsize=(8,5))
    plt.plot(history.history[metric_key], label=f'train_{metric_key}')
    plt.plot(history.history[val_key], label=val_key)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(figure_title)
    plt.legend()
    plt.show()


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Score the held-out test split. `y_pred_probs` holds the sigmoid outputs;
# `y_pred` thresholds them at 0.5 to obtain hard 0/1 predictions.
y_pred_probs = model.predict(X_test_proc, batch_size=1024)
y_pred = (y_pred_probs >= 0.5).astype(int)

# AUC is computed from the raw scores; the other metrics use the hard predictions.
test_accuracy = accuracy_score(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_pred_probs)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("AUC:", test_auc)
print("Confusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)

