In [None]:
import pandas as pd

In [None]:
# Load the travel-preferences CSV (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv("../data/travel_preferences.csv")
df.head()

In [None]:
print(df.columns)

In [None]:
# Drop the ten Rating_* and ten Rec_* slot columns together with five other
# columns that the remaining cells do not reference.
df = df.drop(
    columns=[
        *(f'Rating_{slot}' for slot in range(10)),
        *(f'Rec_{slot}' for slot in range(10)),
        'where_to_go',
        'where_to_go_exactly',
        'Model',
        'Retrieval',
        'DynaMatch',
    ]
)

In [None]:
df.head()

In [None]:
# Write the reduced frame to the current working directory, without the row index.
df.to_csv('true_travel_data.csv', index=False)

In [None]:
# Check for missing values: per-column count and percentage of rows affected,
# listing only the columns that actually have gaps (worst first).
print("Missing values po kolonama:")
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)
missing_df = missing.to_frame('Missing Count').assign(Percentage=missing_pct)
print(
    missing_df
    .loc[lambda frame: frame['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

In [None]:
# Keep only rows that have no missing value in any column.
df = df.dropna()

In [None]:
df.isnull().sum()

In [None]:
df.shape[0]
# We can see that some rows were lost, but we still have plenty of instances.

In [None]:
import ast

def to_list(x):
    """Coerce one cell of a list-valued column into a Python list.

    Parameters
    ----------
    x : list, str, or missing value
        A list is returned unchanged. A string is expected to hold a Python
        literal such as ``"['Rome', 'Paris']"``. Missing values (as detected
        by ``pd.isna``) and blank or ``'[]'`` strings mean "no entries".

    Returns
    -------
    list
        The parsed value (a list for list literals); ``[]`` for empty or
        missing input.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is not a valid
        Python literal.
    """
    # Lists must be recognised BEFORE pd.isna is called: on a list, pd.isna
    # returns an element-wise array whose truth value is ambiguous, so the
    # previous ordering raised ValueError whenever this conversion was run a
    # second time on already-converted data.
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        text = x.strip()
        # Whitespace-only cells are treated like empty ones instead of
        # crashing inside literal_eval.
        if text in ('', '[]'):
            return []
        return ast.literal_eval(text)
    if pd.isna(x):
        return []
    return ast.literal_eval(x)

# Turn every list-valued column from its textual form into real Python lists.
for col in (
    'yes_swipes', 'no_swipes', 'maybe_swipes',
    'age_ranges', 'budget_ranges',
    'interests', 'sceneries',
):
    df[col] = df[col].map(to_list)


In [None]:
df.head()

In [None]:
def traveller_type(age_ranges):
    """Label a travelling party by the number of age-range entries it lists.

    Zero or one entry gives 'solo', exactly two gives 'couple', and three or
    more gives 'friends'.
    """
    party_size = len(age_ranges)
    if party_size > 2:
        return 'friends'
    return 'couple' if party_size == 2 else 'solo'

# Derive the party label for every row from its parsed `age_ranges` list.
df['traveller_type'] = df['age_ranges'].apply(traveller_type)


In [None]:
df.head()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode `interests`: one indicator column per distinct label,
# named "interest_<label>", appended to the frame on the same row index.
mlb_interests = MultiLabelBinarizer()
interest_df = pd.DataFrame(
    data=mlb_interests.fit_transform(df['interests']),
    index=df.index,
    columns=[f"interest_{label}" for label in mlb_interests.classes_],
)

df = pd.concat([df, interest_df], axis='columns')


In [None]:
# The raw list column is no longer needed once its indicator columns exist.
df = df.drop(columns='interests')

In [None]:
# Multi-hot encode `budget_ranges`: one indicator column per distinct range,
# named "budget_<range>", appended to the frame on the same row index.
mlb_budget = MultiLabelBinarizer()
budget_df = pd.DataFrame(
    data=mlb_budget.fit_transform(df['budget_ranges']),
    index=df.index,
    columns=[f"budget_{label}" for label in mlb_budget.classes_],
)

df = pd.concat([df, budget_df], axis='columns')


In [None]:
# Multi-hot encode `sceneries`: one indicator column per distinct scenery,
# named "scene_<label>", appended to the frame on the same row index.
mlb_scene = MultiLabelBinarizer()
scene_df = pd.DataFrame(
    data=mlb_scene.fit_transform(df['sceneries']),
    index=df.index,
    columns=[f"scene_{label}" for label in mlb_scene.classes_],
)

df = pd.concat([df, scene_df], axis='columns')


In [None]:
# Multi-hot encode `age_ranges`: one indicator column per distinct range,
# named "age_<range>", appended to the frame on the same row index.
mlb_age = MultiLabelBinarizer()
age_df = pd.DataFrame(
    data=mlb_age.fit_transform(df['age_ranges']),
    index=df.index,
    columns=[f"age_{label}" for label in mlb_age.classes_],
)

df = pd.concat([df, age_df], axis='columns')


In [None]:
df.head()

In [None]:
df.shape[1]

In [None]:
# Remove the remaining raw list columns now that their indicator columns exist.
df = df.drop(columns=['sceneries', 'budget_ranges', 'age_ranges'])

In [None]:
df.columns

In [None]:
# Load the destinations: the feature table is stored as four CSV parts
# (part_01 .. part_04) that are stacked into a single frame.
dest1, dest2, dest3, dest4 = (
    pd.read_csv(f'../data/DestinationFeatures/city_onehot_features_part_{part:02d}.csv')
    for part in range(1, 5)
)

dest = pd.concat([dest1, dest2, dest3, dest4], ignore_index=True)

# "City, Country" label built from the whitespace-stripped name columns.
dest['destination_full'] = (
    dest['city'].astype(str).str.strip()
    .str.cat(dest['country'].astype(str).str.strip(), sep=', ')
)

import re, unicodedata
def norm_key(s: str) -> str:
    """Build a normalised lookup key for a destination name.

    The goal is to get rid of special characters in city names, so that the
    same place written with different accents, apostrophes or punctuation
    maps to one key.

    Steps: lower-case; NFKD-decompose and drop combining marks (removes
    accents); spell '&' as 'and'; delete apostrophes, back-ticks and dots
    outright; replace every other run of characters outside ``a-z0-9`` with a
    single space; trim.

    Parameters
    ----------
    s : str
        Raw name. Non-string values are converted with ``str()`` first.

    Returns
    -------
    str
        The normalised key, e.g. ``"Xi'an, China"`` -> ``"xian china"``.
    """
    s = str(s).strip().lower()
    s = unicodedata.normalize('NFKD', s)
    s = ''.join(ch for ch in s if not unicodedata.combining(ch))
    s = s.replace('&', 'and')
    # The character class used to contain a mis-encoded (mojibake) sequence in
    # place of the typographic apostrophe, so a curly apostrophe was NOT
    # deleted here and became a space in the next step -- giving "xi an"
    # for the curly form but "xian" for the ASCII form of the same name.
    # Explicit escapes keep the source ASCII-only and cover both curly quotes.
    s = re.sub(r"[\u2018\u2019'`\.]", "", s)
    s = re.sub(r"[^a-z0-9]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Attach the normalised join key to every destination row.
dest['dest_key'] = dest['destination_full'].apply(norm_key)

# Keep only the key plus the feature columns. Different spellings (or repeated
# rows across the CSV parts) can collapse to the same key; keep the first row
# per key so that a later join on `dest_key` cannot fan one interaction out
# into several identical-looking training rows.
dest_feat = (
    dest.drop(columns=['city', 'country', 'destination_full'])
        .drop_duplicates(subset='dest_key', keep='first')
        .reset_index(drop=True)
)

# Set of known keys for fast membership tests when filtering swipes.
dest_keys = set(dest_feat['dest_key'])
print('Destinations loaded:', len(dest_feat))

In [None]:
dest1.head(10)

In [None]:
import random
random.seed(42)

# We do not want the negatives to swamp the positives: each user contributes
# at most NEG_RATIO disliked destinations per liked one, capped at
# MAX_NEG_PER_USER.
NEG_RATIO = 2
MAX_NEG_PER_USER = 40

rows = []  # the instances we will use: [user_id, destination, label]
skipped = 0

for _, row in df.iterrows():
    user_id = row['id']

    liked = [name for name in row['yes_swipes'] if norm_key(name) in dest_keys]
    if not liked:
        # Without a positive instance for this user the model has nothing
        # to learn from, so the user is skipped.
        skipped += 1
        continue

    disliked = [name for name in row['no_swipes'] if norm_key(name) in dest_keys]

    # Down-sample the negatives only when there are more than the allowance.
    n_keep = min(len(disliked), MAX_NEG_PER_USER, NEG_RATIO * len(liked))
    if 0 < n_keep < len(disliked):
        disliked = random.sample(disliked, n_keep)

    rows.extend([user_id, name, 1] for name in liked)
    rows.extend([user_id, name, 0] for name in disliked)

interactions = pd.DataFrame(rows, columns=['user_id', 'destination', 'label'])
interactions['dest_key'] = interactions['destination'].map(norm_key)

print('Skipped users:', skipped)

In [None]:
df.columns

In [None]:
# Instead of (user, [cities]) we want one row per (user, city) pair.
feature_cols = (
    # scalar user attributes
    ['season', 'activity_level', 'safety_conscious', 'popularity', 'traveller_type']
    # interest indicators
    + ['interest_Adventure', 'interest_Beach', 'interest_Cuisine', 'interest_Culture',
       'interest_History', 'interest_Nature', 'interest_Nightlife', 'interest_Shopping']
    # budget indicators
    + ['budget_0-49', 'budget_50-99', 'budget_100-249', 'budget_300+']
    # scenery indicators
    + ['scene_Desert', 'scene_Jungle', 'scene_Lake', 'scene_Mountain',
       'scene_Plains', 'scene_Rural', 'scene_Sea', 'scene_Urban']
    # age-range indicators
    + ['age_0-19', 'age_20-39', 'age_40-59', 'age_60+']
)

# Per-user feature frame, keyed by `user_id` so it lines up with `interactions`.
user_feat = df.loc[:, ['id', *feature_cols]].rename(columns={'id': 'user_id'}).copy()

# Destination feature columns: everything except the join key.
dest_cols = [name for name in dest_feat.columns if name != 'dest_key']

# Final training table: interactions enriched with user and destination
# features; rows left with any missing value after the joins are discarded.
train_df = (
    interactions
    .merge(user_feat, how='left', on='user_id')
    .merge(dest_feat, how='left', on='dest_key')
    .dropna()
)

print(train_df.shape)
train_df.head()



In [None]:
train_df.shape[0]

In [None]:
train_df.head(15)

In [None]:
# Target: the swipe label as an integer. Inputs: every column except the label
# and the identifier / key columns.
y = train_df['label'].astype(int)
X = train_df.drop(columns=['label', 'user_id', 'destination', 'dest_key'])

# Columns that get one-hot encoded; every other input column is listed as numeric.
categorical_cols = [
    'season',
    'activity_level',
    'safety_conscious',
    'popularity',
    'traveller_type',
]
numeric_cols = [name for name in X.columns if name not in categorical_cols]

In [None]:

from sklearn.model_selection import GroupShuffleSplit

# Split by user so that all rows of one user_id land in the same partition.
# First hold out 30% of the users, then halve that hold-out into validation
# and test.
groups = train_df['user_id']

split1 = GroupShuffleSplit(n_splits=1, test_size=0.30, random_state=42)
train_idx, tmp_idx = next(split1.split(X, y, groups=groups))

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_tmp, y_tmp, g_tmp = X.iloc[tmp_idx], y.iloc[tmp_idx], groups.iloc[tmp_idx]

split2 = GroupShuffleSplit(n_splits=1, test_size=0.50, random_state=42)
val_idx, test_idx = next(split2.split(X_tmp, y_tmp, groups=g_tmp))

X_val, y_val = X_tmp.iloc[val_idx], y_tmp.iloc[val_idx]
X_test, y_test = X_tmp.iloc[test_idx], y_tmp.iloc[test_idx]

print(X_train.shape, X_val.shape, X_test.shape)


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical columns (categories not seen during fit are
# ignored at transform time); every other listed column is passed through.
preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols),
    ]
)

# Fit on the training partition only, then reuse the fitted transformer.
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc, X_test_proc = (preprocessor.transform(part) for part in (X_val, X_test))

print(X_train_proc.shape, X_val_proc.shape, X_test_proc.shape)


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def build_model(input_dim):
    """Assemble the (uncompiled) feed-forward binary classifier.

    Layer stack: Dense(128, relu) -> BatchNorm -> Dropout(0.3) ->
    Dense(64, relu) -> BatchNorm -> Dropout(0.3) -> Dense(32, relu) ->
    Dropout(0.2) -> Dense(1, sigmoid).

    Parameters
    ----------
    input_dim : int
        Number of features per sample.

    Returns
    -------
    keras.Sequential
        The model, not yet compiled.
    """
    # (units, followed by batch-norm?, dropout rate) for each hidden stage
    hidden_stages = ((128, True, 0.3), (64, True, 0.3), (32, False, 0.2))

    stack = [layers.Input(shape=(input_dim,))]
    for units, with_batch_norm, drop_rate in hidden_stages:
        stack.append(layers.Dense(units, activation='relu'))
        if with_batch_norm:
            stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(drop_rate))
    stack.append(layers.Dense(1, activation='sigmoid'))

    return keras.Sequential(stack)

# Seed the Python, NumPy and Keras-backend random generators before any weight
# is created, so that initialisation, dropout and batch shuffling repeat on a
# fresh "Restart & Run All". The sampling and splitting cells already fix
# their seed to 42; the network previously did not.
# (keras.utils.set_random_seed requires TensorFlow/Keras >= 2.7.)
keras.utils.set_random_seed(42)

# Input width is the number of columns produced by the preprocessor.
model = build_model(X_train_proc.shape[1])

model.summary()

In [None]:
# Configure training: Adam optimiser with learning rate 1e-3, binary
# cross-entropy loss, and two reported metrics named 'auc' and 'accuracy'
# (their validation counterparts therefore appear as 'val_auc' / 'val_accuracy').
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=[
        keras.metrics.AUC(name='auc'),
        keras.metrics.BinaryAccuracy(name='accuracy')
    ]
)


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Balanced class weights for the labels 0 and 1, computed from the training labels.
# NOTE(review): `class_weights` is computed again in a later cell (there with
# classes=np.unique(y_train)) before it is first used, so this cell looks
# redundant -- confirm and remove one of the two.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),
    y=y_train
)

In [None]:
print(y_train.value_counts())
print(y_train.unique())
print(type(y_train))


In [None]:
# Labels as float32 NumPy arrays (these are what get passed to model.fit).
# NOTE(review): the same three assignments are repeated verbatim in a later
# cell; one of the two copies can go.
y_train_np = y_train.to_numpy().astype('float32')
y_val_np   = y_val.to_numpy().astype('float32')
y_test_np  = y_test.to_numpy().astype('float32')

In [None]:
# Early stopping on validation AUC (higher is better): stop after 3 epochs
# without improvement and restore the weights of the best epoch.
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor='val_auc',
        patience=3,
        mode='max',
        restore_best_weights=True
    )
]


In [None]:
# Cast labels (as NumPy arrays) and the preprocessed feature matrices to float32.
y_train_np, y_val_np, y_test_np = (
    labels.to_numpy().astype('float32')
    for labels in (y_train, y_val, y_test)
)

X_train_proc, X_val_proc, X_test_proc = (
    features.astype("float32")
    for features in (X_train_proc, X_val_proc, X_test_proc)
)

In [None]:
y_train.dtype

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Balanced weights for the classes present in the training labels, exposed as
# the {label: weight} mapping that is handed to model.fit.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)

class_weight_dict = {label: class_weights[label] for label in (0, 1)}
print(f"Class weights: {class_weight_dict}")


In [None]:

# Train for at most 20 epochs with batches of 64 samples, applying the class
# weights to the loss and evaluating on the validation split each epoch; the
# callbacks may end training earlier. `history` keeps the per-epoch metrics.
history = model.fit(
    X_train_proc, y_train_np,
    validation_data=(X_val_proc, y_val_np),
    epochs=20,
    batch_size=64,
    class_weight=class_weight_dict,
    callbacks=callbacks
)

print("\n Training complete")

In [None]:
import matplotlib.pyplot as plt

# One figure per tracked metric, each showing the training curve against the
# validation curve over the epochs.
for metric, y_label, title in (
    ('loss', 'Binary Crossentropy Loss', 'Training vs Validation Loss'),
    ('auc', 'AUC', 'Training vs Validation AUC'),
):
    plt.figure(figsize=(8, 5))
    plt.plot(history.history[metric], label=f'train_{metric}')
    plt.plot(history.history[f'val_{metric}'], label=f'val_{metric}')
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.show()


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Score the test partition: predicted probabilities from the network, turned
# into hard 0/1 labels with a 0.5 cut-off.
y_pred_probs = model.predict(X_test_proc, batch_size=1024)
y_pred = (y_pred_probs >= 0.5).astype(int)

# Threshold-based metrics use the hard labels; AUC uses the probabilities.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("AUC:", roc_auc_score(y_true=y_test, y_score=y_pred_probs))
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

