In [1]:
# run once
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import joblib


In [14]:
# PARAMETERS
n = 1000
np.random.seed(42)

# Features for budget prediction
family_income = np.random.randint(15000, 200000, n)
monthly_allowance = np.random.randint(2000, 20000, n)
accommodation_type = np.random.choice(["Hostel", "PG", "Home"], n)
group_size = np.random.randint(1, 8, n)
travel_mode = np.random.choice(["Bus", "Bike", "Car", "Train"], n)
weekend_spending = np.random.randint(200, 4000, n)
savings_interest = np.random.choice(["Yes", "No"], n)

# Budget known flag
budget_known = np.random.choice(["Yes", "No"], n, p=[0.35, 0.65])

# Predicted budget (synthetic formula, add noise)
predicted_budget = (monthly_allowance * 0.35) + (weekend_spending * 0.25) + (group_size * 120) + np.random.normal(0, 300, n)
predicted_budget = np.round(np.clip(predicted_budget, 300, None)).astype(int)

# Entered budget if user knows it (simulate realistic values)
entered_budget = np.where(budget_known == "Yes", np.random.randint(500, 5000, n), np.nan)

# Final budget to be used by Models 2 & 3
final_budget = np.where(budget_known == "Yes", entered_budget, predicted_budget)

# Preference inputs
activity_interest = np.random.choice(["Beach", "Trek", "Nightlife", "Food", "Heritage", "WaterSports"], n)
personality = np.random.choice(["Chill", "Adventure", "Culture", "Party"], n)
duration = np.random.choice(["Half-Day", "Full-Day", "Weekend"], n)
group_type = np.random.choice(["Solo", "Couple", "Friends"], n)
indoor_outdoor = np.random.choice(["Indoor", "Outdoor"], n)

# Preference label (simple mapping rule)
preference = np.where(activity_interest=="Beach", "Beach & Chill",
               np.where(activity_interest=="Nightlife", "Nightlife & Party",
               np.where(activity_interest=="Trek","Nature & Trek",
               np.where(activity_interest=="Food","Food & Café",
               np.where(activity_interest=="Heritage","Heritage Travel","Adventure Sports")))))

# Location attributes
distance = np.random.randint(1, 60, n)
rating = np.round(np.random.uniform(3.0, 5.0, n), 2)

# Recommended location (rule-based mapping for synthetic target)
recommended_location = np.where(preference=="Beach & Chill", "Colva Beach",
                         np.where(preference=="Nightlife & Party", "Baga Beach",
                         np.where(preference=="Nature & Trek", "Todo Falls",
                         np.where(preference=="Food & Café", "Good Mantra Cafe",
                         np.where(preference=="Heritage Travel", "Reis Magos Fort",
                         "Chapora River Kayaking")))))

# Build dataframe
df = pd.DataFrame({
    "Budget_Known": budget_known,
    "Family_Income": family_income,
    "Monthly_Allowance": monthly_allowance,
    "Accommodation_Type": accommodation_type,
    "Group_Size": group_size,
    "Travel_Mode": travel_mode,
    "Weekend_Spending": weekend_spending,
    "Savings": savings_interest,
    "Entered_Budget": entered_budget,
    "Predicted_Budget": predicted_budget,
    "Final_Budget": final_budget,
    "Activity_Interest": activity_interest,
    "Personality": personality,
    "Duration": duration,
    "Group_Type": group_type,
    "Indoor_Outdoor": indoor_outdoor,
    "Preference": preference,
    "Distance": distance,
    "Rating": rating,
    "Recommended_Location": recommended_location
})

# quick save and peek
df.to_csv("goa_student_travel_synthetic_data_v2.csv", index=False)
df.head()


Unnamed: 0,Budget_Known,Family_Income,Monthly_Allowance,Accommodation_Type,Group_Size,Travel_Mode,Weekend_Spending,Savings,Entered_Budget,Predicted_Budget,Final_Budget,Activity_Interest,Personality,Duration,Group_Type,Indoor_Outdoor,Preference,Distance,Rating,Recommended_Location
0,No,136958,8988,PG,5,Bus,2308,Yes,,4654,4654.0,Nightlife,Adventure,Half-Day,Solo,Outdoor,Nightlife & Party,22,3.42,Baga Beach
1,Yes,161867,16594,Home,5,Car,1352,Yes,1808.0,7402,1808.0,Trek,Adventure,Weekend,Solo,Outdoor,Nature & Trek,54,3.97,Todo Falls
2,Yes,146932,14977,PG,1,Train,3237,No,3486.0,6759,3486.0,Beach,Adventure,Weekend,Friends,Outdoor,Beach & Chill,12,3.17,Colva Beach
3,No,118694,2009,PG,6,Bike,2932,No,,2199,2199.0,Trek,Culture,Weekend,Couple,Indoor,Nature & Trek,18,4.69,Todo Falls
4,No,134879,18644,PG,2,Car,2215,No,,7174,7174.0,Heritage,Culture,Full-Day,Couple,Outdoor,Heritage Travel,56,4.44,Reis Magos Fort


In [15]:
# Copy of df
data = df.copy()

# Label encoders for categorical columns that will be used by models
le_activity = LabelEncoder().fit(data["Activity_Interest"])
le_personality = LabelEncoder().fit(data["Personality"])
le_duration = LabelEncoder().fit(data["Duration"])
le_group_type = LabelEncoder().fit(data["Group_Type"])
le_indoor_outdoor = LabelEncoder().fit(data["Indoor_Outdoor"])
le_preference = LabelEncoder().fit(data["Preference"])
le_location = LabelEncoder().fit(data["Recommended_Location"])

# Apply encodings
data["Activity_Interest_enc"] = le_activity.transform(data["Activity_Interest"])
data["Personality_enc"] = le_personality.transform(data["Personality"])
data["Duration_enc"] = le_duration.transform(data["Duration"])
data["Group_Type_enc"] = le_group_type.transform(data["Group_Type"])
data["Indoor_Outdoor_enc"] = le_indoor_outdoor.transform(data["Indoor_Outdoor"])
data["Preference_enc"] = le_preference.transform(data["Preference"])
data["Location_enc"] = le_location.transform(data["Recommended_Location"])

# Numeric scaler for regression inputs
scaler = StandardScaler().fit(data[["Family_Income", "Monthly_Allowance", "Group_Size", "Weekend_Spending"]])


In [16]:
joblib.dump(le_activity, "le_activity.joblib")
joblib.dump(le_personality, "le_personality.joblib")
joblib.dump(le_duration, "le_duration.joblib")
joblib.dump(le_group_type, "le_group_type.joblib")
joblib.dump(le_indoor_outdoor, "le_indoor_outdoor.joblib")
joblib.dump(le_preference, "le_preference.joblib")
joblib.dump(le_location, "le_location.joblib")
joblib.dump(scaler, "scaler_numeric.joblib")


['scaler_numeric.joblib']

In [17]:
# repaired_model1_kfold.py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# -------------- Load data --------------
df = pd.read_csv("goa_student_travel_synthetic_data.csv")

# Use only rows where budget is unknown (Model 1 is used only then)
df_model1 = df[df["Budget_Known"] == "No"].copy()

# Features and target (choose the same features you used earlier)
X = df_model1[["Family_Income", "Monthly_Allowance", "Group_Size", "Weekend_Spending"]].values
y = df_model1["Predicted_Budget"].values  # numeric target

# -------------- Train/Test split --------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------- Scale features --------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------- Train model --------------
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# -------------- Predictions --------------
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# -------------- Metrics functions (use np.sqrt for RMSE) --------------
def regression_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))  # RMSE computed via sqrt
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2

train_mae, train_rmse, train_r2 = regression_metrics(y_train, y_train_pred)
test_mae, test_rmse, test_r2 = regression_metrics(y_test, y_test_pred)

print("Model 1 (Budget prediction) — Train vs Test")
print(f"Train MAE:  {train_mae:.3f}   Train RMSE: {train_rmse:.3f}   Train R2: {train_r2:.3f}")
print(f"Test  MAE:  {test_mae:.3f}   Test  RMSE: {test_rmse:.3f}   Test  R2: {test_r2:.3f}")
print()

# -------------- K-Fold cross-validation (5 folds) --------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []
r2_scores = []

# We will scale folds using StandardScaler fit only on train fold
for fold, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]
    
    # scale
    scaler_fold = StandardScaler()
    X_tr_s = scaler_fold.fit_transform(X_tr)
    X_val_s = scaler_fold.transform(X_val)
    
    # train
    model_fold = LinearRegression()
    model_fold.fit(X_tr_s, y_tr)
    
    # predict
    y_val_pred = model_fold.predict(X_val_s)
    
    # metrics (note: RMSE = sqrt(MSE) -- no 'squared' kw used)
    rmse_fold = np.sqrt(mean_squared_error(y_val, y_val_pred))
    r2_fold = r2_score(y_val, y_val_pred)
    
    rmse_scores.append(rmse_fold)
    r2_scores.append(r2_fold)
    
    print(f"Fold {fold:>2}: RMSE = {rmse_fold:.3f}, R2 = {r2_fold:.3f}")

print()
print(f"K-Fold RMSE : mean = {np.mean(rmse_scores):.3f}, std = {np.std(rmse_scores):.3f}")
print(f"K-Fold R2   : mean = {np.mean(r2_scores):.3f}, std = {np.std(r2_scores):.3f}")


Model 1 (Budget prediction) — Train vs Test
Train MAE:  103.792   Train RMSE: 118.504   Train R2: 0.997
Test  MAE:  100.473   Test  RMSE: 116.009   Test  R2: 0.997

Fold  1: RMSE = 116.009, R2 = 0.997
Fold  2: RMSE = 119.878, R2 = 0.997
Fold  3: RMSE = 122.196, R2 = 0.997
Fold  4: RMSE = 117.817, R2 = 0.997
Fold  5: RMSE = 115.097, R2 = 0.997

K-Fold RMSE : mean = 118.200, std = 2.583
K-Fold R2   : mean = 0.997, std = 0.000


In [18]:
# -------------------------
# MODEL 2 – Preference Classifier
# -------------------------

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Load dataset
df = pd.read_csv("goa_student_travel_synthetic_data.csv")

# Copy to avoid modifying original
df2 = df.copy()

# Encode all categorical columns needed
label_cols = ["Activity_Interest", "Personality", "Duration",
              "Group_Type", "Indoor_Outdoor", "Preference"]

encoders = {}  # store encoders if needed later

for col in label_cols:
    enc = LabelEncoder()
    df2[col] = enc.fit_transform(df2[col])
    encoders[col] = enc

# Features (X) and Label (y)
X2 = df2[["Activity_Interest", "Personality", "Duration",
          "Group_Type", "Indoor_Outdoor"]]

y2 = df2["Preference"]

# Train–test split
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42
)

# Model
model2 = RandomForestClassifier(random_state=42)
model2.fit(X2_train, y2_train)

# Predictions
y2_train_pred = model2.predict(X2_train)
y2_test_pred = model2.predict(X2_test)

# Accuracy
train_acc = accuracy_score(y2_train, y2_train_pred)
test_acc = accuracy_score(y2_test, y2_test_pred)

# Macro F1
train_f1 = f1_score(y2_train, y2_train_pred, average='macro')
test_f1 = f1_score(y2_test, y2_test_pred, average='macro')

# Display Results
print("MODEL 2 — Preference Prediction")
print("Train Accuracy:", round(train_acc, 3))
print("Test Accuracy :", round(test_acc, 3))
print("Train Macro F1:", round(train_f1, 3))
print("Test Macro F1 :", round(test_f1, 3))


MODEL 2 — Preference Prediction
Train Accuracy: 1.0
Test Accuracy : 1.0
Train Macro F1: 1.0
Test Macro F1 : 1.0


In [19]:
# Model 3 — Recommendation classifier (full code)
# Assumes "goa_student_travel_synthetic_data.csv" exists and has columns:
# Final_Budget, Preference, Distance, Rating, Recommended_Location

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# 1) Load data
df = pd.read_csv("goa_student_travel_synthetic_data.csv")

# 2) Basic sanity check (optional)
print("Rows:", len(df))
print("Columns:", df.columns.tolist())

# 3) Encode categorical labels
le_loc = LabelEncoder()
le_pref = LabelEncoder()

# Ensure no missing values in target
df = df.dropna(subset=["Recommended_Location", "Preference", "Final_Budget", "Distance", "Rating"])

df["loc_encoded"] = le_loc.fit_transform(df["Recommended_Location"])
df["pref_encoded"] = le_pref.fit_transform(df["Preference"])

# 4) Features and target
X = df[["Final_Budget", "pref_encoded", "Distance", "Rating"]].copy()
y = df["loc_encoded"]

# 5) Train-test split (stratify to preserve class distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# 6) Scale numeric features (Final_Budget, Distance, Rating). pref_encoded is numeric but small-range; still safe to scale.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7) Model definition & training
model3 = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model3.fit(X_train_scaled, y_train)

# 8) Predictions
y_train_pred = model3.predict(X_train_scaled)
y_test_pred  = model3.predict(X_test_scaled)

# 9) Metrics: Accuracy + Macro F1
train_acc = accuracy_score(y_train, y_train_pred)
test_acc  = accuracy_score(y_test, y_test_pred)

train_f1_macro = f1_score(y_train, y_train_pred, average="macro")
test_f1_macro  = f1_score(y_test, y_test_pred, average="macro")

print("MODEL 3 — Location Recommendation")
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test  Accuracy: {test_acc:.4f}")
print(f"Train Macro F1: {train_f1_macro:.4f}")
print(f"Test  Macro F1: {test_f1_macro:.4f}")

# 10) Confusion matrix + classification report (on test set)
print("\nTest set classification report:")
print(classification_report(y_test, y_test_pred, target_names=le_loc.classes_))

print("Confusion matrix (test):")
print(confusion_matrix(y_test, y_test_pred))

# 11) K-Fold cross-validation (Stratified) for accuracy and macro-f1
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score uses the original estimator and data; scale inside cross-val using a pipeline would be cleaner,
# but for simplicity we will create a pipeline to avoid data leakage during CV.
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

# Recreate a pipeline: scaler + classifier
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])

# Use original X and y (not pre-scaled) for CV with pipeline
cv_accuracy = cross_val_score(pipeline, X, y, cv=skf, scoring="accuracy", n_jobs=-1)
cv_f1_macro = cross_val_score(pipeline, X, y, cv=skf, scoring="f1_macro", n_jobs=-1)

print("\n5-Fold CV results:")
print(f"Accuracy: mean={cv_accuracy.mean():.4f}  std={cv_accuracy.std():.4f}")
print(f"Macro F1: mean={cv_f1_macro.mean():.4f}  std={cv_f1_macro.std():.4f}")

# 12) Interpretation helper (quick bias/variance check)
print("\nBias/Variance quick check:")
print("- If train >> test metrics: possible overfitting (high variance).")
print("- If both train and test low: possible underfitting (high bias).")
print(f"Observed: train_acc={train_acc:.4f}, test_acc={test_acc:.4f}, train_f1={train_f1_macro:.4f}, test_f1={test_f1_macro:.4f}")


Rows: 3000
Columns: ['Budget_Known', 'Family_Income', 'Monthly_Allowance', 'Accommodation_Type', 'Group_Size', 'Travel_Mode', 'Weekend_Spending', 'Savings', 'Entered_Budget', 'Predicted_Budget', 'Final_Budget', 'Activity_Interest', 'Personality', 'Duration', 'Group_Type', 'Indoor_Outdoor', 'Preference', 'Distance', 'Rating', 'Recommended_Location']
MODEL 3 — Location Recommendation
Train Accuracy: 1.0000
Test  Accuracy: 1.0000
Train Macro F1: 1.0000
Test  Macro F1: 1.0000

Test set classification report:
                        precision    recall  f1-score   support

            Baga Beach       1.00      1.00      1.00       107
Chapora River Kayaking       1.00      1.00      1.00        98
           Colva Beach       1.00      1.00      1.00        98
      Good Mantra Cafe       1.00      1.00      1.00        93
       Reis Magos Fort       1.00      1.00      1.00       103
            Todo Falls       1.00      1.00      1.00       101

              accuracy                  

In [20]:
import pickle

# Save model to file
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
