**Step 1: Fetch Weather & Temperature Data**

In [None]:
import pandas as pd
import requests
import numpy as np

# Loading the dataset
file_path = "/content/Traveler_Trip_Dataset.csv"
df = pd.read_csv(file_path)

# Displaying the first few rows of the dataset
print("Dataset Preview:\n", df.head())

# OpenWeatherMap API Key
API_KEY = " " # Replace space with your own API key

# Function to fetch weather data
def get_weather(city):
    if pd.isna(city):  # Handling NaN values
        return np.nan, "Unknown"

    base_url = "http://api.openweathermap.org/data/2.5/weather"
    params = {"q": city, "appid": API_KEY, "units": "metric"}
    try:
        response = requests.get(base_url, params=params, timeout=5)
        response.raise_for_status()
        data = response.json()
        return data["main"]["temp"], data["weather"][0]["description"]
    except:
        return np.nan, "Unknown"

# Dropping NaN values in "Destination" before fetching unique destinations
df.dropna(subset=["Destination"], inplace=True)

# Extracting unique destinations and fetching weather data
unique_destinations = df["Destination"].unique()
weather_data = {city: get_weather(city) for city in unique_destinations}

# Applying weather data safely
df["Weather_Data"] = df["Destination"].map(weather_data)

# Expanding tuples into separate columns
df["Temperature"] = df["Weather_Data"].apply(lambda x: x[0] if isinstance(x, tuple) else np.nan)
df["Weather"] = df["Weather_Data"].apply(lambda x: x[1] if isinstance(x, tuple) else "Unknown")

# Dropping temporary column
df.drop(columns=["Weather_Data"], inplace=True)

# Handling missing values
df["Temperature"].fillna(df["Temperature"].mean(), inplace=True)
df["Weather"] = df["Weather"].astype(str)

# Encoding weather as categorical data
df["Weather"] = df["Weather"].astype("category").cat.codes

print("\nUpdated Dataset with Weather Information:\n", df.head())


Dataset Preview:
    Trip ID       Destination Start date   End date  Duration (days)  \
0        1        London, UK   5/1/2023   5/8/2023              7.0   
1        2  Phuket, Thailand  6/15/2023  6/20/2023              5.0   
2        3   Bali, Indonesia   7/1/2023   7/8/2023              7.0   
3        4     New York, USA  8/15/2023  8/29/2023             14.0   
4        5      Tokyo, Japan  9/10/2023  9/17/2023              7.0   

   Traveler name  Traveler age Traveler gender Traveler nationality  \
0     John Smith          35.0            Male             American   
1       Jane Doe          28.0          Female             Canadian   
2      David Lee          45.0            Male               Korean   
3  Sarah Johnson          29.0          Female              British   
4     Kim Nguyen          26.0          Female           Vietnamese   

  Accommodation type Accommodation cost Transportation type  \
0              Hotel               1200              Flight   
1 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Temperature"].fillna(df["Temperature"].mean(), inplace=True)


**Step 2: Data Cleaning & Preprocessing**

In [12]:
from sklearn.preprocessing import LabelEncoder

# Handling missing values
df.ffill(inplace=True)  # Forward fill

# Encoding categorical columns
categorical_columns = ["Destination", "Traveler Type", "Purpose"]
label_encoders = {}
for col in categorical_columns:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Converting numeric values properly
numeric_columns = ["Accommodation cost", "Transportation cost", "Duration (days)"]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Filling NaN values in numeric columns with the median
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())

print("Data Cleaning & Encoding Complete!\n")
print(df.head(10))


Data Cleaning & Encoding Complete!

   Trip ID  Destination  Start date    End date  Duration (days)  \
0        1           30    5/1/2023    5/8/2023              7.0   
1        2           42   6/15/2023   6/20/2023              5.0   
2        3            6    7/1/2023    7/8/2023              7.0   
3        4           36   8/15/2023   8/29/2023             14.0   
4        5           57   9/10/2023   9/17/2023              7.0   
5        6           38   10/5/2023  10/10/2023              5.0   
6        7           54  11/20/2023  11/30/2023             10.0   
7        8           44    1/5/2024   1/12/2024              7.0   
8        9            1   2/14/2024   2/21/2024              7.0   
9       10           20   3/10/2024   3/17/2024              7.0   

   Traveler name  Traveler age Traveler gender Traveler nationality  \
0     John Smith          35.0            Male             American   
1       Jane Doe          28.0          Female             Canadian   
2 

**Step 3: Feature Engineering**

In [13]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Derived features
df["Cost per day"] = (df["Accommodation cost"] + df["Transportation cost"]) / df["Duration (days)"]
df["Trip length category"] = pd.cut(df["Duration (days)"], bins=[0, 5, 10, 15], labels=["Short", "Medium", "Long"])

# Encoding categorical features
categorical_columns = ["Trip length category", "Transportation type", "Accommodation type"]
for col in categorical_columns:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Selecting features & target
features = ["Traveler age", "Duration (days)", "Accommodation cost", "Transportation cost",
            "Cost per day", "Trip length category", "Transportation type", "Accommodation type",
            "Weather", "Temperature"]
target = "Destination"

X = df[features]
y = df[target]

# Handling class imbalance
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Normalizing the features
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)

print("Feature Engineering Complete!")


Feature Engineering Complete!


**Step 4: Sentiment Analysis**

In [14]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder

# Download VADER lexicon (if not already downloaded)
nltk.download("vader_lexicon")

# Initializing Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

# User mood input
user_mood = input("How are you feeling about your trip? (Excited, Nervous, etc.): ").strip().capitalize()

# Predefined mood sentiments
mood_mapping = {
    "Excited": "Positive", "Happy": "Positive", "Thrilled": "Positive",
    "Relaxed": "Positive", "Content": "Positive", "Adventurous": "Positive",
    "Neutral": "Neutral", "Okay": "Neutral", "Fine": "Neutral",
    "Nervous": "Negative", "Worried": "Negative", "Anxious": "Negative",
    "Stressed": "Negative", "Tired": "Negative", "Sad": "Negative",
    "Fearful": "Negative", "Disappointed": "Negative"
}

# Assigning sentiment category based on predefined mapping
sentiment_category = mood_mapping.get(user_mood, "Neutral")

# Calculating sentiment score using VADER
sentiment_score = sia.polarity_scores(user_mood)["compound"]

# Print Sentiment Analysis Output
print(f"\n📝 Sentiment Analysis Result:")
print(f"   - User Mood: {user_mood}")
print(f"   - Sentiment Category: {sentiment_category}")
print(f"   - Sentiment Score: {sentiment_score:.4f}")
print("\nSentiment Analysis Complete!")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


How are you feeling about your trip? (Excited, Nervous, etc.): Thrilled

📝 Sentiment Analysis Result:
   - User Mood: Thrilled
   - Sentiment Category: Positive
   - Sentiment Score: 0.4404

Sentiment Analysis Complete!


**Step 5: Model Selection & Training**

In [15]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle  # To save the trained model

def train_xgboost_model(X, y):
    """
    Train an XGBoost model and return the trained model along with accuracy.
    """
    # Train-test split (80% training, 20% testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Initializing XGBoost model (Removed use_label_encoder=False to avoid warnings)
    model = XGBClassifier(eval_metric="mlogloss", random_state=42)

    # Training the model
    model.fit(X_train, y_train)

    # Predicting on test data
    y_pred = model.predict(X_test)

    # Calculating accuracy
    accuracy = accuracy_score(y_test, y_pred) * 100

    print(f"\n✅ Model Training Complete! Accuracy: {accuracy:.2f}%")

    # Saving the trained model as a pickle file(.pkl file)
    model_filename = "/content/smart_travel_recommendation.pkl"
    with open(model_filename, "wb") as model_file:
        pickle.dump(model, model_file)

    print(f"📁 Model saved successfully as: {model_filename}")

    return model, model_filename

# Training the model and getting the saved model file path
trained_model, model_path = train_xgboost_model(X_resampled, y_resampled)



✅ Model Training Complete! Accuracy: 96.39%
📁 Model saved successfully as: /content/smart_travel_recommendation.pkl


**Step 6: Interactive Recommendation System**

In [17]:
def recommend_trip(age, budget, trip_type, scaler, model, feature_names, label_encoders, top_n=3):
    """
    Recommend the best destinations based on user inputs.
    Ensures input matches the trained model’s expected feature format.
    """
    # Ensuring trip_type is encoded correctly
    trip_type_mapping = {"Short": 0, "Medium": 1, "Long": 2}
    trip_type_encoded = trip_type_mapping.get(trip_type, 1)  # Default to "Medium" if input is invalid

    # Creating a DataFrame with the same structure as training data
    input_data = pd.DataFrame(columns=feature_names)

    # Filling in user inputs
    input_data["Traveler age"] = [age]
    input_data["Cost per day"] = [budget]
    input_data["Trip length category"] = [trip_type_encoded]

    # Filling in other required features with default values (median for numeric, mode for categorical)
    for col in feature_names:
        if col not in input_data.columns:
            if col in ["Weather", "Temperature", "Accommodation cost", "Transportation cost", "Duration (days)"]:
                input_data[col] = df[col].median()
            else:
                input_data[col] = df[col].mode()[0]  # Most frequent category

    # Ensuring columns are in the correct order
    input_data = input_data[feature_names]

    # Normalizing using the same scaler as training
    input_scaled = scaler.transform(input_data)

    # Making a prediction (returning probabilities for multiple recommendations)
    prediction_probs = model.predict_proba(input_scaled)

    # Getting top N destinations with highest probabilities
    top_indices = np.argsort(prediction_probs[0])[-top_n:][::-1]
    top_destinations = label_encoders["Destination"].inverse_transform(top_indices)

    return top_destinations

# Example user input
user_age = int(input("Enter your age: "))
user_budget = float(input("Enter your budget per day: "))
user_trip_type = input("Enter trip type (Short, Medium, Long): ")

# Getting the feature names from training
feature_names = list(X.columns)

# Getting top 3 recommendations
recommendations = recommend_trip(user_age, user_budget, user_trip_type, scaler, trained_model, feature_names, label_encoders, top_n=3)

# Printing the recommendations
print("\n✅ Recommended Destinations:")
for i, dest in enumerate(recommendations, 1):
    print(f"   {i}. {dest}")


Enter your age: 21
Enter your budget per day: 250
Enter trip type (Short, Medium, Long): Medium

✅ Recommended Destinations:
   1. Bangkok
   2. Greece
   3. Dubai
