In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error

In [2]:
# NOTE(review): only `ydata_profiling` is imported further down in this notebook,
# so this install of `pandas-profiling` looks unnecessary — confirm before removing.
!pip install pandas-profiling



In [3]:
pip install ydata-profiling

Collecting ydata-profiling
  Using cached ydata_profiling-4.16.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting scipy<1.16,>=1.4.1 (from ydata-profiling)
  Using cached scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Using cached visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Using cached multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.1 (from ydata-profiling)
  Using cached ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting dacite>=1.8 (from ydata-profiling)
  Using cached dacite-1.9.2-py3-none-any.whl.metadata (17 kB)
Collecting puremagic (from visions<0.8.2,>=0.7.5->visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Downloading puremagic-1.30-py3-none-any.whl.metadata (5.8 kB)
Downloading ydata_profiling-4.16.1-py2.py3-none-any.whl (400

In [4]:
# Load the trip-cost CSV into the working frame used by the later cells.
df = pd.read_csv(filepath_or_buffer='/content/trip_cost.csv')

In [5]:
from ydata_profiling import ProfileReport

# Build a profiling report for the raw frame and write it out as an HTML file.
prof = ProfileReport(
    df,
    title="Travel Cost Report",
)
prof.to_file(output_file="output.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/14 [00:00<?, ?it/s][A
  7%|▋         | 1/14 [00:00<00:11,  1.15it/s][A
 29%|██▊       | 4/14 [00:01<00:02,  4.16it/s][A
 43%|████▎     | 6/14 [00:01<00:01,  5.15it/s][A
 79%|███████▊  | 11/14 [00:01<00:00, 10.92it/s][A
100%|██████████| 14/14 [00:01<00:00,  7.88it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Columns the model is trained to predict; every other column of `df` is a feature.
target_cols = [
    'distance_km',
    'food_cost_per_day',
    'accommodation_cost_per_night',
    'activities_cost',
    'transport_cost_per_km',
    'total_cost',
]

# Targets first, then features = whatever remains once the targets are removed.
# ('total_cost' is already part of target_cols, so it needs no separate mention.)
y = df[target_cols]
X = df.drop(columns=target_cols)

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Alternative preprocessing draft (the model pipeline further down is built from
# `column_transformer`, not from this object).
#   * `season` -> integer codes; values unseen during fit are encoded as -1.
#   * the other categorical feature columns -> one-hot; unseen values become all zeros.
#   * remaining columns are passed through unchanged.
# The one-hot column list names the categorical columns the feature frame actually
# contains (start/end location, transport mode, accommodation type); a column that
# is absent from the frame would make fit() raise.
transformer = ColumnTransformer(transformers=[
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['season']),
    ('onehot', OneHotEncoder(handle_unknown='ignore'),
     ['start_location', 'end_location', 'transport_mode', 'accommodation_type']),
], remainder='passthrough', force_int_remainder_cols=False)


In [9]:
# Identify the categorical (object-dtype) columns of the raw frame.
categorical_columns = df.select_dtypes(include=['object']).columns

# Fill missing categorical values with the column's most frequent value.
# The filled Series is assigned back to the frame: `df[col].fillna(..., inplace=True)`
# is chained in-place assignment, which pandas 2.x warns about and which no longer
# updates `df` once Copy-on-Write is active.
# NOTE(review): X and y were extracted from `df` in an earlier cell, so values filled
# here are not reflected in them — confirm whether this cell should run before the split.
for col in categorical_columns:
    if df[col].isnull().any():
        modes = df[col].mode()
        if modes.empty:
            # Column is entirely null: there is no mode to fill with, leave it as is.
            continue
        df[col] = df[col].fillna(modes[0])

# One-hot encode every categorical column, dropping the first level of each.
data_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
data_encoded.head()

Unnamed: 0,distance_km,transport_cost_per_km,accommodation_cost_per_night,trip_days,trip_nights,food_cost_per_day,num_travelers,activities_cost,total_cost,start_location_Chennai,...,end_location_Udaipur,transport_mode_car,transport_mode_flight,transport_mode_train,accommodation_type_hostel,accommodation_type_hotel,season_off-peak,season_peak,season_summer,season_winter
0,1196,3.2,833,3,2,712,4,2689,20553,True,...,False,False,False,False,True,False,False,True,False,False
1,1826,0.75,998,8,7,202,4,2645,18834,True,...,True,False,False,True,True,False,False,False,False,False
2,692,1.98,719,9,8,352,1,1442,13102,False,...,True,False,False,False,True,False,False,False,False,False
3,2227,8.68,1666,9,8,549,3,1664,68476,False,...,False,False,True,False,False,True,False,False,False,False
4,1156,0.56,848,7,6,789,3,1627,24579,False,...,False,False,False,True,True,False,True,False,False,False


In [10]:
# Preprocessing used by the model pipeline:
#   * `season` is mapped to integer codes following the order of the list below;
#   * the other categorical columns are one-hot encoded with the first level dropped;
#   * every remaining column is passed through unchanged.
# NOTE(review): the season list holds both ' monsoon' (leading space) and 'monsoon'.
# Presumably one of them mirrors a raw spelling in the data — confirm, and prefer
# cleaning the source values over encoding a typo as its own level.
# handle_unknown='ignore' stops transform() from raising when a location / mode /
# accommodation value was not seen during fit (e.g. it fell only into the test split,
# or it is typed in by a user); such values are encoded as all zeros.
column_transformer = ColumnTransformer(transformers=[
    ('ordinal', OrdinalEncoder(categories=[['off-peak',' monsoon','peak','winter', 'monsoon', 'summer']]), ['season']),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'), ['start_location','end_location','transport_mode','accommodation_type'])
], remainder='passthrough')

In [11]:
# Draft transformer (the model pipeline is built from `column_transformer`, not `ct`).
# Every entry of `transformers` must be a (name, transformer, columns) tuple, and
# every listed column must exist in the frame being fitted; a bare `...` placeholder
# or a missing column makes fit() raise.
#   * `season` -> integer codes, unseen values encoded as -1.
#   * the other categorical feature columns -> one-hot, unseen values as all zeros.
#   * remainder='passthrough' keeps the numeric feature columns instead of dropping them.
ct = ColumnTransformer(transformers=[
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['season']),
    ('onehot', OneHotEncoder(handle_unknown='ignore'),
     ['start_location', 'end_location', 'transport_mode', 'accommodation_type']),
], remainder='passthrough')


In [12]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Full estimator: run the column preprocessing, then a linear regression wrapped
# in MultiOutputRegressor so that all target columns are predicted together.
model = Pipeline(
    [
        ('preprocessing', column_transformer),
        ('regression', MultiOutputRegressor(estimator=LinearRegression())),
    ]
)

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [14]:
# Show the held-out target values as the cell's output.
Y_test

Unnamed: 0,distance_km,food_cost_per_day,accommodation_cost_per_night,activities_cost,transport_cost_per_km,total_cost
595,1948,701,630,1093,3.07,17748
587,1309,407,832,2174,0.67,14247
543,2432,649,1870,1336,4.77,30224
644,742,450,1899,2182,1.62,25079
487,538,621,1755,2425,1.99,17283
...,...,...,...,...,...,...
351,2019,566,1320,1233,2.05,40447
79,580,543,2340,2014,3.85,24696
148,2075,207,1618,1559,3.21,32830
333,110,539,1612,988,3.03,9730


In [15]:
# Fit the preprocessing + regression pipeline on the training split.
model.fit(X_train,Y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [16]:
# Predict all target columns for the held-out feature rows.
Y_pred = model.predict(X_test)

In [17]:
# Label the raw prediction array with the target names for readable display.
pd.DataFrame(
    data=Y_pred,
    columns=target_cols,
)

Unnamed: 0,distance_km,food_cost_per_day,accommodation_cost_per_night,activities_cost,transport_cost_per_km,total_cost
0,1388.128716,617.445354,671.764571,1737.588738,3.964478,14448.739830
1,1253.192968,539.594300,1509.993769,1749.247168,1.424343,25031.653673
2,1219.327158,515.973303,1777.431990,1691.404587,4.148454,15061.343959
3,1229.869234,469.329923,1640.192995,1597.207098,1.419125,24816.934067
4,1236.049163,496.038514,1620.392804,1736.616874,2.683676,19507.017757
...,...,...,...,...,...,...
151,1299.264264,421.979568,1647.204236,2103.551861,1.331088,32565.136465
152,1248.809566,494.585528,1829.679963,1618.941428,4.022237,27865.988715
153,1298.137651,504.059160,1711.453758,1832.161703,3.876372,38064.446494
154,1244.607405,488.351497,1746.687148,1685.465314,3.737344,21200.347949


In [18]:
# Aggregate scores, averaged uniformly over all target columns.
print("R² Score:", r2_score(Y_test, Y_pred))
print("MSE:", mean_squared_error(Y_test, Y_pred))
print("MAE:", mean_absolute_error(Y_test, Y_pred))

# The targets live on very different scales (e.g. total_cost in the tens of
# thousands vs. transport_cost_per_km in single digits), so the averaged MSE/MAE
# above are dominated by the largest target. Report each target separately as well.
per_target_metrics = pd.DataFrame(
    {
        "r2": r2_score(Y_test, Y_pred, multioutput="raw_values"),
        "mse": mean_squared_error(Y_test, Y_pred, multioutput="raw_values"),
        "mae": mean_absolute_error(Y_test, Y_pred, multioutput="raw_values"),
    },
    index=Y_test.columns,
)
per_target_metrics


R² Score: 0.3078803985612432
MSE: 12313075.78160304
MAE: 1412.8330877977187


In [23]:
# Step 1: Collect the trip description interactively.
# Text answers are used as typed; the numeric answers must parse as integers
# (int() raises ValueError otherwise).
user_data = {
    'start_location': input("Enter start location: "),
    'end_location': input("Enter end location: "),
    'transport_mode': input("Enter transport mode (bus/train/flight): "),
    'accommodation_type': input("Enter accommodation type (hotel/homestay/etc.): "),
    'trip_days': int(input("Enter number of trip days: ")),
    'trip_nights': int(input("Enter number of trip nights: ")),
    'num_travelers': int(input("Enter number of travelers: ")),
    'season': input("Enter season (off-peak/monsoon/peak): "),
}

# Step 2: Wrap the answers in a one-row frame keyed by the feature column names.
# No null handling is needed here: input() always returns a string and int()
# either returns a number or raises, so the row cannot contain missing values.
user_df = pd.DataFrame([user_data])

# Step 3: Predict using your trained model
predicted_output = model.predict(user_df)

# Step 4: Display the output
# The model returns one value per training target, in training order; only the
# first five are shown (total_cost, the last one, is not printed).
# A separate local list is used so the notebook-level `target_cols`, which the
# earlier cells rely on when re-run, is not overwritten.
displayed_targets = ['distance_km','food_cost_per_day', 'accommodation_cost_per_night', 'activities_cost', 'transport_cost_per_km']
for col, value in zip(displayed_targets, predicted_output[0]):
    print(f"{col}: {round(value, 2)}")




Enter start location: Delhi
Enter end location: Shimla
Enter transport mode (bus/train/flight): flight
Enter accommodation type (hotel/homestay/etc.): hotel
Enter number of trip days: 4
Enter number of trip nights: 3
Enter number of travelers: 2
Enter season (off-peak/monsoon/peak): peak
distance_km: 1323.87
food_cost_per_day: 494.92
accommodation_cost_per_night: 2028.87
activities_cost: 1742.29
transport_cost_per_km: 7.21
