In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error
import numpy as np
import matplotlib.pyplot as plt

# Load the cleaned taxi-trip pricing data and preview the first rows.
csv_path = "data/cleaned_taxi_trip_pricing.csv"
df = pd.read_csv(csv_path)

df.head()


Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
2,30.33,Evening,Weekday,4.0,Low,Unknown,3.48,0.51,0.15,116.81,36.4698
3,25.83,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618
4,8.64,Afternoon,Weekend,2.0,Medium,Clear,2.55,1.71,0.48,89.33,60.2028


In [22]:
# Separate the target column (trip price) from the feature columns.
target_name = "Trip_Price"
y = df[target_name]
X = df.drop(columns=target_name)

# Quick check that the target is the expected column and dtype.
print("Namn:", y.name)
print("Dtyp:", y.dtype)

Namn: Trip_Price
Dtyp: float64


In [23]:
# Hold out 33 % of the rows for testing; the fixed seed keeps the split reproducible.
split = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split

X_train.shape, X_test.shape

((521, 10), (257, 10))

In [24]:
# 1) Split the feature columns by dtype
num_cols = X_train.select_dtypes(include="number").columns
cat_cols = X_train.select_dtypes(exclude="number").columns

# 2) Use the numeric columns as-is (no scaling is applied, since the model is a Random Forest)
X_train_num = X_train[num_cols].to_numpy()
X_test_num  = X_test[num_cols].to_numpy()

# 3) One-hot encode the categorical columns.
#    The encoder is fitted on the training split only and then reused on the
#    test split, so no information from the test data leaks into the encoding.
#    (OneHotEncoder is already imported at the top of the notebook.)
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_train_cat = encoder.fit_transform(X_train[cat_cols])
X_test_cat  = encoder.transform(X_test[cat_cols])

# 4) Stack numeric + one-hot columns side by side into a single feature matrix
#    (column order: all numeric columns first, then the one-hot columns).
X_train_prepared = np.hstack([X_train_num, X_train_cat])
X_test_prepared  = np.hstack([X_test_num, X_test_cat])


In [25]:
# Fit a Random Forest on the prepared training features.
# (RandomForestRegressor is already imported at the top of the notebook.)
# random_state is fixed so repeated runs give the same forest.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_prepared, y_train)

# Predictions on the held-out test split, evaluated in the next cell.
y_pred = model.predict(X_test_prepared)

In [26]:

# Test-set error metrics for the Random Forest predictions.
# (The metric functions are already imported at the top of the notebook.)
print("MSE :", mean_squared_error(y_test, y_pred))
print("MAE :", mean_absolute_error(y_test, y_pred))
print("R²  :", r2_score(y_test, y_pred))


MSE : 79.61651468441787
MAE : 5.618353732968777
R²  : 0.9512072305626842


##### Då R²-värdet är 0.951 behöver jag undersöka om modellen är överanpassad ('overfitted').

In [None]:
# Note: this code comes from ChatGPT and is used to test for overfitting,
# because I had trouble solving this particular part on my own.

def eval_model(y_true, y_pred, name="Set"):
    """Print MAE, RMSE and R² for one set of predictions.

    Parameters
    ----------
    y_true : true target values.
    y_pred : predicted target values, compared against ``y_true``.
    name : heading printed above the metrics (default ``"Set"``).

    Returns
    -------
    None. The metrics are printed with three decimals, followed by a
    30-character separator line.
    """
    # Labels are pre-padded so the colons line up in the printed report.
    report = (
        ("MAE ", mean_absolute_error(y_true, y_pred)),
        ("RMSE", np.sqrt(mean_squared_error(y_true, y_pred))),
        ("R²  ", r2_score(y_true, y_pred)),
    )
    print(f"{name}")
    for label, value in report:
        print(f"  {label}: {value:.3f}")
    print("-" * 30)

# Predict on both splits; a large gap between train and test scores
# would point to overfitting.
y_train_pred = model.predict(X_train_prepared)
y_test_pred  = model.predict(X_test_prepared)

for split_name, truth, predicted in (
    ("Train", y_train, y_train_pred),
    ("Test", y_test, y_test_pred),
):
    eval_model(truth, predicted, split_name)


Train
  MAE : 2.116
  RMSE: 3.691
  R²  : 0.992
------------------------------
Test
  MAE : 5.618
  RMSE: 8.923
  R²  : 0.951
------------------------------


### **Träna modellen på all data**



In [28]:
# 1. Split the columns of the full feature set by dtype
numeric_part = X.select_dtypes(include="number")
categorical_part = X.select_dtypes(exclude="number")
num_cols, cat_cols = numeric_part.columns, categorical_part.columns

In [29]:
# 2. One-hot encode the categorical columns, this time fitting on the full dataset
categorical_frame = X[cat_cols]
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_cat = encoder.fit_transform(categorical_frame)

In [31]:
# 3. Wrap the one-hot array in a DataFrame, with column names taken from the
#    encoder and the same row index as X so the two line up when combined.
oh_cols = encoder.get_feature_names_out(cat_cols)
X_cat_df = pd.DataFrame(data=X_cat, index=X.index, columns=oh_cols)

In [32]:
# 4. Put the numeric and one-hot encoded columns side by side
feature_parts = [X[num_cols], X_cat_df]
X_prepared = pd.concat(feature_parts, axis="columns")

X_prepared

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Afternoon,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,...,Day_of_Week_Weekday,Day_of_Week_Weekend,Traffic_Conditions_High,Traffic_Conditions_Low,Traffic_Conditions_Medium,Traffic_Conditions_Unknown,Weather_Clear,Weather_Rain,Weather_Snow,Weather_Unknown
0,19.35,3.0,3.56,0.80,0.32,53.82,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,36.87,1.0,2.70,1.21,0.15,37.27,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,30.33,4.0,3.48,0.51,0.15,116.81,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,25.83,3.0,2.93,0.63,0.32,22.64,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,8.64,2.0,2.55,1.71,0.48,89.33,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
773,40.17,3.0,3.81,0.66,0.42,62.66,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
774,14.34,1.0,3.23,1.01,0.29,45.07,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
775,18.69,3.0,4.90,1.79,0.17,79.41,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
776,5.49,4.0,2.39,0.62,0.49,58.39,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# 5. Train the model on the full dataset
#    (same hyperparameters and seed as the model evaluated on the train/test split)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_prepared, y)

In [34]:
# Sanity check: the model fitted on the full data exposes `n_features_in_` (expected: True).
hasattr(model, "n_features_in_")

True

In [36]:
from pathlib import Path

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
Path("models").mkdir(parents=True, exist_ok=True)

# Persist both artifacts: the encoder must be applied to the categorical
# columns before the saved model can be used for prediction.
joblib.dump(model, "models/taxi_price_regressor.joblib")
joblib.dump(encoder, "models/encoder.joblib")

['models/encoder.joblib']