remembering normalization scaling and etc

In [25]:
import pandas as pd
import numpy as np
import math

import matplotlib.pylab as plt 

In [3]:
# тут в класі додаткові функції, які я або свіснув або через чат
class AdditionalFunctions:
  # свіснув код з якогось сайтика
  @staticmethod
  def frange(start,end,step):
    if step == 0 : raise ValueError("frange() step argument must not be zero")
    if (end-start)/step <= 0: step = -step
    while abs(start) <= abs(end):
      yield start
      start += step
      
  # а це чатік написав
  @staticmethod
  def convert_to_float(s) -> float:
    if not isinstance(s, str):
        return math.nan  # Return NaN for non-string input
    
    parts = s.split('.')
    if len(parts) > 2:
        s = parts[0] + '.' + ''.join(parts[1:])  # Fix multiple dots
    
    try:
        return float(s)
    except ValueError:
        return math.nan  # Return NaN for invalid float conversion
      
  @staticmethod
  def fix_coordinates(coord):
    """
    Convert incorrectly formatted coordinates (with '.' as thousand separators) into floats.
    Validates input to handle NaN values and ensure it's a string.

    Example:
    "13.012.793" -> 13.012793
    "80.289.982" -> 80.289982
    """
    if pd.isna(coord) or not isinstance(coord, str):
        return None  # Return None for NaN values or non-string inputs

    parts = coord.split(".")
    if len(parts) > 2:
        return float(parts[0] + "." + "".join(parts[1:]))
    return float(coord)  # Handle cases where no correction is needed


In [15]:
# а цей клас шоб покроково чистить датасетик
class FoodTimeCleaner:
  def __init__(self):
    pass
  
  def clean(self, df: pd.DataFrame) -> pd.DataFrame:
    # списки окремих колонок які будем чистить
    id_cols = ['ID', 'Delivery_person_ID',]
    coord_cols = ['Restaurant_latitude', 'Restaurant_longitude', 'Delivery_location_latitude', 'Delivery_location_longitude']
    target_col = 'TARGET'
    
    # чистим по отих окремих колонках (окремі бо вони по свому були записані і треба було окремо розбираця)
    df = self.clean_ids(df, id_cols)
    df = self.clean_coords(df, coord_cols)
    df = self.clean_target(df, target_col)
    df = self.clean_delivery_person_rating(df, discerete_step=0.5)
    
    # а тут чистим категоріальні
    df, _ = self.clean_cat(df)
    
        
    distance_median = df['Distance (km)'].mean()
    df['Distance (km)'] = df['Distance (km)'].fillna(distance_median)
    
    return df
  
  @staticmethod
  def clean_coords(df: pd.DataFrame, coord_cols: list[str]) -> pd.DataFrame:    
    for coord_col in coord_cols:
      df[coord_col] = df[coord_col].apply(lambda row_value: AdditionalFunctions.fix_coordinates(row_value))
    return df
    
  @staticmethod
  def clean_ids(df: pd.DataFrame, id_cols: list[str]) -> pd.DataFrame:
    return df.drop(id_cols, axis=1)
  
  @staticmethod
  def clean_target(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    df[target_col] = df[target_col].apply(lambda x: AdditionalFunctions.convert_to_float(x))
    df = df.dropna(subset=[target_col])
    return df
  
  @staticmethod
  def clean_delivery_person_rating(df: pd.DataFrame, discerete_step: float = 0.5) -> pd.DataFrame:
    rating_min, rating_max, discerete_step = 1, 6, 0.5

    rating_map = {}
    for i in AdditionalFunctions.frange(rating_min, rating_max, discerete_step):
      rating_map[(i, i+discerete_step)] = []
      
    for i, value in df['Delivery_person_Ratings'].items():
      for value_range in rating_map.keys():
        lim_a, lim_b = value_range
        value = float(value)
        if value >= lim_a and value < lim_b:
          rating_map[value_range].append(value)
          
    for key, value in rating_map.items():
      rating_map[key] = len(value)
    
    rating_str_f = lambda lim_a, lim_b: f"Delivery-Person Rating [{lim_a}, {lim_b})"
    rating_discrete = [(1, 3.5), (3.5, 4.5), (4.5, 5.0), (5.0, 6.5)]

    for lim_a, lim_b in rating_discrete:
      df[rating_str_f(lim_a, lim_b)] = np.zeros(df.shape[0])

    # we use the old 'clean' ds to iterate through but make changes in the new one
    for i, value in df['Delivery_person_Ratings'].items():
      for value_range in rating_discrete:
        lim_a, lim_b = value_range
        value = float(value)
        if value >= lim_a and value < lim_b:
          df[rating_str_f(lim_a, lim_b)][i] = 1
          
    for col in [f"Delivery-Person Rating [{lim_a}, {lim_b})" for lim_a, lim_b in [(1, 3.5), (3.5, 4.5), (4.5, 5.0), (5.0, 6.5)]]:
      df[col] = df[col].astype(bool)
    return df.drop('Delivery_person_Ratings', axis=1)
  
  @staticmethod
  def clean_cat(df: pd.DataFrame) -> tuple[pd.DataFrame, list]:
    """
    Returns:
        tuple[pd.DataFrame, list]: a dataset and new cat_cols
    """
    # Traffic_Level
    traffic_level_map = {
      'Very High': 4,
      'High': 3,
      'Moderate': 2,
      'Low': 1,
      'Very Low': 0,
    }
    df['Traffic_Level'] = df['Traffic_Level'].map(traffic_level_map)
    
    # weather_description
    weather_order_visibility = {
        "clear sky": 11,
        "few clouds": 10,
        "scattered clouds": 9,
        "broken clouds": 8,
        "overcast clouds": 7,
        "haze": 6,
        "mist": 5,
        "fog": 4,
        "smoke": 3,
        "light rain": 2,
        "moderate rain": 1
    }
    df['Weather Visibility'] = df['weather_description'].map(weather_order_visibility)
    df = df.drop('weather_description', axis=1)
    
    df = pd.get_dummies(df, columns=['Type_of_order'], drop_first=True)
    df = pd.get_dummies(df, columns=['Type_of_vehicle'], drop_first=True)
    
    cat_cols = [
      'Traffic_Level',
      'Weather Visibility',
      'Type_of_order_Drinks',
      'Type_of_order_Meal',
      'Type_of_order_Snack',
      'Type_of_vehicle_electric_scooter',
      'Type_of_vehicle_motorcycle',
      'Type_of_vehicle_scooter',
    ]
    
    return df, cat_cols
  

In [16]:
ds = pd.read_csv('data/Food_Time new.csv')
ds.head()

Unnamed: 0,Traffic_Level,ID,Delivery_person_ID,weather_description,Type_of_order,Type_of_vehicle,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,temperature,humidity,precipitation,Distance (km),TARGET
0,High,70A2,CHENRES12DEL01,mist,Snack,scooter,32,4.6,12.972.793,80.249.982,13.012.793,80.289.982,26.55,87.0,0.0,9.89,43.45
1,High,95B4,RANCHIRES15DEL01,clear sky,Meal,scooter,33,4.7,23.369.746,8.533.982,23.479.746,8.544.982,17.51,69.0,0.0,19.11,3.816.666.667
2,High,CDCD,DEHRES17DEL01,clear sky,Snack,motorcycle,36,4.2,30.327.968,78.046.106,30.397.968,78.116.106,12.44,77.0,0.0,11.59,3.636.666.667
3,High,2784,PUNERES13DEL03,clear sky,Drinks,scooter,23,4.7,1.856.245,73.916.619,1.865.245,74.006.619,19.37,65.0,0.0,21.93,49.45
4,High,6F67,HYDRES14DEL01,overcast clouds,Snack,motorcycle,34,4.9,17.426.228,78.407.495,17.496.228,78.477.495,21.29,64.0,0.0,18.26,5.248.333.333


In [17]:
# тут може вилізти багато warning-ів але то не страшно
ds = FoodTimeCleaner().clean(ds)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[rating_str_f(lim_a, lim_b)][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[rating_str_f(lim_a, lim_b)][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[rating_str_f(lim_a, lim_b)][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[rating_str_f(lim_a, lim_b)][i] = 1
A value is tryin

In [13]:
cat_cols = [
  'Traffic_Level',
  'Weather Visibility',
  'Type_of_order_Drinks',
  'Type_of_order_Meal',
  'Type_of_order_Snack',
  'Type_of_vehicle_electric_scooter',
  'Type_of_vehicle_motorcycle',
  'Type_of_vehicle_scooter',
]
ds[cat_cols]

Unnamed: 0,Traffic_Level,Weather Visibility,Type_of_order_Drinks,Type_of_order_Meal,Type_of_order_Snack,Type_of_vehicle_electric_scooter,Type_of_vehicle_motorcycle,Type_of_vehicle_scooter
0,3,5,False,False,True,False,False,True
1,3,11,False,True,False,False,False,True
2,3,11,False,False,True,False,True,False
3,3,11,True,False,False,False,False,True
4,3,7,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...
9995,0,6,False,False,False,False,True,False
9996,0,5,True,False,False,True,False,False
9997,0,5,True,False,False,False,False,True
9998,0,5,False,False,True,False,False,True


In [36]:
ds.columns

Index(['Traffic_Level', 'Delivery_person_Age', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'temperature', 'humidity',
       'precipitation', 'Distance (km)', 'TARGET',
       'Delivery-Person Rating [1, 3.5)', 'Delivery-Person Rating [3.5, 4.5)',
       'Delivery-Person Rating [4.5, 5.0)',
       'Delivery-Person Rating [5.0, 6.5)', 'Weather Visibility',
       'Type_of_order_Drinks', 'Type_of_order_Meal', 'Type_of_order_Snack',
       'Type_of_vehicle_electric_scooter', 'Type_of_vehicle_motorcycle',
       'Type_of_vehicle_scooter'],
      dtype='object')

In [None]:
coord_cols = [
  ('Restaurant', 'Restaurant_latitude', 'Restaurant_longitude'), 
  ('Delivery_location', 'Delivery_location_latitude', 'Delivery_location_longitude')
]
for name, lat, long in coord_cols:
  ds[f"{name}_x"] = (0.866 * ds[lat]) + (0.5 * ds[long])
  ds[f"{name}_y"] = (0.866 * ds[long]) + (0.5 * ds[lat])

In [None]:
ds = ds.drop(
  ['Restaurant_latitude', 'Restaurant_longitude', 'Delivery_location_latitude', 'Delivery_location_longitude'],
  axis=1
)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [116]:
from sklearn.model_selection import train_test_split

X = ds.drop('TARGET', axis = 1)
y = ds['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
#X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
#X_test_scaled = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns, index=X_test.index)

In [118]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [119]:
y_pred = lr_model.predict(X_test)

In [120]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
mse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}\nMSE: {mse}\nR2: {r2}")

MAE: 15.00928699874905
MSE: 17.530495354471363
R2: 0.10324181670014076


In [126]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

rf_model = RandomForestRegressor(
  
)

knn_model = KNeighborsRegressor(
  n_neighbors=20,
)

pipe = Pipeline([
  #("scale", StandardScaler()),
  ("model", rf_model)
])

In [127]:
pipe.fit(X_train, y_train)

In [128]:
y_pred = pipe.predict(X_test)

In [98]:
def print_metrics(y_test, y_pred) -> None:
  mae = mean_absolute_error(y_test, y_pred)
  mse = np.sqrt(mean_squared_error(y_test, y_pred))
  r2 = r2_score(y_test, y_pred)

  print(f"MAE: {mae}\nMSE: {mse}\nR2: {r2}")

In [129]:
print_metrics(y_test, y_pred)

MAE: 15.238390296623484
MSE: 18.758075621344545
R2: -0.026747223581520263


In [130]:
pipe.get_params()

{'memory': None,
 'steps': [('model', RandomForestRegressor())],
 'verbose': False,
 'model': RandomForestRegressor(),
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__criterion': 'squared_error',
 'model__max_depth': None,
 'model__max_features': 1.0,
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__monotonic_cst': None,
 'model__n_estimators': 100,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': None,
 'model__verbose': 0,
 'model__warm_start': False}

In [None]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
  estimator=pipe,
  param_grid={
    'model__n_estimators': [100, 250, 500],
    'model__max_depth': [5, 10, 50],
    'model__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    #'model__min_samples_split': [2, 4, 7],
    #'model__max_features': ['sqrt', 'log2', 1]
  },
  cv=3
)
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 