In [1]:
import pandas as pd
import numpy as np
import math

In [9]:
# тут в класі додаткові функції, які я або свіснув або через чат
class AdditionalFunctions:
  # свіснув код з якогось сайтика
  @staticmethod
  def frange(start,end,step):
    if step == 0 : raise ValueError("frange() step argument must not be zero")
    if (end-start)/step <= 0: step = -step
    while abs(start) <= abs(end):
      yield start
      start += step
      
  # а це чатік написав
  @staticmethod
  def convert_to_float(s) -> float:
    if not isinstance(s, str):
        return math.nan  # Return NaN for non-string input
    
    parts = s.split('.')
    if len(parts) > 2:
        s = parts[0] + '.' + ''.join(parts[1:])  # Fix multiple dots
    
    try:
        return float(s)
    except ValueError:
        return math.nan  # Return NaN for invalid float conversion
      
  @staticmethod
  def fix_coordinates(coord):
    """
    Convert incorrectly formatted coordinates (with '.' as thousand separators) into floats.
    Validates input to handle NaN values and ensure it's a string.

    Example:
    "13.012.793" -> 13.012793
    "80.289.982" -> 80.289982
    """
    if pd.isna(coord) or not isinstance(coord, str):
        return None  # Return None for NaN values or non-string inputs

    parts = coord.split(".")
    if len(parts) > 2:
        return float(parts[0] + "." + "".join(parts[1:]))
    return float(coord)  # Handle cases where no correction is needed


In [6]:
# а цей клас шоб покроково чистить датасетик
class FoodTimeCleaner:
  def __init__(self):
    pass
  
  def clean(self, df: pd.DataFrame) -> pd.DataFrame:
    # списки окремих колонок які будем чистить
    id_cols = ['ID', 'Delivery_person_ID',]
    coord_cols = ['Restaurant_latitude', 'Restaurant_longitude', 'Delivery_location_latitude', 'Delivery_location_longitude']
    target_col = 'TARGET'
    
    # чистим по отих окремих колонках (окремі бо вони по свому були записані і треба було окремо розбираця)
    df = self.clean_ids(df, id_cols)
    df = self.clean_coords(df, coord_cols)
    df = self.clean_target(df, target_col)
    df = self.clean_delivery_person_rating(df, discerete_step=0.5)
    
    # а тут чистим категоріальні
    df, _ = self.clean_cat(df)
    
    return df
  
  @staticmethod
  def clean_coords(df: pd.DataFrame, coord_cols: list[str]) -> pd.DataFrame:    
    for coord_col in coord_cols:
      df[coord_col] = df[coord_col].apply(lambda row_value: AdditionalFunctions.fix_coordinates(row_value))
    return df
    
  @staticmethod
  def clean_ids(df: pd.DataFrame, id_cols: list[str]) -> pd.DataFrame:
    return df.drop(id_cols, axis=1)
  
  @staticmethod
  def clean_target(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    df[target_col] = df[target_col].apply(lambda x: AdditionalFunctions.convert_to_float(x))
    df = df.dropna(subset=[target_col])
    return df
  
  @staticmethod
  def clean_delivery_person_rating(df: pd.DataFrame, discerete_step: float = 0.5) -> pd.DataFrame:
    rating_min, rating_max, discerete_step = 1, 6, 0.5

    rating_map = {}
    for i in AdditionalFunctions.frange(rating_min, rating_max, discerete_step):
      rating_map[(i, i+discerete_step)] = []
      
    for i, value in df['Delivery_person_Ratings'].items():
      for value_range in rating_map.keys():
        lim_a, lim_b = value_range
        value = float(value)
        if value >= lim_a and value < lim_b:
          rating_map[value_range].append(value)
          
    for key, value in rating_map.items():
      rating_map[key] = len(value)
    
    rating_str_f = lambda lim_a, lim_b: f"Delivery-Person Rating [{lim_a}, {lim_b})"
    rating_discrete = [(1, 3.5), (3.5, 4.5), (4.5, 5.0), (5.0, 6.5)]

    for lim_a, lim_b in rating_discrete:
      df[rating_str_f(lim_a, lim_b)] = np.zeros(df.shape[0])

    # we use the old 'clean' ds to iterate through but make changes in the new one
    for i, value in df['Delivery_person_Ratings'].items():
      for value_range in rating_discrete:
        lim_a, lim_b = value_range
        value = float(value)
        if value >= lim_a and value < lim_b:
          df[rating_str_f(lim_a, lim_b)][i] = 1
          
    for col in [f"Delivery-Person Rating [{lim_a}, {lim_b})" for lim_a, lim_b in [(1, 3.5), (3.5, 4.5), (4.5, 5.0), (5.0, 6.5)]]:
      df[col] = df[col].astype(bool)
    return df.drop('Delivery_person_Ratings', axis=1)
  
  @staticmethod
  def clean_cat(df: pd.DataFrame) -> tuple[pd.DataFrame, list]:
    """
    Returns:
        tuple[pd.DataFrame, list]: a dataset and new cat_cols
    """
    # Traffic_Level
    traffic_level_map = {
      'Very High': 4,
      'High': 3,
      'Moderate': 2,
      'Low': 1,
      'Very Low': 0,
    }
    df['Traffic_Level'] = df['Traffic_Level'].map(traffic_level_map)
    
    # weather_description
    weather_order_visibility = {
        "clear sky": 11,
        "few clouds": 10,
        "scattered clouds": 9,
        "broken clouds": 8,
        "overcast clouds": 7,
        "haze": 6,
        "mist": 5,
        "fog": 4,
        "smoke": 3,
        "light rain": 2,
        "moderate rain": 1
    }
    df['Weather Visibility'] = df['weather_description'].map(weather_order_visibility)
    df = df.drop('weather_description', axis=1)
    
    df = pd.get_dummies(df, columns=['Type_of_order'], drop_first=True)
    df = pd.get_dummies(df, columns=['Type_of_vehicle'], drop_first=True)
    
    cat_cols = [
      'Traffic_Level',
      'Weather Visibility',
      'Type_of_order_Drinks',
      'Type_of_order_Meal',
      'Type_of_order_Snack',
      'Type_of_vehicle_electric_scooter',
      'Type_of_vehicle_motorcycle',
      'Type_of_vehicle_scooter',
    ]
    
    return df, cat_cols
  

In [12]:
ds = pd.read_csv('data/Food_Time new.csv')
ds.head()

Unnamed: 0,Traffic_Level,ID,Delivery_person_ID,weather_description,Type_of_order,Type_of_vehicle,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,temperature,humidity,precipitation,Distance (km),TARGET
0,High,70A2,CHENRES12DEL01,mist,Snack,scooter,32,4.6,12.972.793,80.249.982,13.012.793,80.289.982,26.55,87.0,0.0,9.89,43.45
1,High,95B4,RANCHIRES15DEL01,clear sky,Meal,scooter,33,4.7,23.369.746,8.533.982,23.479.746,8.544.982,17.51,69.0,0.0,19.11,3.816.666.667
2,High,CDCD,DEHRES17DEL01,clear sky,Snack,motorcycle,36,4.2,30.327.968,78.046.106,30.397.968,78.116.106,12.44,77.0,0.0,11.59,3.636.666.667
3,High,2784,PUNERES13DEL03,clear sky,Drinks,scooter,23,4.7,1.856.245,73.916.619,1.865.245,74.006.619,19.37,65.0,0.0,21.93,49.45
4,High,6F67,HYDRES14DEL01,overcast clouds,Snack,motorcycle,34,4.9,17.426.228,78.407.495,17.496.228,78.477.495,21.29,64.0,0.0,18.26,5.248.333.333


In [13]:
# тут може вилізти багато warning-ів але то не страшно
ds = FoodTimeCleaner().clean(ds)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[rating_str_f(lim_a, lim_b)][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[rating_str_f(lim_a, lim_b)][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[rating_str_f(lim_a, lim_b)][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[rating_str_f(lim_a, lim_b)][i] = 1
A value is tryin

In [14]:
ds.head()

Unnamed: 0,Traffic_Level,Delivery_person_Age,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,temperature,humidity,precipitation,Distance (km),...,"Delivery-Person Rating [3.5, 4.5)","Delivery-Person Rating [4.5, 5.0)","Delivery-Person Rating [5.0, 6.5)",Weather Visibility,Type_of_order_Drinks,Type_of_order_Meal,Type_of_order_Snack,Type_of_vehicle_electric_scooter,Type_of_vehicle_motorcycle,Type_of_vehicle_scooter
0,3,32,12.972793,80.249982,13.012793,80.289982,26.55,87.0,0.0,9.89,...,False,True,False,5,False,False,True,False,False,True
1,3,33,23.369746,8.533982,23.479746,8.544982,17.51,69.0,0.0,19.11,...,False,True,False,11,False,True,False,False,False,True
2,3,36,30.327968,78.046106,30.397968,78.116106,12.44,77.0,0.0,11.59,...,True,False,False,11,False,False,True,False,True,False
3,3,23,1.856245,73.916619,1.865245,74.006619,19.37,65.0,0.0,21.93,...,False,True,False,11,True,False,False,False,False,True
4,3,34,17.426228,78.407495,17.496228,78.477495,21.29,64.0,0.0,18.26,...,False,True,False,7,False,False,True,False,True,False
