In [50]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Car Rental Dataset Preprocessing

In [51]:
# Location of the raw car-rental sample, relative to this notebook.
data_path = '../data/raw/car_rental_sample.csv'
# Parse the CSV into the source frame that the following cells inspect.
raw_df = pd.read_csv(filepath_or_buffer=data_path)

## Data Summary

In [52]:
# Preview the first rows of the raw data (head() shows 5 rows by default).
raw_df.head()

Unnamed: 0,airport,airport_iata,country,city,rental_length,start_date,start_time,return_date,return_time,date_offset,...,condition,dropoff_time,efficiency,location,pickup_time,value_for_money,no_of_ratings,RunDate,setup_prams,tid
0,Heathrow Airport,LHR,GB,London,2,2023-07-25,10:00,2023-07-27,10:00,0,...,7.0,5.5,6.3,4.5,4.0,6.1,210,2023-07-23 05:05:17,device=Pixel 4&network_type=wifi&languagecode=...,4269268
1,Heathrow Airport,LHR,GB,London,2,2023-07-25,10:00,2023-07-27,10:00,0,...,7.9,8.6,6.9,5.8,5.6,6.6,4909,2023-07-23 05:05:17,device=Pixel 4&network_type=wifi&languagecode=...,4269269
2,Heathrow Airport,LHR,GB,London,2,2023-07-25,10:00,2023-07-27,10:00,0,...,7.9,8.6,6.9,5.8,5.6,6.6,4909,2023-07-23 05:05:17,device=Pixel 4&network_type=wifi&languagecode=...,4269270
3,Heathrow Airport,LHR,GB,London,2,2023-07-25,10:00,2023-07-27,10:00,0,...,7.9,8.6,6.9,5.8,5.6,6.6,4909,2023-07-23 05:05:17,device=Pixel 4&network_type=wifi&languagecode=...,4269271
4,Heathrow Airport,LHR,GB,London,2,2023-07-25,10:00,2023-07-27,10:00,0,...,7.9,8.6,6.9,5.8,5.6,6.6,4909,2023-07-23 05:05:17,device=Pixel 4&network_type=wifi&languagecode=...,4269272


In [53]:
# Print each column's name, non-null count and dtype for the raw frame.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2730 entries, 0 to 2729
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   airport                2730 non-null   object 
 1   airport_iata           2730 non-null   object 
 2   country                2730 non-null   object 
 3   city                   2730 non-null   object 
 4   rental_length          2730 non-null   int64  
 5   start_date             2730 non-null   object 
 6   start_time             2730 non-null   object 
 7   return_date            2730 non-null   object 
 8   return_time            2730 non-null   object 
 9   date_offset            2730 non-null   int64  
 10  deposit_price          2730 non-null   float64
 11  drive_away_price       2730 non-null   float64
 12  price                  2730 non-null   float64
 13  currency               2730 non-null   object 
 14  product_name           2730 non-null   object 
 15  prod

In [54]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (count/unique/top/freq) alongside the numeric statistics.
raw_df.describe(include='all')

Unnamed: 0,airport,airport_iata,country,city,rental_length,start_date,start_time,return_date,return_time,date_offset,...,condition,dropoff_time,efficiency,location,pickup_time,value_for_money,no_of_ratings,RunDate,setup_prams,tid
count,2730,2730,2730,2730,2730.0,2730,2730,2730,2730,2730.0,...,2730.0,2730.0,2730.0,2730.0,2730.0,2730.0,2730.0,2730,2730,2730.0
unique,14,14,11,13,,1,1,1,1,,...,,,,,,,,1,11,
top,Adolfo Suárez Madrid–Barajas Airport,MAD,ES,Madrid,,2023-07-25,10:00,2023-07-27,10:00,,...,,,,,,,,2023-07-23 05:05:17,device=Pixel 4&network_type=wifi&languagecode=...,
freq,392,392,734,392,,2730,2730,2730,2730,,...,,,,,,,,2730,734,
mean,,,,,2.0,,,,,0.0,...,8.24022,8.827326,7.938718,7.761978,6.897363,7.30978,1949.138095,,,4270692.0
std,,,,,0.0,,,,,0.0,...,0.989743,1.019272,1.117402,1.164938,1.15766,0.93887,2017.464056,,,858.583
min,,,,,2.0,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,4269268.0
25%,,,,,2.0,,,,,0.0,...,8.0,8.7,7.4,7.3,6.5,7.0,761.0,,,4269950.0
50%,,,,,2.0,,,,,0.0,...,8.4,9.0,8.1,8.1,7.1,7.4,1423.0,,,4270632.0
75%,,,,,2.0,,,,,0.0,...,8.7,9.3,8.8,8.4,7.7,7.9,2233.0,,,4271447.0


In [55]:
# Tally the null entries per column of the raw frame
# (isna() is the same check as isnull()).
missing_values = raw_df.isna().sum()
print("Missing or null values in each column:\n", missing_values)

Missing or null values in each column:
 airport                     0
airport_iata                0
country                     0
city                        0
rental_length               0
start_date                  0
start_time                  0
return_date                 0
return_time                 0
date_offset                 0
deposit_price               0
drive_away_price            0
price                       0
currency                    0
product_name                0
product_id                  0
airbags                     0
aircon                      0
free_cancellation           0
doors                       0
group                       0
seats                       0
fuel_type                2730
transmission                0
mileage                     0
supplier_name               0
supplier_address            0
supplier_loction_type       0
average                     0
average_text              103
cleanliness                 0
condition                   0


#### Dropping Missing Values
The `fuel_type` column is missing in all 2730 rows of the data set, so it carries no information and we drop it. The only other column with missing values is `average_text` (103 rows); it is kept because its missing values can be derived.

In [56]:
# Build the cleaned frame without the entirely-empty 'fuel_type' column;
# drop() returns a new frame, so raw_df itself is left unchanged.
cleaned_df = raw_df.drop(labels='fuel_type', axis='columns')
# Recount the nulls per column to confirm the result.
missing_values = cleaned_df.isna().sum()
print("Data after dropping 'fuel_type' column:\n", missing_values)

Data after dropping 'fuel_type' column:
 airport                    0
airport_iata               0
country                    0
city                       0
rental_length              0
start_date                 0
start_time                 0
return_date                0
return_time                0
date_offset                0
deposit_price              0
drive_away_price           0
price                      0
currency                   0
product_name               0
product_id                 0
airbags                    0
aircon                     0
free_cancellation          0
doors                      0
group                      0
seats                      0
transmission               0
mileage                    0
supplier_name              0
supplier_address           0
supplier_loction_type      0
average                    0
average_text             103
cleanliness                0
condition                  0
dropoff_time               0
efficiency                 0
lo

### Non-null columns

In [60]:
# Re-check the per-column null counts, this time on the cleaned frame.
missing_values = cleaned_df.isna().sum()
print("Missing or null values in each column:\n", missing_values)

Missing or null values in each column:
 airport                    0
airport_iata               0
country                    0
city                       0
rental_length              0
start_date                 0
start_time                 0
return_date                0
return_time                0
date_offset                0
deposit_price              0
drive_away_price           0
price                      0
currency                   0
product_name               0
product_id                 0
airbags                    0
aircon                     0
free_cancellation          0
doors                      0
group                      0
seats                      0
transmission               0
mileage                    0
supplier_name              0
supplier_address           0
supplier_loction_type      0
average                    0
average_text             103
cleanliness                0
condition                  0
dropoff_time               0
efficiency                 0
loc

## Data Type Conversions

In [72]:
# Work on an independent copy: plain assignment would only create a second
# name for the same DataFrame, so any column later assigned on processed_df
# would silently change cleaned_df as well.
processed_df = cleaned_df.copy()
print("Columns in cleaned_df:", processed_df.columns)


Columns in cleaned_df: Index(['airport', 'airport_iata', 'country', 'city', 'rental_length',
       'start_date', 'start_time', 'return_date', 'return_time', 'date_offset',
       'deposit_price', 'drive_away_price', 'price', 'currency',
       'product_name', 'product_id', 'airbags', 'aircon', 'free_cancellation',
       'doors', 'group', 'seats', 'transmission', 'mileage', 'supplier_name',
       'supplier_address', 'supplier_loction_type', 'average', 'average_text',
       'cleanliness', 'condition', 'dropoff_time', 'efficiency', 'location',
       'pickup_time', 'value_for_money', 'no_of_ratings', 'RunDate',
       'setup_prams', 'tid'],
      dtype='object')


In [None]:
# Columns that hold calendar dates stored as text.
date_columns = ['start_date', 'return_date']

# Parse each date column; errors='coerce' turns unparseable values into NaT
# instead of raising. assign() returns a new frame, so re-running this cell
# is safe and no other variable referring to the previous frame is modified.
processed_df = processed_df.assign(
    **{col: pd.to_datetime(processed_df[col], errors='coerce') for col in date_columns}
)

# Report how many values are missing (NaT) after parsing. The label is built
# from the actual column name so it cannot drift from the column being checked.
for col in date_columns:
    print(f"Number of missing values in '{col}':", processed_df[col].isna().sum())

print("Data after converting date columns and handling missing values:\n")
processed_df.head()


Number of NaT values in 'rental_date': 0
Number of NaT values in 'return_date': 0
Data after converting date columns and handling NaT values:



Unnamed: 0,airport,airport_iata,country,city,rental_length,start_date,start_time,return_date,return_time,date_offset,...,condition,dropoff_time,efficiency,location,pickup_time,value_for_money,no_of_ratings,RunDate,setup_prams,tid
0,Heathrow Airport,LHR,GB,London,2,2023-07-25,10:00,2023-07-27,10:00,0,...,7.0,5.5,6.3,4.5,4.0,6.1,210,2023-07-23 05:05:17,device=Pixel 4&network_type=wifi&languagecode=...,4269268
1,Heathrow Airport,LHR,GB,London,2,2023-07-25,10:00,2023-07-27,10:00,0,...,7.9,8.6,6.9,5.8,5.6,6.6,4909,2023-07-23 05:05:17,device=Pixel 4&network_type=wifi&languagecode=...,4269269
2,Heathrow Airport,LHR,GB,London,2,2023-07-25,10:00,2023-07-27,10:00,0,...,7.9,8.6,6.9,5.8,5.6,6.6,4909,2023-07-23 05:05:17,device=Pixel 4&network_type=wifi&languagecode=...,4269270
3,Heathrow Airport,LHR,GB,London,2,2023-07-25,10:00,2023-07-27,10:00,0,...,7.9,8.6,6.9,5.8,5.6,6.6,4909,2023-07-23 05:05:17,device=Pixel 4&network_type=wifi&languagecode=...,4269271
4,Heathrow Airport,LHR,GB,London,2,2023-07-25,10:00,2023-07-27,10:00,0,...,7.9,8.6,6.9,5.8,5.6,6.6,4909,2023-07-23 05:05:17,device=Pixel 4&network_type=wifi&languagecode=...,4269272


## Saving the Cleaned Dataset

In [None]:
cleaned_file_path = '../data/cleaned/car_rental_sample.csv'
# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
Path(cleaned_file_path).parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the row index out of the written file.
processed_df.to_csv(cleaned_file_path, index=False)

print("\nData Preprocessing Complete. Cleaned data saved to:", cleaned_file_path)


Data Preprocessing Complete. Cleaned data saved to: ../data/cleaned/enterprise_station.csv
