# Importing Libraries for Data Cleaning

In [98]:
# Importing libraries for data cleaning and manipulation  
import pandas as pd
import numpy as np

# Loading Datasets for Data Cleaning

In [99]:
# Load the five raw CSV files into DataFrames for cleaning.
# The paths are relative, so the files are resolved against the kernel's
# current working directory.
customers_df = pd.read_csv('customers.csv')
restaurants_df = pd.read_csv('restaurants.csv')
orders_df = pd.read_csv('orders.csv')
delivery_persons_df = pd.read_csv('delivery_persons.csv')
# NOTE(review): no cleaning step visible in this notebook modifies deliveries_df;
# it appears to be written back out unchanged by the save cell.
deliveries_df = pd.read_csv('deliveries.csv')

# Checking Customer Information for Data Cleaning

In [100]:
# Print the customers table's schema summary (columns, non-null counts, dtypes)
# so the columns that need cleaning can be identified.
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   customer_id        200 non-null    object 
 1   name               200 non-null    object 
 2   email              200 non-null    object 
 3   phone              200 non-null    int64  
 4   location           200 non-null    object 
 5   signup_date        200 non-null    object 
 6   is_premium         200 non-null    bool   
 7   preferred_cuisine  200 non-null    object 
 8   total_orders       200 non-null    int64  
 9   average_rating     200 non-null    float64
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 14.4+ KB


## Checking for Duplicate Values in the 'Name' Column of the Customers Table

In [101]:
# Tally how many rows carry each customer name.
name_counts = customers_df['name'].value_counts()

# Keep only the names shared by two or more rows.
duplicate_names = name_counts.loc[name_counts.gt(1)]

# Number of rows involved in any duplication (sum of the per-name counts).
total_duplicates = duplicate_names.sum()

# Report every repeated name next to the number of rows that use it.
print("▫️ Duplicate Names with Counts:")
for name, count in duplicate_names.items():
    print(f"{name}({count})")

print(f"\n🔹 Total Duplicate Entries: {total_duplicates}")


▫️ Duplicate Names with Counts:
Urishilla(3)
Faras(3)
Chakrika(3)
Lucky(3)
Samaksh(2)
Ekalinga(2)
Pallavi(2)
Wridesh(2)
Nicholas(2)
Hiral(2)
Anusha(2)
Adweta(2)
Robert(2)
Amrita(2)
Turvi(2)
Vaishnavi(2)
Tanmayi(2)
Finn(2)
Qadim(2)
Logan(2)
Chatresh(2)
Leena(2)
Dayamai(2)
Azad(2)

🔹 Total Duplicate Entries: 52


## Creating Unique Identifiers for Duplicate Names in the Customers Table

In [102]:
# Make repeated customer names unique by appending "@<k>", where <k> is the
# zero-based position of the row among the rows sharing that name
# (e.g. "Qadim" -> "Qadim@0", "Qadim@1"). Names that occur once are left as-is.
#
# Both the "is this name repeated?" flag and the per-name running counter are
# computed once for the whole column; the previous row-wise apply re-filtered
# the frame and recomputed duplicated() for every row. The whole right-hand
# side is evaluated on the original column before the assignment happens.
customers_df['name'] = customers_df['name'].mask(
    customers_df['name'].duplicated(keep=False),
    customers_df['name'] + "@" + customers_df.groupby('name').cumcount().astype(str),
)

# Printing the result
print(customers_df['name'])

0        Timothy
1         Gunbir
2        Qadim@0
3          Leela
4      Hemangini
         ...    
195        Manan
196       Upkaar
197    Alexander
198    Samaksh@1
199      Hiral@1
Name: name, Length: 200, dtype: object


## Checking and Verifying the Removal of Duplicate Names in the Customers Table

In [103]:
# Recount the names after tagging; anything still above one is a leftover duplicate.
name_counts = customers_df['name'].value_counts()
duplicate_names = name_counts.loc[name_counts.gt(1)]

if not duplicate_names.empty:
    # Show the total number of affected rows, then each offending name.
    print(f"🔹 Duplicate Names with Counts: {duplicate_names.sum()}")
    for name, count in duplicate_names.items():
        print(f"{name}({count})")
else:
    print("🔹 No duplicate names found. The correction was successful!")


🔹 No duplicate names found. The correction was successful!


## Checking for Duplicate Values in the 'Email' Column of the Customers Table

In [104]:
# Tally how many rows carry each e-mail address.
email_counts = customers_df['email'].value_counts()

# Keep only the addresses shared by two or more rows.
duplicate_emails = email_counts.loc[email_counts.gt(1)]

# Number of rows involved in any duplication (sum of the per-address counts).
total_duplicate_emails = duplicate_emails.sum()

# Report every repeated address next to the number of rows that use it.
print("\n🔹 Duplicate Emails with Counts:")
for email, count in duplicate_emails.items():
    print(f"{email} ({count})")

print(f"\n🔹 Total Duplicate Email Entries: {total_duplicate_emails}")



🔹 Duplicate Emails with Counts:
urishilla@gmail.com (3)
faras@gmail.com (3)
chakrika@gmail.com (3)
lucky@gmail.com (3)
samaksh@gmail.com (2)
ekalinga@gmail.com (2)
pallavi@gmail.com (2)
wridesh@gmail.com (2)
nicholas@gmail.com (2)
hiral@gmail.com (2)
anusha@gmail.com (2)
adweta@gmail.com (2)
robert@gmail.com (2)
amrita@gmail.com (2)
turvi@gmail.com (2)
vaishnavi@gmail.com (2)
tanmayi@gmail.com (2)
finn@gmail.com (2)
qadim@gmail.com (2)
logan@gmail.com (2)
chatresh@gmail.com (2)
leena@gmail.com (2)
dayamai@gmail.com (2)
azad@gmail.com (2)

🔹 Total Duplicate Email Entries: 52


## Creating Unique Identifiers for Duplicate Emails in the Customers Table

In [105]:
# Make repeated e-mail addresses unique. The first row holding an address keeps
# it unchanged; the k-th later row (k = 1, 2, ...) gets "_<k>" inserted before
# the last "@", e.g. "hiral@gmail.com" -> "hiral_1@gmail.com".
#
# Each address keeps its own domain (nothing is rewritten to a fixed
# "@gmail.com"), and first occurrences are never touched, so no "_0" clean-up
# pass is needed afterwards.
email_occurrence = customers_df.groupby('email').cumcount()
is_repeat_email = email_occurrence > 0

# Split only the rows that need a suffix into the text before the last "@" and
# the "@domain" tail; the tail is empty if an address has no "@" at all.
email_parts = customers_df.loc[is_repeat_email, 'email'].str.extract(
    r'^(?P<local>.*?)(?P<domain>@[^@]*)?$'
)
customers_df.loc[is_repeat_email, 'email'] = (
    email_parts['local']
    + "_"
    + email_occurrence[is_repeat_email].astype(str)
    + email_parts['domain'].fillna("")
)

# Print the final output
print(customers_df['email'])



0        timothy@gmail.com
1         gunbir@gmail.com
2          qadim@gmail.com
3          leela@gmail.com
4      hemangini@gmail.com
              ...         
195        manan@gmail.com
196       upkaar@gmail.com
197    alexander@gmail.com
198    samaksh_1@gmail.com
199      hiral_1@gmail.com
Name: email, Length: 200, dtype: object


## Checking and Verifying the Removal of Duplicate Emails in the Customers Table

In [106]:
# Recount the addresses after suffixing; anything still above one is a leftover duplicate.
email_counts = customers_df['email'].value_counts()
duplicate_emails = email_counts.loc[email_counts.gt(1)]

if not duplicate_emails.empty:
    # Show the total number of affected rows, then each offending address.
    print(f"🔹 Duplicate Emails with Counts: {duplicate_emails.sum()}")
    for email, count in duplicate_emails.items():
        print(f"{email}({count})")
else:
    print("🔹 No duplicate emails found. The correction was successful!")



🔹 No duplicate emails found. The correction was successful!


## Correcting the 'signup_date' Format in the Customers Table

In [107]:
# Parse signup_date and keep only the calendar-date part of each value.
# No explicit format is passed, so pandas infers it from the data.
# NOTE(review): `.dt.date` yields Python date objects, so the column is no longer
# a datetime64 column afterwards — confirm nothing downstream relies on `.dt`.
customers_df['signup_date'] = pd.to_datetime(customers_df['signup_date']).dt.date


## Checking for Duplicate Values in the 'Name' and 'Owner_Name' Columns of the Restaurants Table

In [108]:
# Tally restaurant names and keep those used by two or more rows.
restaurant_name_counts = restaurants_df['name'].value_counts()
duplicate_restaurant_names = restaurant_name_counts.loc[restaurant_name_counts.gt(1)]

# Same tally for owner names.
owner_name_counts = restaurants_df['owner_name'].value_counts()
duplicate_owner_names = owner_name_counts.loc[owner_name_counts.gt(1)]

# Report repeated restaurant names, followed by the number of rows involved.
print("▫️ Duplicate Restaurant Names with Counts:")
for name, count in duplicate_restaurant_names.items():
    print(f"{name}({count})")
print(f"\n🔹 Total Duplicate Restaurant Name Entries: {duplicate_restaurant_names.sum()}")

# Report repeated owner names, followed by the number of rows involved.
print("\n▫️ Duplicate Owner Names with Counts:")
for name, count in duplicate_owner_names.items():
    print(f"{name}({count})")
print(f"\n🔹 Total Duplicate Owner Name Entries: {duplicate_owner_names.sum()}")


▫️ Duplicate Restaurant Names with Counts:
Naan Tale(2)
Flavors Express(2)
Tandoori Bliss(2)
Kebab Kingdom(2)

🔹 Total Duplicate Restaurant Name Entries: 8

▫️ Duplicate Owner Names with Counts:

🔹 Total Duplicate Owner Name Entries: 0


## Creating Unique Identifiers for Duplicate Names in the Restaurants Table


In [109]:
# Make repeated restaurant names unique by appending "@<k>", where <k> is the
# zero-based position of the row among the rows sharing that name
# (e.g. "Naan Tale" -> "Naan Tale@0", "Naan Tale@1"). Unique names are left as-is.
#
# The duplicate flag and the per-name running counter are computed once for the
# whole column instead of being recomputed inside a lambda for every row. The
# right-hand side is fully evaluated on the original column before assignment.
restaurants_df['name'] = restaurants_df['name'].mask(
    restaurants_df['name'].duplicated(keep=False),
    restaurants_df['name'] + "@" + restaurants_df.groupby('name').cumcount().astype(str),
)

## Checking and Verifying the Removal of Duplicate Names in the Restaurants Table

In [110]:
# Recount restaurant names after tagging; any count above one is a leftover duplicate.
updated_name_counts = restaurants_df['name'].value_counts()
updated_duplicate_names = updated_name_counts.loc[updated_name_counts.gt(1)]

if not updated_duplicate_names.empty:
    print("\n🔹 Duplicate Restaurant Names Still Exist!")
else:
    print("\n🔹 No duplicate restaurant names found. The correction was successful!")



🔹 No duplicate restaurant names found. The correction was successful!


## Checking Orders Information for Data Cleaning

In [111]:
# Print the orders table's schema summary (columns, non-null counts, dtypes)
# so the columns that need cleaning can be identified.
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   order_id          400 non-null    object 
 1   customer_id       400 non-null    object 
 2   restaurant_id     400 non-null    object 
 3   order_date        400 non-null    object 
 4   delivery_time     400 non-null    object 
 5   status            400 non-null    object 
 6   total_amount      400 non-null    float64
 7   payment_mode      400 non-null    object 
 8   discount_applied  400 non-null    int64  
 9   feedback_rating   400 non-null    float64
dtypes: float64(2), int64(1), object(7)
memory usage: 31.4+ KB


## Correcting the 'order_date' Format in the Orders Table


In [112]:
# Parse order_date and keep only the calendar-date part of each value.
# No explicit format is passed, so pandas infers it from the data.
# NOTE(review): `.dt.date` yields Python date objects, so the column is no longer
# a datetime64 column afterwards — confirm nothing downstream relies on `.dt`.
orders_df['order_date'] = pd.to_datetime(orders_df['order_date']).dt.date

## Formatting 'delivery_time' Column in the Orders Table to AM/PM Format

In [113]:
def format_delivery_time(time_str):
    """Re-render a 12-hour clock string in the ``"%I:%M %p"`` layout.

    Parameters
    ----------
    time_str : str
        A time of day written as hour, minute and AM/PM marker,
        e.g. ``"01:38 PM"``.

    Returns
    -------
    str
        The same time formatted with ``"%I:%M %p"`` (zero-padded hour and
        minute followed by the AM/PM marker).
    """
    # The same pattern is used for parsing and for output, so a value already
    # in this layout round-trips unchanged.
    clock_format = "%I:%M %p"
    parsed_time = pd.to_datetime(time_str, format=clock_format)
    return parsed_time.strftime(clock_format)

# Normalise every delivery_time value, one element at a time, to the
# zero-padded "%I:%M %p" layout produced by format_delivery_time.
orders_df['delivery_time'] = orders_df['delivery_time'].map(format_delivery_time)


## Checking for Duplicate Values in the 'Name' Column of the Delivery_Persons Table

In [114]:
# Tally delivery-person names and keep those used by two or more rows.
name_counts = delivery_persons_df['name'].value_counts()
duplicate_names = name_counts.loc[name_counts.gt(1)]

# Report the total number of affected rows and each repeated name, if any.
if not duplicate_names.empty:
    print(f"🔹 Duplicate Names with Counts: {duplicate_names.sum()}")
    for name, count in duplicate_names.items():
        print(f"{name}({count})")
else:
    print("🔹 No duplicate names found.")


🔹 Duplicate Names with Counts: 8
Teerth(2)
Guneet(2)
Jatin(2)
Pranit(2)


## Creating Unique Identifiers for Duplicate Names in the Delivery_Persons Table


In [115]:
# Make repeated delivery-person names unique by appending "@<k>", where <k> is
# the zero-based position of the row among the rows sharing that name
# (e.g. "Teerth" -> "Teerth@0", "Teerth@1"). Unique names are left as-is.
#
# The duplicate flag and the per-name running counter are computed once for the
# whole column instead of being recomputed inside a lambda for every row. The
# right-hand side is fully evaluated on the original column before assignment.
delivery_persons_df['name'] = delivery_persons_df['name'].mask(
    delivery_persons_df['name'].duplicated(keep=False),
    delivery_persons_df['name'] + "@" + delivery_persons_df.groupby('name').cumcount().astype(str),
)

print(delivery_persons_df['name'])

0          Ayaan
1         Udarsh
2       Nachiket
3         Rushil
4       Teerth@0
5       Harrison
6            Jai
7         Atharv
8        Jatin@0
9       Pranit@0
10         Faras
11      Balendra
12        Samesh
13    Chakradhar
14         Daksh
15         Advay
16      Pranit@1
17      Guneet@0
18         Gagan
19      Guneet@1
20       Utkarsh
21         Gopal
22         Naksh
23        Charan
24          Liam
25     Chandresh
26      Mohammed
27        Rayaan
28       Jatin@1
29      Teerth@1
30      Baljiwan
31         Ikbal
32       Gaurang
33          Owen
34        Ekavir
35          Ojas
36         Mason
37       Gabriel
38        Aarnav
39         Harsh
Name: name, dtype: object


## Checking and Verifying the Removal of Duplicate Names in the Delivery_Persons Table


In [116]:
# Recount delivery-person names after tagging; any count above one is a leftover duplicate.
name_counts = delivery_persons_df['name'].value_counts()
duplicate_names = name_counts.loc[name_counts.gt(1)]

if not duplicate_names.empty:
    # Show the total number of affected rows, then each offending name.
    print(f"🔹 Duplicate Names with Counts: {duplicate_names.sum()}")
    for name, count in duplicate_names.items():
        print(f"{name}({count})")
else:
    print("🔹 No duplicate names found. The correction was successful!")


🔹 No duplicate names found. The correction was successful!


## Saving the Cleaned DataFrames to Updated CSV Files

In [117]:
# Write each table to its "updated_*" CSV, in the same order as before and
# without the DataFrame index column.
for frame, csv_path in (
    (customers_df, 'updated_customers.csv'),
    (restaurants_df, "updated_restaurants.csv"),
    (orders_df, "updated_orders.csv"),
    (delivery_persons_df, "updated_delivery_persons.csv"),
    (deliveries_df, "updated_deliveries.csv"),
):
    frame.to_csv(csv_path, index=False)


In [118]:

# Read the saved customers file back in to spot-check what was written.
updated_customers_df = pd.read_csv('updated_customers.csv')

# Display the rows at positions 25 through 49 (the end of an iloc slice is exclusive).
updated_customers_df.iloc[25:50]




Unnamed: 0,customer_id,name,email,phone,location,signup_date,is_premium,preferred_cuisine,total_orders,average_rating
25,C026,Tejas,tejas@gmail.com,7054040373,"22/468\nSibal Nagar, Panihati, ARUNACHAL PRADESH",2020-04-17,False,mexican,4,1.6
26,C027,Turvi@0,turvi@gmail.com,7068669338,"10/26\nSawhney Road, Kollam, HIMACHAL PRADESH",2020-03-14,False,mexican,3,1.8
27,C028,Urishilla@0,urishilla@gmail.com,7072728613,"273\nSaraf Nagar, Jamnagar, ASSAM",2021-11-01,False,indian,2,4.9
28,C029,Manthan,manthan@gmail.com,8010934293,"430\nMitra Marg, Tenali, WEST BENGAL",2022-10-24,False,indian,3,1.8
29,C030,Lucky@1,lucky_1@gmail.com,8014776573,"H.No. 48, Nazareth Marg, Bhiwandi, MAHARASHTRA",2022-07-11,False,italian,4,1.6
30,C031,Zarna,zarna@gmail.com,7056002785,"63, Bhargava Street, Baranagar, ARUNACHAL PRADESH",2023-12-31,True,italian,7,2.6
31,C032,Watika,watika@gmail.com,8076823775,"H.No. 96, Mannan, Dewas, UTTARAKHAND",2022-01-04,False,mexican,2,1.9
32,C033,Urishilla@1,urishilla_1@gmail.com,6033580732,"43\nKapur Nagar, Burhanpur, PUNJAB",2023-08-08,True,chinese,3,4.8
33,C034,Robert@0,robert@gmail.com,8053716113,"48\nSane Circle, Rajahmundry, MIZORAM",2024-12-10,False,italian,6,2.5
34,C035,Sara,sara@gmail.com,9033451049,"H.No. 54, Mane, South Dumdum, MAHARASHTRA",2020-12-01,False,chinese,10,4.8


In [119]:
# Read the saved restaurants file back in to spot-check what was written.
updated_restaurants_df = pd.read_csv("updated_restaurants.csv")

# Display the first ten rows (positions 0 through 9).
updated_restaurants_df.iloc[0:10]

Unnamed: 0,restaurant_id,name,cuisine_type,location,owner_name,average_delivery_time(min),contact_number,rating,total_orders,is_active
0,R001,Garam Palace,Thai,"Sikar, TRIPURA",Tanay Bhargava,35,9082703425,4.9,5,False
1,R002,Tandoor Place,Chinese,"Adoni, GOA",Wazir Kale,36,7031727188,2.3,6,False
2,R003,Mithai Corner,Mexican,"Imphal, GUJARAT",Yadavi Lal,23,7095149150,3.4,5,False
3,R004,Sizzler Royale,Continental,"Baranagar, ANDHRA PRADESH",Ekta Desai,35,7025638466,2.7,10,True
4,R005,Flavors Express@0,Chinese,"Asansol, NAGALAND",Gaurang Issac,35,7013689628,4.6,5,False
5,R006,Annapurna Kitchen,Thai,"Jhansi, PUNJAB",Saumya Bora,47,8070623985,2.2,7,True
6,R007,Flavors Tale,Continental,"Dehradun, PUNJAB",Janaki Bhargava,38,6063976605,2.4,10,False
7,R008,Dosa Co,Thai,"Khammam, BIHAR",Widisha Kapur,60,9044719028,1.5,10,True
8,R009,Sizzler Kitchen,Italian,"Anantapur, KARNATAKA",Chandresh Hegde,30,6097068652,2.6,6,True
9,R010,Flavors Spice,Indian,"Akola, GOA",Advik Bhatt,41,9094887630,4.0,7,True


In [120]:
# Read the saved orders file back in to spot-check what was written.
updated_orders_df = pd.read_csv('updated_orders.csv')

# Display the first ten rows (positions 0 through 9).
updated_orders_df.iloc[0:10]

Unnamed: 0,order_id,customer_id,restaurant_id,order_date,delivery_time,status,total_amount,payment_mode,discount_applied,feedback_rating
0,O001,C093,R070,2025-01-05,04:14 AM,Cancelled,856.52,Credit Card,71,4.1
1,O002,C193,R079,2025-01-01,08:35 AM,Delivered,750.89,UPI,54,2.9
2,O003,C139,R063,2025-01-03,01:38 PM,Pending,197.27,UPI,143,4.9
3,O004,C162,R048,2025-01-08,06:12 PM,Cancelled,635.17,Cash,91,2.7
4,O005,C113,R075,2025-01-13,08:02 PM,Cancelled,216.03,Cash,56,1.8
5,O006,C017,R008,2025-01-23,09:50 PM,Pending,399.84,Credit Card,97,3.1
6,O007,C139,R073,2025-01-29,01:58 AM,Pending,988.17,Credit Card,62,3.9
7,O008,C085,R043,2025-01-05,09:31 AM,Cancelled,446.94,Cash,145,3.7
8,O009,C077,R030,2025-02-01,02:34 PM,Delivered,648.4,UPI,87,3.4
9,O010,C033,R076,2025-01-20,04:20 PM,Delivered,786.38,Credit Card,139,1.0


In [121]:
# Read the saved delivery-persons file back in to spot-check what was written.
updated_delivery_persons_df = pd.read_csv('updated_delivery_persons.csv')

# Display the first ten rows (positions 0 through 9).
updated_delivery_persons_df.iloc[0:10]

Unnamed: 0,delivery_person_id,name,contact_number,vehicle_type,total_deliveries,average_rating,location
0,D001,Ayaan,8014794818,Bike,8,4.4,"Bilaspur, UTTARAKHAND"
1,D002,Udarsh,6063479590,Car,7,4.4,"Gulbarga, MIZORAM"
2,D003,Nachiket,8022813378,Bike,14,3.8,"Kakinada, PUNJAB"
3,D004,Rushil,7041895700,Car,6,4.4,"Naihati, MEGHALAYA"
4,D005,Teerth@0,6053687095,Car,6,3.5,"Bhalswa Jahangir Pur, ARUNACHAL PRADESH"
5,D006,Harrison,8063919269,Bike,9,1.6,"Kharagpur, MIZORAM"
6,D007,Jai,9093333241,Bike,13,3.9,"Muzaffarnagar, MIZORAM"
7,D008,Atharv,6086982044,Bike,10,1.4,"Gangtok, TAMIL NADU"
8,D009,Jatin@0,8043708210,Bike,11,4.5,"Jaipur, MIZORAM"
9,D010,Pranit@0,9022759401,Car,10,4.1,"Nellore, ODISHA"


In [122]:
# Read the saved deliveries file back in to spot-check what was written.
updated_deliveries_df = pd.read_csv('updated_deliveries.csv')

# Display the first ten rows (positions 0 through 9).
updated_deliveries_df.iloc[0:10]

Unnamed: 0,delivery_id,order_id,delivery_person_id,delivery_status,distance(km),delivery_time(min),estimated_time(min),delivery_fee,vehicle_type
0,DLY001,O059,D006,Delivered,4,50,47,134,Bike
1,DLY002,O044,D020,Delivered,3,38,53,105,Bike
2,DLY003,O128,D018,On the way,14,30,50,64,Car
3,DLY004,O040,D024,On the way,14,43,59,79,Bike
4,DLY005,O056,D019,On the way,2,41,37,130,Bike
5,DLY006,O291,D002,Delivered,5,49,36,72,Car
6,DLY007,O121,D022,Delivered,12,28,37,141,Car
7,DLY008,O102,D019,On the way,8,34,39,125,Bike
8,DLY009,O194,D025,Delivered,14,55,29,130,Bike
9,DLY010,O247,D040,Delivered,7,37,44,51,Car
