Purpose: Basic overview of the dataset — structure, data types, missing values, duplicate order IDs, and value counts for key columns.

In [1]:
# 1. Basic Import and Settings

import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns

# Never truncate wide frames: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

In [2]:
# Record the pandas version this notebook was run with, for reproducibility.
pd.__version__


'2.3.3'

In [3]:
# 2. Load the raw food-orders CSV into a DataFrame

file_path = '../data_raw/food_orders_raw.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows; as the cell's last expression it is rendered as a table.
df.head(n=5)

Unnamed: 0,Order_ID,Customer_ID,Customer_Age,Customer_Gender,City,Area,Restaurant_ID,Restaurant_Name,Cuisine_Type,Order_Date,Order_Time,Delivery_Time_Min,Distance_km,Order_Value,Discount_Applied,Final_Amount,Payment_Mode,Order_Status,Cancellation_Reason,Delivery_Partner_ID,Delivery_Rating,Restaurant_Rating,Order_Day,Peak_Hour,Profit_Margin
0,ORD000001,CUST6948,19.0,Male,,Central,RES936,Restaurant_29,Chinese,10/20/2024,0:00,187.0,15.75,,,,UPI,Delivered,,DP563,5.0,4.4,Weekend,True,0.13
1,ORD000002,CUST6515,,Female,Chennai,North,RES689,Restaurant_419,Chinese,8/12/2024,0:00,20.0,29.5,4869.0,20.0,4849.0,COD,Delivered,,DP369,5.0,4.7,Weekday,True,0.48
2,ORD000003,CUST1765,,Male,Delhi,,RES723,Restaurant_244,Arabian,12/8/2024,0:00,207.0,,757.0,20.0,737.0,Wallet,Delivered,,DP580,4.0,4.9,Weekend,True,0.08
3,ORD000004,CUST2744,,Male,Mumbai,Central,RES951,Restaurant_178,Chinese,10/8/2024,0:00,143.0,15.68,,,,UPI,Cancelled,Late Delivery,DP155,2.0,3.4,Weekday,,0.04
4,ORD000005,CUST4389,57.0,Female,Chennai,South,RES419,Restaurant_262,Chinese,2/4/2024,0:00,51.0,9.6,372.0,20.0,352.0,Card,Delivered,,DP728,2.0,4.4,Weekend,False,0.12


In [4]:
# 3. Shape, Columns and Data Types

# df.shape is a (row_count, column_count) tuple.
print(f"Rows, Columns: {df.shape}")

Rows, Columns: (100000, 25)


In [5]:
# List every column label as a plain Python list.
print(f"\nColumns:\n {df.columns.tolist()}")



Columns:
 ['Order_ID', 'Customer_ID', 'Customer_Age', 'Customer_Gender', 'City', 'Area', 'Restaurant_ID', 'Restaurant_Name', 'Cuisine_Type', 'Order_Date', 'Order_Time', 'Delivery_Time_Min', 'Distance_km', 'Order_Value', 'Discount_Applied', 'Final_Amount', 'Payment_Mode', 'Order_Status', 'Cancellation_Reason', 'Delivery_Partner_ID', 'Delivery_Rating', 'Restaurant_Rating', 'Order_Day', 'Peak_Hour', 'Profit_Margin']


In [6]:
# Structural summary: index range, per-column non-null counts, and dtypes.
# df.info() prints its report itself, so it is called bare rather than wrapped in print().
print("\nInfo:")
df.info()


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 25 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Order_ID             100000 non-null  object 
 1   Customer_ID          100000 non-null  object 
 2   Customer_Age         49907 non-null   float64
 3   Customer_Gender      75144 non-null   object 
 4   City                 83274 non-null   object 
 5   Area                 83315 non-null   object 
 6   Restaurant_ID        100000 non-null  object 
 7   Restaurant_Name      100000 non-null  object 
 8   Cuisine_Type         83115 non-null   object 
 9   Order_Date           98986 non-null   object 
 10  Order_Time           98002 non-null   object 
 11  Delivery_Time_Min    66641 non-null   float64
 12  Distance_km          66530 non-null   float64
 13  Order_Value          66673 non-null   float64
 14  Discount_Applied     83285 non-null   float64
 15  Final_Amoun

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Left as the cell's last expression so the notebook renders the result as a table.
print("\nDescribe (numeric):")
df.describe()


Describe (numeric):


Unnamed: 0,Customer_Age,Delivery_Time_Min,Distance_km,Order_Value,Discount_Applied,Final_Amount,Delivery_Rating,Restaurant_Rating,Profit_Margin
count,49907.0,66641.0,66530.0,66673.0,83285.0,44303.0,83477.0,100000.0,100000.0
mean,38.976516,127.475923,16.449242,2081.830126,93.936243,1961.10119,2.991531,4.24968,0.150362
std,12.372157,90.805839,12.256742,1553.628891,108.209904,1557.354417,1.414108,0.722554,0.201888
min,18.0,20.0,1.0,150.0,0.0,-150.0,1.0,3.0,-0.2
25%,28.0,45.0,5.47,673.0,20.0,559.0,2.0,3.6,-0.02
50%,39.0,120.0,9.97,1197.0,50.0,1156.0,3.0,4.2,0.15
75%,50.0,210.0,27.43,3494.0,100.0,3375.0,4.0,4.9,0.32
max,60.0,300.0,40.0,5000.0,300.0,4980.0,5.0,5.5,0.5


In [8]:
# 4. Missing values & duplicates

# Per-column count of null entries.
# sep="\n" puts the Series on its own line; the default sep=" " would insert a
# stray space after a trailing "\n" and shift the first row out of alignment.
print("\nMissing values:", df.isnull().sum(), sep="\n")


Missing values:
 Order_ID                   0
Customer_ID                0
Customer_Age           50093
Customer_Gender        24856
City                   16726
Area                   16685
Restaurant_ID              0
Restaurant_Name            0
Cuisine_Type           16885
Order_Date              1014
Order_Time              1998
Delivery_Time_Min      33359
Distance_km            33470
Order_Value            33327
Discount_Applied       16715
Final_Amount           55697
Payment_Mode           19911
Order_Status               0
Cancellation_Reason    90969
Delivery_Partner_ID        0
Delivery_Rating        16523
Restaurant_Rating          0
Order_Day                  0
Peak_Hour              32962
Profit_Margin              0
dtype: int64


In [9]:
# Count rows whose Order_ID repeats an earlier row's Order_ID (first occurrence is not counted).
print("\nDuplicate Order_ID count:", df.duplicated(subset=['Order_ID']).sum())


Duplicate Order_ID count: 0


In [10]:
# 5. Basic value_counts for key columns

# Frequency of each order status, most common first.
print(df.loc[:, 'Order_Status'].value_counts())

Order_Status
Delivered    84964
Cancelled    15036
Name: count, dtype: int64


In [11]:
# Ten most frequent cities (missing values are not counted).
print(df.loc[:, 'City'].value_counts().iloc[:10])


City
Hyderabad    16884
Bangalore    16732
Delhi        16695
Mumbai       16493
Chennai      16470
Name: count, dtype: int64


In [12]:
# Ten most frequent cuisine types (missing values are not counted).
print(df.loc[:, 'Cuisine_Type'].value_counts().iloc[:10])


Cuisine_Type
Indian     16685
Arabian    16658
Chinese    16651
Mexican    16602
Italian    16519
Name: count, dtype: int64


In [13]:
# Frequency of each payment mode, most common first (missing values are not counted).
print(df.loc[:, 'Payment_Mode'].value_counts())


Payment_Mode
Card      20094
Wallet    20086
COD       19977
UPI       19932
Name: count, dtype: int64
