In [35]:
# Step-by-step data cleaning pipeline (quality control) in pandas using sample dataset

In [24]:
import pandas as pd
import numpy as np
from dateutil import parser

# Sample messy dataset
data = {
    'Order ID': [1, 2, 2, 3, 4, None],
    'Store ': ['Lagos', 'Abuja', 'Abuja', 'Kano', None, 'Lagos'],
    'Item': ['Rice', 'Beans', 'Beans', 'Yam', 'Maize', 'Rice'],
    'Quantity': [2, '3', 3, 4, np.nan, 5],
    'Unit Price': ['1,200', '1500', '1500', '2000', '1,000', '1200'],
    'Order Date': ['2025/01/05', '05-02-2025', '05-02-2025', '2025-03-15', 'March 20, 2025', '2025/01/05']
}

df = pd.DataFrame(data)

print("Original Dataset:\n", df)

Original Dataset:
    Order ID Store    Item Quantity Unit Price      Order Date
0       1.0  Lagos   Rice        2      1,200      2025/01/05
1       2.0  Abuja  Beans        3       1500      05-02-2025
2       2.0  Abuja  Beans        3       1500      05-02-2025
3       3.0   Kano    Yam        4       2000      2025-03-15
4       4.0   None  Maize      NaN      1,000  March 20, 2025
5       NaN  Lagos   Rice        5       1200      2025/01/05


In [25]:
# Check missing (Step 1: Handle Missing Values)
print("\nMissing values:\n", df.isnull().sum())

print(df)

# Fill missing Store with "Unknown", Quantity with 0
df['Store '] = df['Store '].fillna("Unknown")
df['Quantity'] = df['Quantity'].fillna(0)
df['Order ID'] = df['Order ID'].fillna(-1)  # placeholder for missing ID

# Handle missing values (e.g., by dropping or imputing)
# Example: df.dropna() or df.fillna()


Missing values:
 Order ID      1
Store         1
Item          0
Quantity      1
Unit Price    0
Order Date    0
dtype: int64
   Order ID Store    Item Quantity Unit Price      Order Date
0       1.0  Lagos   Rice        2      1,200      2025/01/05
1       2.0  Abuja  Beans        3       1500      05-02-2025
2       2.0  Abuja  Beans        3       1500      05-02-2025
3       3.0   Kano    Yam        4       2000      2025-03-15
4       4.0   None  Maize      NaN      1,000  March 20, 2025
5       NaN  Lagos   Rice        5       1200      2025/01/05


In [26]:
print(df)

   Order ID   Store    Item Quantity Unit Price      Order Date
0       1.0    Lagos   Rice        2      1,200      2025/01/05
1       2.0    Abuja  Beans        3       1500      05-02-2025
2       2.0    Abuja  Beans        3       1500      05-02-2025
3       3.0     Kano    Yam        4       2000      2025-03-15
4       4.0  Unknown  Maize        0      1,000  March 20, 2025
5      -1.0    Lagos   Rice        5       1200      2025/01/05


In [27]:
# Check for duplicate rows (Step 2: Handle Duplicates)
print("Any duplicates?:", df.duplicated().any())

# If duplicate rows exist, we use df = df.drop_duplicates()
# Then use the following code lines to check if the duplicates have heen handled
# df = pd.DataFrame(data)

# print("📌 Original DataFrame with Duplicates:")
# print(df)

#df = df.drop_duplicates()

# print("\n✅ DataFrame after Removing Duplicates:")
# print(df)

Any duplicates?: False


In [28]:
df = df.drop_duplicates()
print(df)

   Order ID   Store    Item Quantity Unit Price      Order Date
0       1.0    Lagos   Rice        2      1,200      2025/01/05
1       2.0    Abuja  Beans        3       1500      05-02-2025
2       2.0    Abuja  Beans        3       1500      05-02-2025
3       3.0     Kano    Yam        4       2000      2025-03-15
4       4.0  Unknown  Maize        0      1,000  March 20, 2025
5      -1.0    Lagos   Rice        5       1200      2025/01/05


In [29]:
df.dtypes

Order ID      float64
Store          object
Item           object
Quantity       object
Unit Price     object
Order Date     object
dtype: object

In [30]:
# Step 3: Fix Wrong Data Types

# Convert Quantity to int
df['Quantity'] = df['Quantity'].astype(int)

# Remove commas and convert Unit Price to float
df['Unit Price'] = df['Unit Price'].astype(str).str.replace(",", "").astype(float)

# Standardize Order Date to datetime using format for MM/DD/YYYY
# 3a. Function to parse multiple formats
def parse_date(date_str):
    try:
        return parser.parse(date_str, dayfirst=False)
    except:
        return pd.NaT

# 3b. Apply the function
df["Order Date"] = df["Order Date"].apply(parse_date)

# 3c. Format consistently as MM/DD/YYYY
df["Order Date"] = df["Order Date"].dt.strftime("%m/%d/%Y")

print(df)

   Order ID   Store    Item  Quantity  Unit Price  Order Date
0       1.0    Lagos   Rice         2      1200.0  01/05/2025
1       2.0    Abuja  Beans         3      1500.0  05/02/2025
2       2.0    Abuja  Beans         3      1500.0  05/02/2025
3       3.0     Kano    Yam         4      2000.0  03/15/2025
4       4.0  Unknown  Maize         0      1000.0  03/20/2025
5      -1.0    Lagos   Rice         5      1200.0  01/05/2025


In [31]:
# Strip spaces, lowercase, replace spaces with underscores (Step 4: Standardize Column Names)
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
print(df)

   order_id    store   item  quantity  unit_price  order_date
0       1.0    Lagos   Rice         2      1200.0  01/05/2025
1       2.0    Abuja  Beans         3      1500.0  05/02/2025
2       2.0    Abuja  Beans         3      1500.0  05/02/2025
3       3.0     Kano    Yam         4      2000.0  03/15/2025
4       4.0  Unknown  Maize         0      1000.0  03/20/2025
5      -1.0    Lagos   Rice         5      1200.0  01/05/2025


In [32]:
# Create total_amount = quantity * unit_price (Step 5: Add Derived Column)
df['total_amount'] = df['quantity'] * df['unit_price']
print(df)

   order_id    store   item  quantity  unit_price  order_date  total_amount
0       1.0    Lagos   Rice         2      1200.0  01/05/2025        2400.0
1       2.0    Abuja  Beans         3      1500.0  05/02/2025        4500.0
2       2.0    Abuja  Beans         3      1500.0  05/02/2025        4500.0
3       3.0     Kano    Yam         4      2000.0  03/15/2025        8000.0
4       4.0  Unknown  Maize         0      1000.0  03/20/2025           0.0
5      -1.0    Lagos   Rice         5      1200.0  01/05/2025        6000.0


In [33]:
print("\nCleaned Dataset:\n", df) # Display Cleaned Dataset


Cleaned Dataset:
    order_id    store   item  quantity  unit_price  order_date  total_amount
0       1.0    Lagos   Rice         2      1200.0  01/05/2025        2400.0
1       2.0    Abuja  Beans         3      1500.0  05/02/2025        4500.0
2       2.0    Abuja  Beans         3      1500.0  05/02/2025        4500.0
3       3.0     Kano    Yam         4      2000.0  03/15/2025        8000.0
4       4.0  Unknown  Maize         0      1000.0  03/20/2025           0.0
5      -1.0    Lagos   Rice         5      1200.0  01/05/2025        6000.0


In [35]:
# Automated Function for Reuse
def data(df):
    # ✅ Step 1: Standardize column names at the very beginning
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
    
    # ✅ Step 2: Handle missing values
    df = df.fillna({"store": "Unknown", "quantity": 0, "order_id": -1})
    
    # ✅ Step 3: Remove duplicates
    df = df.drop_duplicates()
    
    # ✅ Step 4: Fix data types
    if "quantity" in df.columns:
        df["quantity"] = df["quantity"].astype(int)
    
    if "unit_price" in df.columns:
        df["unit_price"] = df["unit_price"].astype(str).str.replace(",", "").astype(float)
    
    if "order_date" in df.columns:
        df["order_date"] = pd.to_datetime(df["order_date"], errors="coerce")
    
    # ✅ Step 5: Derived column
    if {"quantity", "unit_price"}.issubset(df.columns):
        df["total_amount"] = df["quantity"] * df["unit_price"]
    
    return df
cleaned_df = data(df)
print("\nAutomated Cleaned Dataset:\n", cleaned_df)


Automated Cleaned Dataset:
    order_id    store   item  quantity  unit_price order_date  total_amount
0       1.0    Lagos   Rice         2      1200.0 2025-01-05        2400.0
1       2.0    Abuja  Beans         3      1500.0 2025-05-02        4500.0
3       3.0     Kano    Yam         4      2000.0 2025-03-15        8000.0
4       4.0  Unknown  Maize         0      1000.0 2025-03-20           0.0
5      -1.0    Lagos   Rice         5      1200.0 2025-01-05        6000.0


In [36]:
# Step-by-step data cleaning pipeline (quality control) in pandas using Real life dataset

In [37]:
import os
df = pd.read_csv('/home/jovyan/input_data/Superstore.csv', encoding='latin1')

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   object 
 17  Sales          9994 non-null   float64
 18  Quantity

In [39]:
# Check missing (Step 1: Handle Missing Values)
print("\nMissing values:\n", df.isnull().sum())


Missing values:
 Row ID           0
Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer ID      0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Product ID       0
Category         0
Sub-Category     0
Product Name     0
Sales            0
Quantity         0
Discount         0
Profit           0
dtype: int64


In [40]:
# Check for duplicate rows (Step 2: Handle Duplicates)
print("Any duplicates?:", df.duplicated().any())

Any duplicates?: False


In [41]:
# Step 3: Fix Wrong Data Types
# Standardize Order Date to datetime using format for MM/DD/YYYY
if "Order Date" in df.columns:
        df["Order Date"] = pd.to_datetime(df["Order Date"], errors="coerce")
if "Ship Date" in df.columns:
        df["Ship Date"] = pd.to_datetime(df["Ship Date"], errors="coerce")

df.head(3)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714


In [42]:
# Strip spaces, lowercase, replace spaces with underscores (Step 4: Standardize Column Names)
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df.columns = df.columns.str.replace("-", "_")
df.head(3)

Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,postal_code,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   row_id         9994 non-null   int64         
 1   order_id       9994 non-null   object        
 2   order_date     9994 non-null   datetime64[ns]
 3   ship_date      9994 non-null   datetime64[ns]
 4   ship_mode      9994 non-null   object        
 5   customer_id    9994 non-null   object        
 6   customer_name  9994 non-null   object        
 7   segment        9994 non-null   object        
 8   country        9994 non-null   object        
 9   city           9994 non-null   object        
 10  state          9994 non-null   object        
 11  postal_code    9994 non-null   int64         
 12  region         9994 non-null   object        
 13  product_id     9994 non-null   object        
 14  category       9994 non-null   object        
 15  sub_category   9994 n

In [44]:
# slicing the main dataframe (aim is to join tables):
order_details = df[['order_id', 'order_date', 'ship_date', 'ship_mode', 'customer_id', 'region']]
product_info = df[['order_id', 'product_id', 'category', 'sub_category', 'product_name']]
financials = df[['order_id', 'sales', 'quantity', 'discount', 'profit']]

In [45]:
order_details.head(3)

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id,region
0,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South
1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South
2,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,West


In [46]:
product_info.head(3)

Unnamed: 0,order_id,product_id,category,sub_category,product_name
0,CA-2016-152156,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase
1,CA-2016-152156,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,..."
2,CA-2016-138688,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...


In [47]:
financials.head(3)

Unnamed: 0,order_id,sales,quantity,discount,profit
0,CA-2016-152156,261.96,2,0.0,41.9136
1,CA-2016-152156,731.94,3,0.0,219.582
2,CA-2016-138688,14.62,2,0.0,6.8714


In [48]:
# Rejoining tables
#join order (called merge in pandas)

df_a = pd.merge(order_details, product_info, on = 'order_id', how = 'inner')
df_a.head(3)

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id,region,product_id,category,sub_category,product_name
0,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase
1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,..."
2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase


In [49]:
final_df = pd.merge(df_a, financials, on = 'order_id', how = 'inner')
final_df.head(3)

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
0,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,731.94,3,0.0,219.582
2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",261.96,2,0.0,41.9136


In [50]:
# Average deliver days [Aim to determine the percentage of products that arrived on time and late]
final_df['delivery_days'] = (final_df['ship_date'] - final_df['order_date']).dt.days

In [51]:
final_df.head()

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit,delivery_days
0,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,3
1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,731.94,3,0.0,219.582,3
2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",261.96,2,0.0,41.9136,3
3,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582,3
4,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,3


In [52]:
final_df['unit_price'] = final_df['sales'] / final_df['quantity']
final_df.head()

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit,delivery_days,unit_price
0,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,3,130.98
1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,731.94,3,0.0,219.582,3,243.98
2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",261.96,2,0.0,41.9136,3,130.98
3,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582,3,243.98
4,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,3,130.98


In [53]:
print("Any duplicates?:", df.duplicated().any())

Any duplicates?: False


Aggregation

In [54]:
# Sales by region
pd.set_option('display.float_format', '{:,.2f}'.format)
final_df.groupby('region')['sales'].sum()

region
Central    6,041,149.99
East      10,592,510.58
South      5,878,859.20
West       8,823,293.44
Name: sales, dtype: float64

In [54]:
final_df.groupby(['category'])['profit'].mean()

category
Furniture         20.91
Office Supplies   26.68
Technology        38.09
Name: profit, dtype: float64

In [55]:
final_df.groupby(['sub_category'])['profit'].mean()

sub_category
Accessories    25.69
Appliances     24.12
Art            19.18
Binders        19.58
Bookcases      14.57
Chairs         40.17
Copiers       216.89
Envelopes      38.22
Fasteners      19.39
Furnishings    13.88
Labels         27.11
Machines        1.11
Paper          32.53
Phones         40.24
Storage        37.70
Supplies       17.06
Tables          6.26
Name: profit, dtype: float64

In [62]:
Super_store_df = final_df.groupby(['category', 'sub_category'])['profit'].mean()

In [60]:
Super_store_df

category         sub_category
Furniture        Bookcases       14.57
                 Chairs          40.17
                 Furnishings     13.88
                 Tables           6.26
Office Supplies  Appliances      24.12
                 Art             19.18
                 Binders         19.58
                 Envelopes       38.22
                 Fasteners       19.39
                 Labels          27.11
                 Paper           32.53
                 Storage         37.70
                 Supplies        17.06
Technology       Accessories     25.69
                 Copiers        216.89
                 Machines         1.11
                 Phones          40.24
Name: profit, dtype: float64

In [72]:
# Automated Function for Reuse
Super_store_df = pd.read_csv('/home/jovyan/input_data/Superstore.csv', encoding='latin1')
def data(Super_store_df):
    # ✅ Step 1: Standardize column names at the very beginning
    Super_store_df.columns = Super_store_df.columns.str.strip().str.lower().str.replace(" ", "_")
    Super_store_df.columns = Super_store_df.columns.str.replace("-", "_")

    # ✅ Step 2: Checking for missing values
    print("\nMissing values:\n", df.isnull().sum())

    # ✅ Step 3: Checking for duplicates
    print("Any duplicates?:", df.duplicated().any())

    # ✅ Step 4: Fix data types
    if "Order Date" in df.columns:
        df["Order Date"] = pd.to_datetime(df["Order Date"], errors="coerce")
    if "Ship Date" in df.columns:
        df["Ship Date"] = pd.to_datetime(df["Ship Date"], errors="coerce")

    # ✅ Step 5: Derived column
    Super_store_df = Super_store_df.groupby(['category', 'sub_category'])['profit'].mean()
    return Super_store_df  # ✅ Return cleaned dataframe

# Apply function
cleaned_df = data(Super_store_df)
print("\nAutomated Cleaned Dataset:\n", cleaned_df)


Missing values:
 row_id           0
order_id         0
order_date       0
ship_date        0
ship_mode        0
customer_id      0
customer_name    0
segment          0
country          0
city             0
state            0
postal_code      0
region           0
product_id       0
category         0
sub_category     0
product_name     0
sales            0
quantity         0
discount         0
profit           0
dtype: int64
Any duplicates?: False

Automated Cleaned Dataset:
 category         sub_category
Furniture        Bookcases      -15.23
                 Chairs          43.10
                 Furnishings     13.65
                 Tables         -55.57
Office Supplies  Appliances      38.92
                 Art              8.20
                 Binders         19.84
                 Envelopes       27.42
                 Fasteners        4.38
                 Labels          15.24
                 Paper           24.86
                 Storage         25.15
                 Sup

In [85]:
# Loading the output file and saving as a JSON file
Super_store_df.to_json('/home/jovyan/output_data/Super_store_df.json')