## Data Cleaning

### Importing necessary libraries

In [1]:
# Importing necessary libraries:
import os

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

### Importing the data

In [2]:
# Importing data:
# Paths are resolved relative to the notebook's working directory.
# Note: __name__ is a module name string, not a file path, so it cannot be
# used to locate the notebook; os.getcwd() states the intent directly.
script_dir_path = os.getcwd()
base_dir_path = os.path.abspath(os.path.join(script_dir_path, os.pardir))

data_path = os.path.abspath(os.path.join(base_dir_path, "data", "Amazon Sale Report.csv"))

# reading the data (the file is read as latin1):
data = pd.read_csv(data_path, encoding='latin1')
data.head()

Unnamed: 0,index,Order ID,Date,Status,Fulfilment,Sales Channel,ship-service-level,Category,Size,Courier Status,...,currency,Amount,ship-city,ship-state,ship-postal-code,ship-country,B2B,fulfilled-by,New,PendingS
0,0,405-8078784-5731545,04-30-22,Cancelled,Merchant,Amazon.in,Standard,T-shirt,S,On the Way,...,INR,647.62,MUMBAI,MAHARASHTRA,400081.0,IN,False,Easy Ship,,
1,1,171-9198151-1101146,04-30-22,Shipped - Delivered to Buyer,Merchant,Amazon.in,Standard,Shirt,3XL,Shipped,...,INR,406.0,BENGALURU,KARNATAKA,560085.0,IN,False,Easy Ship,,
2,2,404-0687676-7273146,04-30-22,Shipped,Amazon,Amazon.in,Expedited,Shirt,XL,Shipped,...,INR,329.0,NAVI MUMBAI,MAHARASHTRA,410210.0,IN,True,,,
3,3,403-9615377-8133951,04-30-22,Cancelled,Merchant,Amazon.in,Standard,Blazzer,L,On the Way,...,INR,753.33,PUDUCHERRY,PUDUCHERRY,605008.0,IN,False,Easy Ship,,
4,4,407-1069790-7240320,04-30-22,Shipped,Amazon,Amazon.in,Expedited,Trousers,3XL,Shipped,...,INR,574.0,CHENNAI,TAMIL NADU,600073.0,IN,False,,,


### Basic information about the data:

In [3]:
# shape:
print(f"The dataset contains {data.shape[0]} rows and {data.shape[1]} columns")

The dataset contains 128976 rows and 21 columns


In [4]:
# Columns:
data.columns

Index(['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel',
       'ship-service-level', 'Category', 'Size', 'Courier Status', 'Qty',
       'currency', 'Amount', 'ship-city', 'ship-state', 'ship-postal-code',
       'ship-country', 'B2B', 'fulfilled-by', 'New', 'PendingS'],
      dtype='object')

In [5]:
# datatype of each columns:
data.dtypes

index                   int64
Order ID               object
Date                   object
Status                 object
Fulfilment             object
Sales Channel          object
ship-service-level     object
Category               object
Size                   object
Courier Status         object
Qty                     int64
currency               object
Amount                float64
ship-city              object
ship-state             object
ship-postal-code      float64
ship-country           object
B2B                      bool
fulfilled-by           object
New                   float64
PendingS              float64
dtype: object

### Dropping irrelevant columns

In [6]:
# Drop the "index" and "Order ID" columns.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run after the columns are already gone.
data = data.drop(columns=["index", "Order ID"], errors="ignore")
data.dtypes

Date                   object
Status                 object
Fulfilment             object
Sales Channel          object
ship-service-level     object
Category               object
Size                   object
Courier Status         object
Qty                     int64
currency               object
Amount                float64
ship-city              object
ship-state             object
ship-postal-code      float64
ship-country           object
B2B                      bool
fulfilled-by           object
New                   float64
PendingS              float64
dtype: object

### Percentage of missing values:

In [7]:
def percentage_of_missing_data(df=None):
    """Return the percentage of missing values per column, rounded to 2 decimals.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``data`` frame is
        used, so existing zero-argument calls keep working.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is ``100 * missing_count / row_count``.
    """
    if df is None:
        df = data  # fall back to the notebook's global frame
    missing_counts = df.isna().sum()
    missing_pct = (missing_counts * 100) / df.shape[0]
    return missing_pct.round(2)

In [8]:
percentage_of_missing_data()

Date                    0.00
Status                  0.00
Fulfilment              0.00
Sales Channel           0.00
ship-service-level      0.00
Category                0.00
Size                    0.00
Courier Status          0.00
Qty                     0.00
currency                6.05
Amount                  6.05
ship-city               0.03
ship-state              0.03
ship-postal-code        0.03
ship-country            0.03
B2B                     0.00
fulfilled-by           69.56
New                   100.00
PendingS              100.00
dtype: float64

##### The "fulfilled-by", "New" and "PendingS" columns are mostly or entirely null (69.56%, 100% and 100% missing). They are not required for the analysis, so they can be removed.

In [9]:
# Drop the "fulfilled-by", "New" and "PendingS" columns.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run after the columns are already gone.
data = data.drop(columns=["fulfilled-by", "New", "PendingS"], errors="ignore")
data.dtypes

Date                   object
Status                 object
Fulfilment             object
Sales Channel          object
ship-service-level     object
Category               object
Size                   object
Courier Status         object
Qty                     int64
currency               object
Amount                float64
ship-city              object
ship-state             object
ship-postal-code      float64
ship-country           object
B2B                      bool
dtype: object

##### Here the "Status" and "Courier Status" columns convey the same information, so we only need to keep one of them for our analysis.

In [10]:
# Drop the "Status" column; "Courier Status" is kept.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run after the column is already gone.
data = data.drop(columns=["Status"], errors="ignore")
data.dtypes

Date                   object
Fulfilment             object
Sales Channel          object
ship-service-level     object
Category               object
Size                   object
Courier Status         object
Qty                     int64
currency               object
Amount                float64
ship-city              object
ship-state             object
ship-postal-code      float64
ship-country           object
B2B                      bool
dtype: object

In [11]:
data.head()

Unnamed: 0,Date,Fulfilment,Sales Channel,ship-service-level,Category,Size,Courier Status,Qty,currency,Amount,ship-city,ship-state,ship-postal-code,ship-country,B2B
0,04-30-22,Merchant,Amazon.in,Standard,T-shirt,S,On the Way,0,INR,647.62,MUMBAI,MAHARASHTRA,400081.0,IN,False
1,04-30-22,Merchant,Amazon.in,Standard,Shirt,3XL,Shipped,1,INR,406.0,BENGALURU,KARNATAKA,560085.0,IN,False
2,04-30-22,Amazon,Amazon.in,Expedited,Shirt,XL,Shipped,1,INR,329.0,NAVI MUMBAI,MAHARASHTRA,410210.0,IN,True
3,04-30-22,Merchant,Amazon.in,Standard,Blazzer,L,On the Way,0,INR,753.33,PUDUCHERRY,PUDUCHERRY,605008.0,IN,False
4,04-30-22,Amazon,Amazon.in,Expedited,Trousers,3XL,Shipped,1,INR,574.0,CHENNAI,TAMIL NADU,600073.0,IN,False


#### Let's analyse the "currency" and "ship-country" columns

In [12]:
# "currency" column:
data["currency"].value_counts()

currency
INR    121176
Name: count, dtype: int64

In [13]:
# "ship-country" column:
data["ship-country"].value_counts()

ship-country
IN    128941
Name: count, dtype: int64

* Every non-null row has "IN" as the country and "INR" as the currency, so the entire dataset is based on India and the Indian currency. We can therefore drop "currency" and "ship-country", along with "ship-postal-code".

In [14]:
# Drop the "currency", "ship-country" and "ship-postal-code" columns.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run after the columns are already gone.
data = data.drop(columns=["currency", "ship-country", "ship-postal-code"], errors="ignore")
data.dtypes

Date                   object
Fulfilment             object
Sales Channel          object
ship-service-level     object
Category               object
Size                   object
Courier Status         object
Qty                     int64
Amount                float64
ship-city              object
ship-state             object
B2B                      bool
dtype: object

In [15]:
data.head()

Unnamed: 0,Date,Fulfilment,Sales Channel,ship-service-level,Category,Size,Courier Status,Qty,Amount,ship-city,ship-state,B2B
0,04-30-22,Merchant,Amazon.in,Standard,T-shirt,S,On the Way,0,647.62,MUMBAI,MAHARASHTRA,False
1,04-30-22,Merchant,Amazon.in,Standard,Shirt,3XL,Shipped,1,406.0,BENGALURU,KARNATAKA,False
2,04-30-22,Amazon,Amazon.in,Expedited,Shirt,XL,Shipped,1,329.0,NAVI MUMBAI,MAHARASHTRA,True
3,04-30-22,Merchant,Amazon.in,Standard,Blazzer,L,On the Way,0,753.33,PUDUCHERRY,PUDUCHERRY,False
4,04-30-22,Amazon,Amazon.in,Expedited,Trousers,3XL,Shipped,1,574.0,CHENNAI,TAMIL NADU,False


### Let's see all the unique values present in the categorical columns

#### Fulfilment

In [16]:
data["Fulfilment"].unique()

array(['Merchant', 'Amazon'], dtype=object)

#### Sales Channel

In [17]:
data["Sales Channel"].unique()

array(['Amazon.in', 'Non-Amazon'], dtype=object)

#### ship-service-level

In [18]:
data["ship-service-level"].unique()

array(['Standard', 'Expedited'], dtype=object)

#### Category

In [19]:
data["Category"].unique()

array(['T-shirt', 'Shirt', 'Blazzer', 'Trousers', 'Perfume', 'Socks',
       'Shoes', 'Wallet', 'Watch'], dtype=object)

#### Size

In [21]:
data['Size'].unique()

array(['S', '3XL', 'XL', 'L', 'XXL', 'XS', '6XL', 'M', '4XL', 'Free',
       '5XL'], dtype=object)

#### Courier Status

In [22]:
data["Courier Status"].unique()

array(['On the Way', 'Shipped', 'Cancelled', 'Unshipped'], dtype=object)

#### Qty

In [24]:
data["Qty"].unique()

array([ 0,  1,  2, 15,  3,  9, 13,  5,  4,  8])

#### ship-city

In [31]:
data["ship-city"].unique()

array(['MUMBAI', 'BENGALURU', 'NAVI MUMBAI', ...,
       'GULABPURA, Distt BHILWARA', 'Prayagraj (ALLAHABAD)', 'Halol'],
      dtype=object)

* The city names are inconsistently formatted (mixed upper/lower case, and extra qualifiers such as a district or an alternate name). We need to standardise them.

#### ship-state

In [32]:
data["ship-state"].unique()

array(['MAHARASHTRA', 'KARNATAKA', 'PUDUCHERRY', 'TAMIL NADU',
       'UTTAR PRADESH', 'CHANDIGARH', 'TELANGANA', 'ANDHRA PRADESH',
       'RAJASTHAN', 'DELHI', 'HARYANA', 'ASSAM', 'JHARKHAND',
       'CHHATTISGARH', 'ODISHA', 'KERALA', 'MADHYA PRADESH',
       'WEST BENGAL', 'NAGALAND', 'Gujarat', 'UTTARAKHAND', 'BIHAR',
       'JAMMU & KASHMIR', 'PUNJAB', 'HIMACHAL PRADESH',
       'ARUNACHAL PRADESH', 'Goa', 'MEGHALAYA', 'GOA', 'MANIPUR',
       'TRIPURA', 'LADAKH', 'DADRA AND NAGAR', 'SIKKIM', 'Delhi', nan,
       'ANDAMAN & NICOBAR', 'Punjab', 'Rajshthan', 'Manipur', 'rajasthan',
       'Odisha', 'NL', 'Bihar', 'MIZORAM', 'punjab', 'New Delhi',
       'Rajasthan', 'Punjab/Mohali/Zirakpur', 'Puducherry', 'delhi', 'RJ',
       'Chandigarh', 'orissa', 'LAKSHADWEEP', 'goa', 'PB', 'APO',
       'Arunachal Pradesh', 'AR', 'Pondicherry', 'Sikkim',
       'Arunachal pradesh', 'Nagaland', 'bihar', 'Mizoram', 'rajsthan',
       'Orissa', 'Rajsthan', 'Meghalaya'], dtype=object)

* The state names are inconsistently formatted: mixed case (e.g. "GOA", "Goa", "goa"), misspellings (e.g. "Rajshthan") and abbreviations (e.g. "RJ", "PB"). We need to standardise them.

#### B2B

In [37]:
data["B2B"].unique()

array([False,  True])

## Clean Individual Columns

### Date column

In the "Date" column, some rows write the year as "22" instead of "2022". We need to convert them all to one standard format.

In [45]:
# Expand a trailing two-digit year "-22" to "-2022".
# The pattern is anchored to the end of the string so that a "22" in the month
# or day position is left alone: a plain str.replace("22", "2022") would turn
# "04-22-22" into "04-2022-2022". Values that already end in "-2022" do not
# match "-22$" and are kept unchanged.
# NOTE(review): assumes dates are "-"-separated with the year last, as in the
# rows shown by data.head() — confirm for the full column.
data["Date"] = data["Date"].str.replace(r"-22$", "-2022", regex=True)
data.head()

Unnamed: 0,Date,Fulfilment,Sales Channel,ship-service-level,Category,Size,Courier Status,Qty,Amount,ship-city,ship-state,B2B
0,04-30-2022,Merchant,Amazon.in,Standard,T-shirt,S,On the Way,0,647.62,MUMBAI,MAHARASHTRA,False
1,04-30-2022,Merchant,Amazon.in,Standard,Shirt,3XL,Shipped,1,406.0,BENGALURU,KARNATAKA,False
2,04-30-2022,Amazon,Amazon.in,Expedited,Shirt,XL,Shipped,1,329.0,NAVI MUMBAI,MAHARASHTRA,True
3,04-30-2022,Merchant,Amazon.in,Standard,Blazzer,L,On the Way,0,753.33,PUDUCHERRY,PUDUCHERRY,False
4,04-30-2022,Amazon,Amazon.in,Expedited,Trousers,3XL,Shipped,1,574.0,CHENNAI,TAMIL NADU,False


### B2B Column

Here B2B (business-to-business) is one type of business model; there are other models as well. Let's rename this column to "Business Model", which is more descriptive.

In [46]:
# rename the column as "Business Model"
data.rename(columns={"B2B":"Business Model"}, inplace=True)
data.columns

Index(['Date', 'Fulfilment', 'Sales Channel', 'ship-service-level', 'Category',
       'Size', 'Courier Status', 'Qty', 'Amount', 'ship-city', 'ship-state',
       'Business Model'],
      dtype='object')

In [47]:
data.head(2)

Unnamed: 0,Date,Fulfilment,Sales Channel,ship-service-level,Category,Size,Courier Status,Qty,Amount,ship-city,ship-state,Business Model
0,04-30-2022,Merchant,Amazon.in,Standard,T-shirt,S,On the Way,0,647.62,MUMBAI,MAHARASHTRA,False
1,04-30-2022,Merchant,Amazon.in,Standard,Shirt,3XL,Shipped,1,406.0,BENGALURU,KARNATAKA,False


Now let's replace the values in the "Business Model" column: {False: "Others", True: "B2B"}

In [48]:
# Replace the boolean flags with labels: False -> "Others", True -> "B2B".
# The result is assigned back to the column. Calling .replace(..., inplace=True)
# on data["Business Model"] acts on an intermediate object, which does not
# update `data` when pandas Copy-on-Write is enabled.
data["Business Model"] = data["Business Model"].replace({False: "Others", True: "B2B"})
data["Business Model"].unique()

array(['Others', 'B2B'], dtype=object)

### ship-state column

In [50]:
data["ship-state"].unique()

array(['MAHARASHTRA', 'KARNATAKA', 'PUDUCHERRY', 'TAMIL NADU',
       'UTTAR PRADESH', 'CHANDIGARH', 'TELANGANA', 'ANDHRA PRADESH',
       'RAJASTHAN', 'DELHI', 'HARYANA', 'ASSAM', 'JHARKHAND',
       'CHHATTISGARH', 'ODISHA', 'KERALA', 'MADHYA PRADESH',
       'WEST BENGAL', 'NAGALAND', 'Gujarat', 'UTTARAKHAND', 'BIHAR',
       'JAMMU & KASHMIR', 'PUNJAB', 'HIMACHAL PRADESH',
       'ARUNACHAL PRADESH', 'Goa', 'MEGHALAYA', 'GOA', 'MANIPUR',
       'TRIPURA', 'LADAKH', 'DADRA AND NAGAR', 'SIKKIM', 'Delhi', nan,
       'ANDAMAN & NICOBAR', 'Punjab', 'Rajshthan', 'Manipur', 'rajasthan',
       'Odisha', 'NL', 'Bihar', 'MIZORAM', 'punjab', 'New Delhi',
       'Rajasthan', 'Punjab/Mohali/Zirakpur', 'Puducherry', 'delhi', 'RJ',
       'Chandigarh', 'orissa', 'LAKSHADWEEP', 'goa', 'PB', 'APO',
       'Arunachal Pradesh', 'AR', 'Pondicherry', 'Sikkim',
       'Arunachal pradesh', 'Nagaland', 'bihar', 'Mizoram', 'rajsthan',
       'Orissa', 'Rajsthan', 'Meghalaya'], dtype=object)

In [57]:
# Known misspellings / abbreviations (after strip + title-casing) mapped to
# their canonical state name.
STATE_ALIASES = {
    "Orissa": "Odisha",
    "Rajshthan": "Rajasthan",
    "Rj": "Rajasthan",
    "Rajsthan": "Rajasthan",
    "Pb": "Punjab",
    "Pondicherry": "Puducherry",
    "Ar": "Arunachal Pradesh",
    "Nl": "Nagaland",
}


def _normalise_state(raw_state):
    """Return a cleaned state name for one raw "ship-state" value.

    Non-string values (e.g. NaN for missing states) become "unknown".
    Strings are stripped and title-cased; for values containing "/", only the
    part before the first "/" is kept; known aliases are mapped to their
    canonical name; anything else is returned title-cased.
    """
    # Explicit type check instead of a bare `except:`, so unrelated errors
    # are no longer silently swallowed.
    if not isinstance(raw_state, str):
        return "unknown"

    name = raw_state.strip().title()

    if "/" in name:
        # e.g. "Punjab/Mohali/Zirakpur" -> "Punjab"
        return name.split("/")[0]

    return STATE_ALIASES.get(name, name)


# Series.map passes missing values to the function too (na_action is left at
# its default), so missing states end up as "unknown".
# NOTE(review): "New Delhi" is left distinct from "Delhi" — confirm whether
# the two should be merged.
data["ship-state"] = data["ship-state"].map(_normalise_state)


In [59]:
data["ship-state"].unique()

array(['Maharashtra', 'Karnataka', 'Puducherry', 'Tamil Nadu',
       'Uttar Pradesh', 'Chandigarh', 'Telangana', 'Andhra Pradesh',
       'Rajasthan', 'Delhi', 'Haryana', 'Assam', 'Jharkhand',
       'Chhattisgarh', 'Odisha', 'Kerala', 'Madhya Pradesh',
       'West Bengal', 'Nagaland', 'Gujarat', 'Uttarakhand', 'Bihar',
       'Jammu & Kashmir', 'Punjab', 'Himachal Pradesh',
       'Arunachal Pradesh', 'Goa', 'Meghalaya', 'Manipur', 'Tripura',
       'Ladakh', 'Dadra And Nagar', 'Sikkim', 'unknown',
       'Andaman & Nicobar', 'Mizoram', 'New Delhi', 'Lakshadweep', 'Apo'],
      dtype=object)

There is no state named "Apo", so the rows with this value can be dropped.

In [65]:
# Drop the rows whose state is "Apo" and renumber the index.
# reset_index returns a new frame rather than modifying `data`, so its result
# must be assigned back; otherwise the index keeps gaps where rows were removed.
data = data[data["ship-state"] != "Apo"].reset_index(drop=True)
data.head()

Unnamed: 0,Date,Fulfilment,Sales Channel,ship-service-level,Category,Size,Courier Status,Qty,Amount,ship-city,ship-state,Business Model
0,04-30-2022,Merchant,Amazon.in,Standard,T-shirt,S,On the Way,0,647.62,MUMBAI,Maharashtra,Others
1,04-30-2022,Merchant,Amazon.in,Standard,Shirt,3XL,Shipped,1,406.0,BENGALURU,Karnataka,Others
2,04-30-2022,Amazon,Amazon.in,Expedited,Shirt,XL,Shipped,1,329.0,NAVI MUMBAI,Maharashtra,B2B
3,04-30-2022,Merchant,Amazon.in,Standard,Blazzer,L,On the Way,0,753.33,PUDUCHERRY,Puducherry,Others
4,04-30-2022,Amazon,Amazon.in,Expedited,Trousers,3XL,Shipped,1,574.0,CHENNAI,Tamil Nadu,Others


In [68]:
data["ship-state"].unique()

array(['Maharashtra', 'Karnataka', 'Puducherry', 'Tamil Nadu',
       'Uttar Pradesh', 'Chandigarh', 'Telangana', 'Andhra Pradesh',
       'Rajasthan', 'Delhi', 'Haryana', 'Assam', 'Jharkhand',
       'Chhattisgarh', 'Odisha', 'Kerala', 'Madhya Pradesh',
       'West Bengal', 'Nagaland', 'Gujarat', 'Uttarakhand', 'Bihar',
       'Jammu & Kashmir', 'Punjab', 'Himachal Pradesh',
       'Arunachal Pradesh', 'Goa', 'Meghalaya', 'Manipur', 'Tripura',
       'Ladakh', 'Dadra And Nagar', 'Sikkim', 'unknown',
       'Andaman & Nicobar', 'Mizoram', 'New Delhi', 'Lakshadweep'],
      dtype=object)