In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Mount Google Drive into the Colab filesystem so files stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [152]:
# Read the retail transactions CSV from the mounted Drive folder.
RETAIL_CSV_PATH = '/content/drive/MyDrive/Colab_Notebooks/retail_data.csv'
retail_data = pd.read_csv(RETAIL_CSV_PATH)

In [153]:
# Preview the first rows of the loaded frame.
# NOTE(review): the preview shows Name / Email / Phone / Address values; if these
# are real customers, clear this output before sharing the notebook.
retail_data.head()

Unnamed: 0,Transaction_ID,Customer_ID,Name,Email,Phone,Address,City,State,Zipcode,Country,...,Total_Amount,Product_Category,Product_Brand,Product_Type,Feedback,Shipping_Method,Payment_Method,Order_Status,Ratings,products
0,8691788.0,37249.0,Michelle Harrington,Ebony39@gmail.com,1414787000.0,3959 Amanda Burgs,Dortmund,Berlin,77985.0,Germany,...,324.08627,Clothing,Nike,Shorts,Excellent,Same-Day,Debit Card,Shipped,5.0,Cycling shorts
1,2174773.0,69749.0,Kelsey Hill,Mark36@gmail.com,6852900000.0,82072 Dawn Centers,Nottingham,England,99071.0,UK,...,806.707815,Electronics,Samsung,Tablet,Excellent,Standard,Credit Card,Processing,4.0,Lenovo Tab
2,6679610.0,30192.0,Scott Jensen,Shane85@gmail.com,8362160000.0,4133 Young Canyon,Geelong,New South Wales,75929.0,Australia,...,1063.432799,Books,Penguin Books,Children's,Average,Same-Day,Credit Card,Processing,2.0,Sports equipment
3,7232460.0,62101.0,Joseph Miller,Mary34@gmail.com,2776752000.0,8148 Thomas Creek Suite 100,Edmonton,Ontario,88420.0,Canada,...,2466.854021,Home Decor,Home Depot,Tools,Excellent,Standard,PayPal,Processing,4.0,Utility knife
4,4983775.0,27901.0,Debra Coleman,Charles30@gmail.com,9098268000.0,5813 Lori Ports Suite 269,Bristol,England,48704.0,UK,...,248.553049,Grocery,Nestle,Chocolate,Bad,Standard,Cash,Shipped,1.0,Chocolate cookies


In [154]:
# List the column labels of the loaded frame.
retail_data.columns

Index(['Transaction_ID', 'Customer_ID', 'Name', 'Email', 'Phone', 'Address',
       'City', 'State', 'Zipcode', 'Country', 'Age', 'Gender', 'Income',
       'Customer_Segment', 'Date', 'Year', 'Month', 'Time', 'Total_Purchases',
       'Amount', 'Total_Amount', 'Product_Category', 'Product_Brand',
       'Product_Type', 'Feedback', 'Shipping_Method', 'Payment_Method',
       'Order_Status', 'Ratings', 'products'],
      dtype='object')

In [155]:
# Column dtypes and non-null counts for retail_data.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() only appends a stray "None" line.
retail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302010 entries, 0 to 302009
Data columns (total 30 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Transaction_ID    301677 non-null  float64
 1   Customer_ID       301702 non-null  float64
 2   Name              301628 non-null  object 
 3   Email             301663 non-null  object 
 4   Phone             301648 non-null  float64
 5   Address           301695 non-null  object 
 6   City              301762 non-null  object 
 7   State             301729 non-null  object 
 8   Zipcode           301670 non-null  float64
 9   Country           301739 non-null  object 
 10  Age               301837 non-null  float64
 11  Gender            301693 non-null  object 
 12  Income            301720 non-null  object 
 13  Customer_Segment  301795 non-null  object 
 14  Date              301651 non-null  object 
 15  Year              301660 non-null  float64
 16  Month             30

In [156]:
# Number of missing entries in each column.
retail_data.isna().sum()


Transaction_ID      333
Customer_ID         308
Name                382
Email               347
Phone               362
Address             315
City                248
State               281
Zipcode             340
Country             271
Age                 173
Gender              317
Income              290
Customer_Segment    215
Date                359
Year                350
Month               273
Time                350
Total_Purchases     361
Amount              357
Total_Amount        350
Product_Category    283
Product_Brand       281
Product_Type          0
Feedback            184
Shipping_Method     337
Payment_Method      297
Order_Status        235
Ratings             184
products              0
dtype: int64

In [130]:
# Grand total of missing cells across the whole frame.
# NOTE(review): the saved output below (2278) carries execution count 130 and does
# not equal the sum of the per-column counts shown by the previous cell, so it
# looks stale. Re-run after a kernel restart to confirm.
retail_data.isna().sum().sum()


2278

#### Handling missing values

In [157]:
shipping_methods = retail_data['Shipping_Method']
row_count = len(shipping_methods)

# Share of rows per shipping method. The denominator is the full row count,
# so rows with a missing method are included in it.
standard, same_day, express = (
    shipping_methods.eq(label).sum() / row_count
    for label in ('Standard', 'Same-Day', 'Express')
)
missing = shipping_methods.isna().sum() / row_count

for caption, share in (
    ("Standard:", standard),
    ("Same-Day:", same_day),
    ("Express:", express),
    ("Missing values:", missing),
):
    print(caption, share)

Standard: 0.31509221548955335
Same-Day: 0.34487599748352704
Express: 0.3389159299360948
Missing values: 0.0011158570908248071


In [158]:
# Fill missing shipping methods with 'Same-Day', the most frequent value in the
# distribution printed by the previous cell.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on retail_data['Shipping_Method']: the in-place call acts
# on an intermediate Series, which pandas warns about and which leaves the frame
# unchanged when Copy-on-Write is enabled.
retail_data['Shipping_Method'] = retail_data['Shipping_Method'].fillna('Same-Day')

In [159]:
# Generate replacement 7-digit IDs for rows whose Transaction_ID is missing.
def generate_unique_transaction_id(existing_ids, n, rng=None):
    """Return ``n`` new 7-digit integer IDs that do not collide with ``existing_ids``.

    Parameters
    ----------
    existing_ids : pandas.Series
        IDs already in use; missing entries are ignored.
    n : int
        Number of new IDs to produce.
    rng : numpy.random.Generator, optional
        Source of randomness. Pass ``np.random.default_rng(seed)`` to get
        reproducible IDs; when omitted the global ``np.random`` state is used.

    Returns
    -------
    list of int
        ``n`` distinct values in ``[1000000, 9999999]``, none present in
        ``existing_ids``.

    Raises
    ------
    ValueError
        If fewer than ``n`` unused 7-digit values remain (the sampling loop
        could otherwise never terminate).
    """
    low, high = 1_000_000, 10_000_000  # half-open [low, high): includes 9999999
    unique_ids = set(existing_ids.dropna())

    # Count the 7-digit integers that are already taken so exhaustion is
    # detected up front instead of looping forever.
    in_use = sum(
        1 for value in unique_ids if low <= value < high and value == int(value)
    )
    available = (high - low) - in_use
    if n > available:
        raise ValueError(
            f"requested {n} new IDs but only {available} unused 7-digit values remain"
        )

    draw = np.random.randint if rng is None else rng.integers
    new_ids = []
    while len(new_ids) < n:
        # int() keeps the returned IDs plain Python ints regardless of the sampler.
        new_id = int(draw(low, high, dtype=np.int64))
        if new_id not in unique_ids:
            unique_ids.add(new_id)
            new_ids.append(new_id)
    return new_ids

# Replace every missing Transaction_ID with a freshly generated unique 7-digit ID.
transaction_id_missing = retail_data["Transaction_ID"].isna()
missing_transaction_ids = transaction_id_missing.sum()
unique_ids = generate_unique_transaction_id(retail_data["Transaction_ID"], missing_transaction_ids)
retail_data.loc[transaction_id_missing, 'Transaction_ID'] = unique_ids
# Remaining missing Transaction_ID count.
retail_data["Transaction_ID"].isna().sum()

0

In [160]:
# Generate replacement 5-digit IDs for rows whose Customer_ID is missing.

def generate_unique_customer_id(existing_ids, n, rng=None):
    """Return ``n`` new 5-digit integer IDs that do not collide with ``existing_ids``.

    Parameters
    ----------
    existing_ids : pandas.Series
        IDs already in use; missing entries are ignored.
    n : int
        Number of new IDs to produce.
    rng : numpy.random.Generator, optional
        Source of randomness. Pass ``np.random.default_rng(seed)`` to get
        reproducible IDs; when omitted the global ``np.random`` state is used.

    Returns
    -------
    list of int
        ``n`` distinct values in ``[10000, 99999]``, none present in
        ``existing_ids``.

    Raises
    ------
    ValueError
        If fewer than ``n`` unused 5-digit values remain. The 5-digit space
        holds only 90,000 values, so without this check the sampling loop
        could spin forever once they are all taken.
    """
    low, high = 10_000, 100_000  # half-open [low, high): includes 99999
    unique_ids = set(existing_ids.dropna())

    # Count the 5-digit integers that are already taken so exhaustion is
    # detected up front instead of looping forever.
    in_use = sum(
        1 for value in unique_ids if low <= value < high and value == int(value)
    )
    available = (high - low) - in_use
    if n > available:
        raise ValueError(
            f"requested {n} new IDs but only {available} unused 5-digit values remain"
        )

    draw = np.random.randint if rng is None else rng.integers
    new_ids = []
    while len(new_ids) < n:
        # int() keeps the returned IDs plain Python ints regardless of the sampler.
        new_id = int(draw(low, high, dtype=np.int64))
        if new_id not in unique_ids:
            unique_ids.add(new_id)
            new_ids.append(new_id)
    return new_ids

# Replace every missing Customer_ID with a freshly generated unused ID.
customer_id_missing = retail_data["Customer_ID"].isna()
missing_customer_ids = customer_id_missing.sum()
unique_ids = generate_unique_customer_id(retail_data["Customer_ID"], missing_customer_ids)
retail_data.loc[customer_id_missing, 'Customer_ID'] = unique_ids
# Remaining missing Customer_ID count.
retail_data["Customer_ID"].isna().sum()

0

In [161]:
def generate_unique_phone_numbers(existing_ids, n, rng=None):
    """Return ``n`` new 10-digit integers that do not collide with ``existing_ids``.

    Parameters
    ----------
    existing_ids : pandas.Series
        Phone numbers already in use (numeric); missing entries are ignored.
    n : int
        Number of new phone numbers to produce.
    rng : numpy.random.Generator, optional
        Source of randomness. Pass ``np.random.default_rng(seed)`` to get
        reproducible numbers; when omitted the global ``np.random`` state is used.

    Returns
    -------
    list of int
        ``n`` distinct values in ``[1000000000, 9999999999]``, none present in
        ``existing_ids``.

    Raises
    ------
    ValueError
        If fewer than ``n`` unused 10-digit values remain.
    """
    low, high = 1_000_000_000, 10_000_000_000  # half-open [low, high)
    # 10-digit values do not fit in 32 bits, so the 64-bit dtype is requested
    # explicitly instead of relying on the platform's default integer width.
    unique_ids = set(existing_ids.dropna().astype(np.int64))

    in_use = sum(1 for value in unique_ids if low <= value < high)
    available = (high - low) - in_use
    if n > available:
        raise ValueError(
            f"requested {n} new numbers but only {available} unused 10-digit values remain"
        )

    draw = np.random.randint if rng is None else rng.integers
    new_ids = []
    while len(new_ids) < n:
        # int() keeps the returned numbers plain Python ints regardless of the sampler.
        new_id = int(draw(low, high, dtype=np.int64))
        if new_id not in unique_ids:
            unique_ids.add(new_id)
            new_ids.append(new_id)
    return new_ids

# Locate the rows without a phone number and count them.
phone_missing = retail_data["Phone"].isna()
missing_phone_numbers = phone_missing.sum()

# Draw one unused 10-digit number per missing row and write them into those rows.
unique_phone_numbers = generate_unique_phone_numbers(retail_data["Phone"], missing_phone_numbers)
retail_data.loc[phone_missing, 'Phone'] = unique_phone_numbers

# Remaining missing Phone count.
retail_data["Phone"].isna().sum()


0

In [162]:
# Normalise the separator in the Date strings ('-' becomes '/').
# The values remain strings; nothing is parsed into a datetime type here.
dates_with_slashes = retail_data['Date'].str.replace('-', '/')
retail_data['Date'] = dates_with_slashes
# Remove only the rows whose Date is missing, then show how many are left.
retail_data.dropna(subset=['Date'], inplace=True)
retail_data['Date'].isna().sum()

0

In [163]:
# Impute or drop the remaining missing values, one column at a time.
#
# Filled columns are reassigned with the result of fillna() rather than calling
# fillna(inplace=True) on retail_data[col]: the in-place call acts on an
# intermediate Series, which pandas warns about and which leaves the frame
# unchanged when Copy-on-Write is enabled.
#
# The steps run in the listed order. Each mode/mean is computed from the rows
# present at that point, so statistics listed after a "drop rows" step exclude
# the rows that step removed.

_DROP_ROWS = object()  # marker: drop rows where the column is missing


def _mode_value(column):
    """Most frequent value of the column (first one when several tie)."""
    return column.mode()[0]


def _mean_value(column):
    """Arithmetic mean of the column's non-missing values."""
    return column.mean()


# (column, rule): rule is _DROP_ROWS, a callable that derives the fill value
# from the column, or a constant fill value.
cleaning_plan = [
    ('Year', _mode_value),
    ('Total_Purchases', _mode_value),
    ('Amount', _mean_value),
    ('Total_Amount', _mean_value),
    ('Product_Category', _mode_value),
    ('Product_Brand', _DROP_ROWS),
    ('Feedback', _mode_value),
    ('Ratings', _mode_value),
    ('Order_Status', 'Delivered'),
    ('Payment_Method', 'Credit Card'),
    ('Email', 'Unknown'),
    ('Name', 'Unknown'),
    ('Customer_Segment', _mode_value),
    ('Income', _mode_value),
    ('Zipcode', 0),  # 0 is used as the placeholder for an unknown zipcode
    ('Age', _mean_value),
    ('Address', 'Unknown'),
    ('City', 'Unknown'),
    ('State', 'Unknown'),
    ('Gender', 'Unknown'),
    ('Country', 'Unknown'),
    ('Month', _DROP_ROWS),
    ('Time', _DROP_ROWS),
]

for column, rule in cleaning_plan:
    if rule is _DROP_ROWS:
        retail_data.dropna(subset=[column], inplace=True)
    else:
        fill_value = rule(retail_data[column]) if callable(rule) else rule
        retail_data[column] = retail_data[column].fillna(fill_value)
    print(f"Number of missing {column} values:", retail_data[column].isnull().sum())

# Standardize the date separator and drop rows with a missing date. This repeats
# the Date handling of the previous cell and changes nothing if that cell already ran.
retail_data['Date'] = retail_data['Date'].str.replace('-', '/')
retail_data.dropna(subset=["Date"], inplace=True)
print("Number of missing Date values:", retail_data['Date'].isnull().sum())


Number of missing Year values: 0
Number of missing Total_Purchases values: 0
Number of missing Amount values: 0
Number of missing Total_Amount values: 0
Number of missing Product_Category values: 0
Number of missing Product_Brand values: 0
Number of missing Feedback values: 0
Number of missing Ratings values: 0
Number of missing Order_Status values: 0
Number of missing Payment_Method values: 0
Number of missing Email values: 0
Number of missing Name values: 0
Number of missing Customer_Segment values: 0
Number of missing Income values: 0
Number of missing Zipcode values: 0
Number of missing Age values: 0
Number of missing Address values: 0
Number of missing City values: 0
Number of missing State values: 0
Number of missing Gender values: 0
Number of missing Country values: 0
Number of missing Month values: 0
Number of missing Time values: 0
Number of missing Date values: 0


In [164]:
retail_data.isnull().sum()


Transaction_ID      0
Customer_ID         0
Name                0
Email               0
Phone               0
Address             0
City                0
State               0
Zipcode             0
Country             0
Age                 0
Gender              0
Income              0
Customer_Segment    0
Date                0
Year                0
Month               0
Time                0
Total_Purchases     0
Amount              0
Total_Amount        0
Product_Category    0
Product_Brand       0
Product_Type        0
Feedback            0
Shipping_Method     0
Payment_Method      0
Order_Status        0
Ratings             0
products            0
dtype: int64