# **Cleaning and Preparation of data**

The data prepared in this notebook is derived from the Online_Sales and Discount_Coupon datasets from https://www.kaggle.com/datasets/rishikumarrajvansh/marketing-insights-for-e-commerce-company.

The Online_Sales dataset contains 52,924 transaction records and 10 columns capturing various aspects of each transaction.
- The online sales dataset primarily covers transactions, products, discounts, delivery charges and coupon usage.
- The discount coupon dataset provides details on the types of coupons available during different periods of the year.

As this dataset does not originate from Amazon, we match the product descriptions in the online sales dataset against the product descriptions in the Amazon sales dataset, using the `thefuzz` library, to assign each transaction a matching product_id.



In [1]:
# Importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from thefuzz import process, fuzz
from faker import Faker
import random

In [23]:
# Load the raw online-sales transactions (relative path, resolved against the kernel's working directory)
df = pd.read_csv('../source/Online_Sales.csv')

In [24]:
# Preview the first rows to sanity-check the load
df.head()

Unnamed: 0,CustomerID,Transaction_ID,Transaction_Date,Product_SKU,Product_Description,Product_Category,Quantity,Avg_Price,Delivery_Charges,Coupon_Status
0,17850,16679,1/1/2019,GGOENEBJ079499,Nest Learning Thermostat 3rd Gen-USA - Stainle...,Nest-USA,1,153.71,6.5,Used
1,17850,16680,1/1/2019,GGOENEBJ079499,Nest Learning Thermostat 3rd Gen-USA - Stainle...,Nest-USA,1,153.71,6.5,Used
2,17850,16681,1/1/2019,GGOEGFKQ020399,Google Laptop and Cell Phone Stickers,Office,1,2.05,6.5,Used
3,17850,16682,1/1/2019,GGOEGAAB010516,Google Men's 100% Cotton Short Sleeve Hero Tee...,Apparel,5,17.53,6.5,Not Used
4,17850,16682,1/1/2019,GGOEGBJL013999,Google Canvas Tote Natural/Navy,Bags,1,16.5,6.5,Used


In [25]:
# List the column names of the raw sales data before renaming
df.columns

Index(['CustomerID', 'Transaction_ID', 'Transaction_Date', 'Product_SKU',
       'Product_Description', 'Product_Category', 'Quantity', 'Avg_Price',
       'Delivery_Charges', 'Coupon_Status'],
      dtype='object')

In [26]:
# Rename the customer/transaction identifiers and the date column to short snake_case names
column_renames = {
    'CustomerID': 'user_id',
    'Transaction_ID': 'transaction_id',
    'Transaction_Date': 'date',
}
df = df.rename(columns=column_renames)

In [27]:
# (rows, columns) of the sales data
df.shape

(52924, 10)

In [28]:
# Inspect the dtype pandas inferred for each column (ids load as int64, date as object)
df.dtypes

user_id                  int64
transaction_id           int64
date                    object
Product_SKU             object
Product_Description     object
Product_Category        object
Quantity                 int64
Avg_Price              float64
Delivery_Charges       float64
Coupon_Status           object
dtype: object

In [29]:
# Treat the identifiers as labels rather than numbers, and parse the date strings
df['transaction_id'] = df['transaction_id'].astype('object')
df['user_id'] = df['user_id'].astype('object')
# The raw dates are month/day/year strings ("1/1/2019" ... "12/31/2019"). Stating the format
# removes any day-first vs month-first guessing and fails loudly on a malformed value.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')

In [30]:
# Count missing values per column in the sales data
df.isnull().sum()

user_id                0
transaction_id         0
date                   0
Product_SKU            0
Product_Description    0
Product_Category       0
Quantity               0
Avg_Price              0
Delivery_Charges       0
Coupon_Status          0
dtype: int64

In [31]:
# Load the discount-coupon table (one coupon code and discount per Month / Product_Category)
coupons = pd.read_csv('../source/Discount_Coupon.csv')

In [32]:
# Preview the first rows of the coupon table
coupons.head()

Unnamed: 0,Month,Product_Category,Coupon_Code,Discount_pct
0,Jan,Apparel,SALE10,10
1,Feb,Apparel,SALE20,20
2,Mar,Apparel,SALE30,30
3,Jan,Nest-USA,ELEC10,10
4,Feb,Nest-USA,ELEC20,20


In [33]:
# List the coupon table's column names (Month and Product_Category are used as join keys later)
coupons.columns

Index(['Month', 'Product_Category', 'Coupon_Code', 'Discount_pct'], dtype='object')

In [34]:
# Inspect the dtype of each coupon column (Discount_pct loads as a whole-number int64)
coupons.dtypes

Month               object
Product_Category    object
Coupon_Code         object
Discount_pct         int64
dtype: object

In [35]:
# Express the discount as a float64 fraction (10 -> 0.1).
# NOTE: this overwrites the column with a value derived from itself, so re-running
# the cell divides by 100 a second time.
coupons['Discount_pct'] = coupons['Discount_pct'].astype('float64').div(100)

coupons['Discount_pct']

0      0.1
1      0.2
2      0.3
3      0.1
4      0.2
      ... 
199    0.2
200    0.3
201    0.1
202    0.2
203    0.3
Name: Discount_pct, Length: 204, dtype: float64

In [36]:
# Create a three-letter month key ("Jan" ... "Dec") in df for the left join with coupons.
# month_name() returns English names by default, so the key matches the coupon table
# regardless of the machine's locale (strftime('%b') follows the active locale).
df['Month'] = df['date'].dt.month_name().str[:3]
df['Month']

0        Jan
1        Jan
2        Jan
3        Jan
4        Jan
        ... 
52919    Dec
52920    Dec
52921    Dec
52922    Dec
52923    Dec
Name: Month, Length: 52924, dtype: object

In [37]:
# Map sales categories onto the category names used in the coupons dataframe
category_aliases = {
    'Fun': 'Accessories',
    'Google': 'Bottles',
    'Backpacks': 'Bags',
    'More Bags': 'Bags',
}
df['Product_Category'] = df['Product_Category'].replace(category_aliases)

In [38]:
# Left join: attach the coupon code and discount for each row's (Month, Product_Category)
merged_df = pd.merge(df, coupons, on=['Month', 'Product_Category'], how='left')
# A left join silently duplicates sales rows if coupons repeats a key; fail fast instead
assert len(merged_df) == len(df), "coupon join changed the number of sales rows"
merged_df

Unnamed: 0,user_id,transaction_id,date,Product_SKU,Product_Description,Product_Category,Quantity,Avg_Price,Delivery_Charges,Coupon_Status,Month,Coupon_Code,Discount_pct
0,17850,16679,2019-01-01,GGOENEBJ079499,Nest Learning Thermostat 3rd Gen-USA - Stainle...,Nest-USA,1,153.71,6.50,Used,Jan,ELEC10,0.1
1,17850,16680,2019-01-01,GGOENEBJ079499,Nest Learning Thermostat 3rd Gen-USA - Stainle...,Nest-USA,1,153.71,6.50,Used,Jan,ELEC10,0.1
2,17850,16681,2019-01-01,GGOEGFKQ020399,Google Laptop and Cell Phone Stickers,Office,1,2.05,6.50,Used,Jan,OFF10,0.1
3,17850,16682,2019-01-01,GGOEGAAB010516,Google Men's 100% Cotton Short Sleeve Hero Tee...,Apparel,5,17.53,6.50,Not Used,Jan,SALE10,0.1
4,17850,16682,2019-01-01,GGOEGBJL013999,Google Canvas Tote Natural/Navy,Bags,1,16.50,6.50,Used,Jan,AIO10,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52919,14410,48493,2019-12-31,GGOENEBB078899,Nest Cam Indoor Security Camera - USA,Nest-USA,1,121.30,6.50,Clicked,Dec,ELEC30,0.3
52920,14410,48494,2019-12-31,GGOEGAEB091117,Google Zip Hoodie Black,Apparel,1,48.92,6.50,Used,Dec,SALE30,0.3
52921,14410,48495,2019-12-31,GGOENEBQ084699,Nest Learning Thermostat 3rd Gen-USA - White,Nest-USA,1,151.88,6.50,Used,Dec,ELEC30,0.3
52922,14600,48496,2019-12-31,GGOENEBQ079199,Nest Protect Smoke + CO White Wired Alarm-USA,Nest-USA,5,80.52,6.50,Clicked,Dec,ELEC30,0.3


In [50]:
# Ad-hoc inspection: all rows whose description contains "Bottle".
# NOTE(review): the saved output shows a `unique_id` column that is only created in a later
# cell, so this cell was last executed out of order; on a fresh run that column is absent here.
merged_df[merged_df["Product_Description"].str.contains("Bottle")]

Unnamed: 0,user_id,transaction_id,date,Product_SKU,Product_Description,Product_Category,Quantity,Avg_Price,Delivery_Charges,Coupon_Status,Month,Coupon_Code,Discount_pct,unique_id
6,17850,16682,2019-01-01,GGOEGDHC018299,Google 22 oz Water Bottle,Drinkware,15,3.08,6.50,Not Used,Jan,EXTRA10,0.1,Google 22 oz Water Bottle Drinkware
7,17850,16682,2019-01-01,GGOEGDHG014499,Google Infuser-Top Water Bottle,Drinkware,15,10.31,6.50,Clicked,Jan,EXTRA10,0.1,Google Infuser-Top Water Bottle Drinkware
27,12583,16692,2019-01-01,GGOEGDHC015299,23 oz Wide Mouth Sport Bottle,Drinkware,26,8.72,102.79,Clicked,Jan,EXTRA10,0.1,23 oz Wide Mouth Sport Bottle Drinkware
32,12583,16693,2019-01-01,GGOEGDHC074099,Google 17oz Stainless Steel Sport Bottle,Drinkware,1,15.67,6.50,Clicked,Jan,EXTRA10,0.1,Google 17oz Stainless Steel Sport Bottle Drink...
56,17850,16704,2019-01-01,GGOEADHB014799,Android Glass Water Bottle with Black Sleeve,Drinkware,10,13.40,6.50,Clicked,Jan,EXTRA10,0.1,Android Glass Water Bottle with Black Sleeve D...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52710,13584,48322,2019-12-29,GGOEGDHC018299,Google 22 oz Water Bottle,Drinkware,2,2.44,6.50,Used,Dec,EXTRA30,0.3,Google 22 oz Water Bottle Drinkware
52824,17105,48418,2019-12-30,GGOEGDHC087099,Google Thermal Bottle Blue,Drinkware,2,24.45,6.00,Clicked,Dec,EXTRA30,0.3,Google Thermal Bottle Blue Drinkware
52859,14438,48447,2019-12-31,GGOEGDHC018299,Google 22 oz Water Bottle,Drinkware,1,2.44,6.50,Used,Dec,EXTRA30,0.3,Google 22 oz Water Bottle Drinkware
52868,14606,48452,2019-12-31,GGOEGDHQ086199,Google White Force 17 oz Bottle,Drinkware,1,24.45,6.00,Clicked,Dec,EXTRA30,0.3,Google White Force 17 oz Bottle Drinkware


In [39]:
# A null Coupon_Code / Discount_pct would mean a (Month, Product_Category) pair had no
# match in coupons, so zero nulls confirms every sales row found a coupon
merged_df.isnull().sum()

user_id                0
transaction_id         0
date                   0
Product_SKU            0
Product_Description    0
Product_Category       0
Quantity               0
Avg_Price              0
Delivery_Charges       0
Coupon_Status          0
Month                  0
Coupon_Code            0
Discount_pct           0
dtype: int64

In [44]:
# Load the products table that supplies the product_id values to match against
products = pd.read_csv('../products.csv')

In [45]:
# Preview the first rows of the products table
products.head()

Unnamed: 0,product_id,product_name,about_product,category,actual_price,discounted_price,discount_percentage
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers&Accessories|Accessories&Peripherals|...,13.19,4.79,0.64
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,"Compatible with all Type C enabled devices, be...",Computers&Accessories|Accessories&Peripherals|...,4.19,2.39,0.43
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,【 Fast Charger& Data Sync】-With built-in safet...,Computers&Accessories|Accessories&Peripherals|...,22.79,2.39,0.9
3,B08HDJ86NZ,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,The boAt Deuce USB 300 2 in 1 cable is compati...,Computers&Accessories|Accessories&Peripherals|...,8.39,3.95,0.53
4,B08CF3B7N1,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,[CHARGE & SYNC FUNCTION]- This cable comes wit...,Computers&Accessories|Accessories&Peripherals|...,4.79,1.85,0.61


## Online Sales Table


In [47]:
# Build the text key used for fuzzy matching: "<Product_Description> <Product_Category>"
merged_df['unique_id'] = merged_df['Product_Description'].str.cat(
    merged_df['Product_Category'], sep=' '
)
merged_df['unique_id']

0        Nest Learning Thermostat 3rd Gen-USA - Stainle...
1        Nest Learning Thermostat 3rd Gen-USA - Stainle...
2             Google Laptop and Cell Phone Stickers Office
3        Google Men's 100% Cotton Short Sleeve Hero Tee...
4                     Google Canvas Tote Natural/Navy Bags
                               ...                        
52919       Nest Cam Indoor Security Camera - USA Nest-USA
52920                      Google Zip Hoodie Black Apparel
52921    Nest Learning Thermostat 3rd Gen-USA - White N...
52922    Nest Protect Smoke + CO White Wired Alarm-USA ...
52923    Nest Protect Smoke + CO White Battery Alarm-US...
Name: unique_id, Length: 52924, dtype: object

In [48]:
# Build the equivalent text key on the products side: "<product_name> <category>"
products['unique_id'] = products['product_name'].str.cat(products['category'], sep=' ')
products['unique_id']

0       Wayona Nylon Braided USB to Lightning Fast Cha...
1       Ambrane Unbreakable 60W / 3A Fast Charging 1.5...
2       Sounce Fast Phone Charging Cable & Data Sync U...
3       boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...
4       Portronics Konnect L 1.2M Fast Charging 3A 8 P...
                              ...                        
1346    Noir Aqua - 5pcs PP Spun Filter + 1 Spanner | ...
1347    Prestige Delight PRWO Electric Rice Cooker (1 ...
1348    Bajaj Majesty RX10 2000 Watts Heat Convector R...
1349    Havells Ventil Air DSP 230mm Exhaust Fan (Pist...
1350    Borosil Jumbo 1000-Watt Grill Sandwich Maker (...
Name: unique_id, Length: 1351, dtype: object

In [220]:
# Replace each sales unique_id with the closest-scoring unique_id from the products table.
# Descriptions repeat across rows (the first two sales rows are identical), so each distinct
# key is matched once and the result is broadcast with .map(), instead of re-scoring every
# row against all products. The resulting column is the same as a row-by-row apply.
# NOTE(review): extractOne always returns its best candidate, however low the score, so
# every row gets a product key — consider checking the score ([1]) if match quality matters.
distinct_sales_keys = merged_df["unique_id"].unique()
closest_product_key = {
    sales_key: process.extractOne(
        sales_key, products["unique_id"], scorer=fuzz.token_set_ratio
    )[0]
    for sales_key in distinct_sales_keys
}
merged_df["unique_id"] = merged_df["unique_id"].map(closest_product_key)

In [239]:
# Left join merged_df with products on the matched unique_id.
# If two products share the same "<product_name> <category>" key, a plain left join would
# emit one output row per duplicate and inflate the transaction count. Keep a single
# product per key so every sales row maps to exactly one product_id.
product_lookup = products.drop_duplicates(subset="unique_id")
combined_df = pd.merge(merged_df, product_lookup, on="unique_id", how="left")

In [240]:
# Check that the left join found a product for every row (no nulls in the product columns).
# NOTE(review): unique_id was already replaced with values taken from products, so this
# confirms the join keys line up; it says nothing about how good the fuzzy matches are.
combined_df.isnull().sum()

user_id                0
transaction_id         0
date                   0
Product_SKU            0
Product_Description    0
Product_Category       0
Quantity               0
Avg_Price              0
Delivery_Charges       0
Coupon_Status          0
Month                  0
Coupon_Code            0
Discount_pct           0
unique_id              0
product_id             0
product_name           0
about_product          0
category               0
actual_price           0
discounted_price       0
discount_percentage    0
dtype: int64

In [241]:
# Drop the descriptive and helper columns, keeping the transaction fields plus product_id
columns_to_drop = [
    'Product_SKU', 'Product_Description', 'Product_Category',
    'Month', 'unique_id', 'product_name', 'about_product',
    'category', 'actual_price', 'discounted_price',
    'discount_percentage', 'Avg_Price',
]
combined_df = combined_df.drop(columns=columns_to_drop)

In [9]:
# Move product_id so it sits right after user_id, transaction_id and date:
# pop() removes the column and returns it, insert() puts it back at position 3
pi = combined_df.pop('product_id')
combined_df.insert(3,'product_id',pi)

In [12]:
# Confirm the final column set and order
combined_df.columns

Index(['user_id', 'transaction_id', 'date', 'product_id', 'Quantity',
       'Delivery_Charges', 'Coupon_Status', 'Coupon_Code', 'Discount_pct'],
      dtype='object')

In [238]:
# Persist the cleaned sales table; index=False keeps the row index out of the CSV
combined_df.to_csv("../online_sales_edited.csv",index=False)

In [2]:
# Reload the saved CSV for a follow-up edit (read_csv re-infers the dtypes from the file)
df_edit = pd.read_csv('../online_sales_edited.csv')

In [3]:
# Coupon_Status takes the values "Used", "Not Used" and "Clicked".
# "Clicked" represents that the user tried to apply the coupon, but it was not applicable
# for the purchase, so it is folded into "Not Used": every value other than "Used"
# becomes "Not Used".
coupon_was_used = df_edit['Coupon_Status'] == 'Used'
df_edit['Coupon_Status'] = df_edit['Coupon_Status'].where(coupon_was_used, 'Not Used')

In [4]:
# Overwrite the same CSV with the remapped Coupon_Status
df_edit.to_csv("../online_sales_edited.csv",index=False)

In [3]:
# Reload the final sales CSV under a short name and preview it
ose =pd.read_csv('../online_sales_edited.csv')
ose.head()

Unnamed: 0,user_id,transaction_id,date,product_id,Quantity,Delivery_Charges,Coupon_Status,Coupon_Code,Discount_pct
0,17850,16679,2019-01-01,B09DL9978Y,1,6.5,Used,ELEC10,0.1
1,17850,16680,2019-01-01,B09DL9978Y,1,6.5,Used,ELEC10,0.1
2,17850,16681,2019-01-01,B07GXHC691,1,6.5,Used,OFF10,0.1
3,17850,16682,2019-01-01,B08NCKT9FG,5,6.5,Not Used,SALE10,0.1
4,17850,16682,2019-01-01,B08H21B6V7,1,6.5,Used,AIO10,0.1


In [7]:
# Range of user ids present in the sales data (vectorised Series reductions
# instead of iterating the column with the Python builtins)
ose['user_id'].min(), ose['user_id'].max()

(12346, 18283)

Since these user_ids do not exist in our current database, we generate synthetic gender and age values for them.

In [4]:
# Reduce the sales data to one row per distinct user_id, to store the users in our database
online_sales_users = ose['user_id'].drop_duplicates().reset_index(drop=True)
osu = online_sales_users.to_frame()
osu.head()

Unnamed: 0,user_id
0,17850
1,13047
2,12583
3,13748
4,15100


In [5]:
# Faker instance. NOTE(review): nothing visible in this notebook references `fake` —
# the synthetic age/gender values below are drawn with the `random` module. Confirm before removing.
fake = Faker()

# Generate fake age and gender for users
def add_fake_age_gender(df, seed=None):
    """Add synthetic ``age`` and ``gender`` columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; one age and one gender are drawn per row.
    seed : int, optional
        When given, values are drawn from a private ``random.Random(seed)``, so the
        same seed always produces the same columns. When omitted, the module-level
        ``random`` generator is used, exactly as before (seed it with
        ``random.seed(...)`` for reproducible output).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with ``age`` (integer in 18..60 inclusive) and
        ``gender`` ('Male' or 'Female') columns.
    """
    rng = random if seed is None else random.Random(seed)
    row_count = len(df)
    # Draw all ages first, then all genders, matching the original draw order
    df['age'] = [rng.randint(18, 60) for _ in range(row_count)]
    df['gender'] = [rng.choice(['Male', 'Female']) for _ in range(row_count)]
    return df

# Add fake age and gender to the existing DataFrame.
# Seed the module-level RNG first so the synthetic values (and the exported users
# file) are identical on every Restart & Run All.
random.seed(42)
osu = add_fake_age_gender(osu)
osu.head()

Unnamed: 0,user_id,age,gender
0,17850,33,Female
1,13047,57,Male
2,12583,59,Male
3,13748,32,Female
4,15100,35,Male


In [8]:
# Export the synthetic user table (user_id, age, gender) without the row index
osu.to_csv('../online_sales_users.csv',index=False)