## Import libraries

In [None]:
import numpy as np
import pandas as pd
import random

## Dataset Generation

### Import Dataset

In [None]:
# Fetch the repository that contains the Supermart Grocery Sales CSV read in the next cell.
!git clone https://github.com/yeesem/Machine-Learning-Datasets.git

Cloning into 'Machine-Learning-Datasets'...
remote: Enumerating objects: 106, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (51/51), done.[K
remote: Total 106 (delta 15), reused 0 (delta 0), pack-reused 55 (from 1)[K
Receiving objects: 100% (106/106), 48.82 MiB | 9.42 MiB/s, done.
Resolving deltas: 100% (27/27), done.
Updating files: 100% (45/45), done.


In [None]:
# Load the raw Supermart grocery orders from the cloned dataset repository.
dataset_path = "/content/Machine-Learning-Datasets/Supermart Grocery Sales ECommerce Dataset.csv"
df = pd.read_csv(dataset_path)

# Preview the first rows of the raw table.
df.head()

Unnamed: 0,Order ID,Customer Name,Category,Sub Category,City,Order Date,Region,Sales,Discount,Profit,State
0,OD1,Harish,Oil & Masala,Masalas,Vellore,11-08-2017,North,1254,0.12,401.28,Tamil Nadu
1,OD2,Sudha,Beverages,Health Drinks,Krishnagiri,11-08-2017,South,749,0.18,149.8,Tamil Nadu
2,OD3,Hussain,Food Grains,Atta & Flour,Perambalur,06-12-2017,West,2360,0.21,165.2,Tamil Nadu
3,OD4,Jackson,Fruits & Veggies,Fresh Vegetables,Dharmapuri,10-11-2016,South,896,0.25,89.6,Tamil Nadu
4,OD5,Ridhesh,Food Grains,Organic Staples,Ooty,10-11-2016,South,2355,0.26,918.45,Tamil Nadu


In [None]:
# Parse the order dates once. format="mixed" lets pandas infer the format of
# each value individually.
# A second call with format="coerce" used to follow; "coerce" is an option of
# the `errors` argument, not a date format, and the call re-parsed a column that
# had already been converted, so it has been removed.
df["Order Date"] = pd.to_datetime(df["Order Date"], format="mixed")

# The rest of the notebook treats the "Sub Category" values as product names.
# Assign the renamed frame instead of mutating in place.
df = df.rename(columns={"Sub Category": "Product Name"})

### City Entity

In [None]:
# Build one record per distinct city, keyed by city name.
# IDs follow the order of first appearance (CT1, CT2, ...); the region and state
# are taken from the first order row in which the city occurs.
city_dict = {}

for city_name, region_name, state_name in zip(df["City"], df["Region"], df["State"]):
  if city_name in city_dict:
    continue  # city already registered by an earlier row
  city_dict[city_name] = {
      "City ID": f"CT{len(city_dict) + 1}",
      "City": city_name,
      "Region": region_name,
      "State": state_name,
  }

# City names in first-seen order.
city_set = list(city_dict)

In [None]:
# Turn the per-city records into the city table (one row per city).
city_records = list(city_dict.values())
city_df = pd.DataFrame(city_records)
city_df.head()

Unnamed: 0,City ID,City,Region,State
0,CT1,Vellore,North,Tamil Nadu
1,CT2,Krishnagiri,South,Tamil Nadu
2,CT3,Perambalur,West,Tamil Nadu
3,CT4,Dharmapuri,South,Tamil Nadu
4,CT5,Ooty,South,Tamil Nadu


### Customer Entity

In [None]:
# Build one record per distinct customer name, keyed by that name.
# IDs follow the order of first appearance (C1, C2, ...); the customer's city is
# the city of the first order row in which the name occurs.
# Age and customer type are synthetic. They are drawn from a locally seeded
# generator so that the table is identical on every run of the notebook.
customer_rng = np.random.default_rng(42)

customer_dict = {}
for customer_name, city_name in zip(df["Customer Name"], df["City"]):
  if customer_name in customer_dict:
    continue  # customer already registered by an earlier row
  customer_dict[customer_name] = {
      "Customer ID": f"C{len(customer_dict) + 1}",
      "City ID": city_dict[city_name]["City ID"],
      "Customer Name": customer_name,
      # Upper bound is exclusive: ages fall in 18..79.
      "Age": int(customer_rng.integers(18, 80)),
      # Draws 0 or 1; 1 means "Member".
      "Customer Type": "Member" if customer_rng.integers(0, 2) else "Normal",
  }

# Customer names in first-seen order.
customer_set = list(customer_dict)

In [None]:
# Turn the per-customer records into the customer table (one row per customer).
customer_records = list(customer_dict.values())
customer_df = pd.DataFrame(customer_records)
customer_df.head()

Unnamed: 0,Customer ID,City ID,Customer Name,Age,Customer Type
0,C1,CT1,Harish,20,Normal
1,C2,CT2,Sudha,40,Normal
2,C3,CT3,Hussain,53,Member
3,C4,CT4,Jackson,65,Member
4,C5,CT5,Ridhesh,62,Normal


### Product Entity

In [None]:
# Synthesise a quantity for every order row and derive the matching unit price
# as sales / quantity, rounded to two decimals.
# The draw uses a locally seeded generator so the numbers are reproducible, and
# is done in one vectorised call instead of one call per row.
pricing_rng = np.random.default_rng(7)

sales_values = df["Sales"].to_numpy()
# Upper bound is exclusive: quantities fall in 1..29, so the division is safe.
quantity_values = pricing_rng.integers(1, 30, size=len(sales_values))

# Both results stay plain lists aligned with the rows of df, as later cells expect.
quantity = quantity_values.tolist()
price = np.round(sales_values / quantity_values, 2).tolist()

In [None]:
# Build one record per distinct product name, keyed by that name.
# IDs follow the order of first appearance (P1, P2, ...); the category and the
# price are taken from the first order row in which the product occurs.
product_dict = {}

for row_number, (product_name, category_name) in enumerate(zip(df["Product Name"], df["Category"])):
  if product_name in product_dict:
    continue  # product already registered by an earlier row
  product_dict[product_name] = {
      "Product ID": f"P{len(product_dict) + 1}",
      "Category": category_name,
      "Product Name": product_name,
      "Price": price[row_number],
  }

# Product names in first-seen order.
product_set = list(product_dict)

In [None]:
# Turn the per-product records into the product table (one row per product).
product_records = list(product_dict.values())
product_df = pd.DataFrame(product_records)
product_df.head()

Unnamed: 0,Product ID,Category,Product Name,Price
0,P1,Oil & Masala,Masalas,44.79
1,P2,Beverages,Health Drinks,57.62
2,P3,Food Grains,Atta & Flour,168.57
3,P4,Fruits & Veggies,Fresh Vegetables,34.46
4,P5,Food Grains,Organic Staples,785.0


### Time Entity

In [None]:
# Build the time table: one row per distinct order date, in chronological
# order, with calendar attributes derived from the date.
order_dates = df["Order Date"]

unique_dates = pd.DataFrame({
    "Date": order_dates.values,
    "Day of Week": order_dates.dt.day_name().values,
    "Month": order_dates.dt.month.astype(int).values,
    "Year": order_dates.dt.year.astype(int).values,
})

# Keep the first occurrence of every date, then order the dates ascending.
unique_dates = unique_dates.drop_duplicates(keep="first").sort_values(by="Date", ascending=True)

# Sequential IDs T1, T2, ... following the chronological order.
time_id = [f"T{position}" for position in range(1, len(unique_dates) + 1)]
unique_dates["Time ID"] = time_id

# Put the ID first and renumber the rows from 0.
time_df = unique_dates[["Time ID", "Date", "Day of Week", "Month", "Year"]].reset_index(drop=True)
time_df.head()

Unnamed: 0,Time ID,Date,Day of Week,Month,Year
0,T1,2015-01-03,Saturday,1,2015
1,T2,2015-01-04,Sunday,1,2015
2,T3,2015-01-05,Monday,1,2015
3,T4,2015-01-06,Tuesday,1,2015
4,T5,2015-01-07,Wednesday,1,2015


### Shipment Entity

In [None]:
# Synthesise, for every order row, a transportation type and a service category.
transportation_type = ["Van", "Truck", "Motorcycles", "Air Freight"]
category_type = ["Budget", "Standard", "Premium"]

# Filled in a later cell with one record per (transportation, category) pair.
shipment_dict = {}

transportation_used = []
category_used = []

# Locally seeded generator so the synthetic shipments are reproducible.
shipment_rng = np.random.default_rng(11)
air_freight_index = transportation_type.index("Air Freight")

for _ in range(len(df)):
  transport_index = int(shipment_rng.integers(0, len(transportation_type)))
  transportation_used.append(transportation_type[transport_index])

  if transport_index == air_freight_index:
    # Air freight is only offered as "Standard" or "Premium".
    category_index = int(shipment_rng.integers(1, 3))
  else:
    # Van, truck and motorcycle are only offered as "Budget" or "Standard".
    category_index = int(shipment_rng.integers(0, 2))
  category_used.append(category_type[category_index])

In [None]:
# Draw a number of days for every order, with a range that depends on the
# service category. The original loop computed this value and discarded it on
# every iteration; it is now kept per order.
# NOTE(review): presumably this is the shipping duration in days - confirm the
# intended meaning and column name.
shipping_rng = np.random.default_rng(13)  # seeded so the draw is reproducible

# Exclusive upper bound of the draw per category:
# Budget 1..10 days, Standard 1..5 days, Premium 1..2 days.
max_days_exclusive = {"Budget": 11, "Standard": 6, "Premium": 3}

shipping_days = [
    int(shipping_rng.integers(1, max_days_exclusive[category_name]))
    for category_name in category_used
]

# Per-order shipment attributes, aligned with the rows of df.
shipment_df_first = pd.DataFrame({
    "Transportation Type": transportation_used,
    "Category": category_used,
    "Shipping Days": shipping_days,
})

In [None]:
# Register one shipment record per distinct (transportation type, category)
# pair, with IDs in order of first appearance (S1, S2, ...).
# The loop variables have their own names: the original loop variable
# `transportation_type` overwrote the module-level list of transportation types.
for transport_name, category_name in zip(transportation_used, category_used):
  shipment_key = (transport_name, category_name)
  if shipment_key not in shipment_dict:
    shipment_dict[shipment_key] = {
        "Shipment ID": f"S{len(shipment_dict) + 1}",
        "Transportation Type": transport_name,
        "Category": category_name,
    }

In [None]:
# Turn the per-combination records into the shipment table.
shipment_records = list(shipment_dict.values())
shipment_df = pd.DataFrame(shipment_records)
shipment_df.head()

Unnamed: 0,Shipment ID,Transportation Type,Category
0,S1,Air Freight,Standard
1,S2,Van,Budget
2,S3,Motorcycles,Standard
3,S4,Truck,Budget
4,S5,Van,Standard


### Order Entity

In [None]:
# Resolve the foreign keys of every order row, aligned with the rows of df.

# Date -> Time ID lookup built once, instead of filtering time_df for every order.
time_id_by_date = dict(zip(time_df["Date"], time_df["Time ID"]))

customer_id = [customer_dict[name]["Customer ID"] for name in df["Customer Name"]]
product_id = [product_dict[name]["Product ID"] for name in df["Product Name"]]
time_id = [time_id_by_date[order_date] for order_date in df["Order Date"]]

# Use the shipment record that matches the transportation type and category
# generated for this order. The previous random draw f"S{randint(0, 9)}" could
# produce "S0", which has no row in the shipment table (IDs start at S1), and it
# ignored the per-order shipment attributes altogether.
shipment_id = [
    shipment_dict[(transport_name, category_name)]["Shipment ID"]
    for transport_name, category_name in zip(transportation_used, category_used)
]

In [None]:
# Assemble the order table: one row per order with its foreign keys and measures.

# Cost of goods sold is derived as sales minus profit.
cost_of_goods_sold = df["Sales"].values - df["Profit"].values

order_columns = {
    "Order ID": df["Order ID"].values,
    "Customer ID": customer_id,
    "Product ID": product_id,
    "Time ID": time_id,
    "Shipment ID": shipment_id,
    "Quantity": quantity,
    "Sales": df["Sales"].values,
    "Cost of Goods Sold": cost_of_goods_sold,
    "Profit": df["Profit"].values,
}
order_df = pd.DataFrame(order_columns)

order_df.head()

Unnamed: 0,Order ID,Customer ID,Product ID,Time ID,Shipment ID,Quantity,Sales,Cost of Goods Sold,Profit
0,OD1,C1,P1,T864,S3,28,1254,852.72,401.28
1,OD2,C2,P2,T864,S2,13,749,599.2,149.8
2,OD3,C3,P3,T732,S7,14,2360,2194.8,165.2
3,OD4,C4,P4,T520,S6,26,896,806.4,89.6
4,OD5,C5,P5,T520,S6,3,2355,1436.55,918.45


## Featuretools

In [None]:
!pip install featuretools

### Import Libraries

In [None]:
import featuretools as ft

### Create EntitySet

In [None]:
# Create a new, empty EntitySet named 'grocery_ecommerce'.
# The dataframes and relationships are added to it in the following cells.
es = ft.EntitySet(id='grocery_ecommerce')

### Add dataframes to the EntitySet

In [None]:
# Register every table with the EntitySet.
# Each entry is (dataframe name inside the EntitySet, dataframe, index column);
# the order of registration is kept: customer, city, product, shipment, time, order.
dataframes_to_register = [
    ("customer", customer_df, "Customer ID"),
    ("city", city_df, "City ID"),
    ("product", product_df, "Product ID"),
    ("shipment", shipment_df, "Shipment ID"),
    ("time", time_df, "Time ID"),
    ("order", order_df, "Order ID"),
]

for table_name, table, index_column in dataframes_to_register:
  es = es.add_dataframe(
      dataframe_name=table_name,
      dataframe=table,
      index=index_column,
  )

### Add relationships to the EntitySet

In [None]:
# Define relationships.
# Every tuple reads (parent dataframe, parent column, child dataframe, child column);
# in all five cases the key column has the same name on both sides.
def _one_to_many(parent, child, key):
  """Return the relationship tuple linking `child` rows to `parent` through `key`."""
  return (parent, key, child, key)

customer_relationship = _one_to_many("customer", "order", "Customer ID")
city_relationship = _one_to_many("city", "customer", "City ID")
product_relationship = _one_to_many("product", "order", "Product ID")
shipment_relationship = _one_to_many("shipment", "order", "Shipment ID")
time_relationship = _one_to_many("time", "order", "Time ID")

In [None]:
# Attach all five relationships to the EntitySet in a single call.
relationships = [
    customer_relationship,
    city_relationship,
    product_relationship,
    shipment_relationship,
    time_relationship,
]
es = es.add_relationships(relationships)

### Verify added relationships

In [None]:
# Display the EntitySet summary to verify that the dataframes and relationships were added.
es

Entityset: grocery_ecommerce
  DataFrames:
    customer [Rows: 50, Columns: 5]
    city [Rows: 24, Columns: 4]
    product [Rows: 23, Columns: 4]
    shipment [Rows: 8, Columns: 3]
    time [Rows: 1236, Columns: 5]
    order [Rows: 9994, Columns: 9]
  Relationships:
    order.Customer ID -> customer.Customer ID
    customer.City ID -> city.City ID
    order.Product ID -> product.Product ID
    order.Shipment ID -> shipment.Shipment ID
    order.Time ID -> time.Time ID

### Deep Feature Synthesis

In [None]:
# Run Deep Feature Synthesis with "order" as the target dataframe, so the
# resulting feature matrix has one row per order.
# The call returns the feature matrix and the list of feature definitions.
features_matrix, feature_defs = ft.dfs(
    entityset = es,
    target_dataframe_name = "order",
    verbose = False
)

In [None]:
# Number of feature definitions produced by DFS
len(feature_defs)

115

In [None]:
# List the generated feature definitions from position 99 onwards.
feature_table = pd.DataFrame(feature_defs)
feature_table.iloc[99:]

Unnamed: 0,0
99,<Feature: time.SKEW(order.Cost of Goods Sold)>
100,<Feature: time.SKEW(order.Profit)>
101,<Feature: time.SKEW(order.Quantity)>
102,<Feature: time.SKEW(order.Sales)>
103,<Feature: time.STD(order.Cost of Goods Sold)>
104,<Feature: time.STD(order.Profit)>
105,<Feature: time.STD(order.Quantity)>
106,<Feature: time.STD(order.Sales)>
107,<Feature: time.SUM(order.Cost of Goods Sold)>
108,<Feature: time.SUM(order.Profit)>


In [None]:
# Display the DFS feature matrix (indexed by Order ID); pandas shows a truncated view of the rows and columns.
features_matrix

Unnamed: 0_level_0,Quantity,Sales,Cost of Goods Sold,Profit,customer.Age,customer.Customer Type,product.Price,time.Day of Week,time.Month,time.Year,...,time.STD(order.Quantity),time.STD(order.Sales),time.SUM(order.Cost of Goods Sold),time.SUM(order.Profit),time.SUM(order.Quantity),time.SUM(order.Sales),time.DAY(Date),time.MONTH(Date),time.WEEKDAY(Date),time.YEAR(Date)
Order ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OD1,28,1254,852.72,401.28,20,Normal,44.79,Wednesday,11,2017,...,10.606602,357.088924,1451.92,551.08,41.0,2003.0,8,11,2,2017
OD2,13,749,599.20,149.80,40,Normal,57.62,Wednesday,11,2017,...,10.606602,357.088924,1451.92,551.08,41.0,2003.0,8,11,2,2017
OD3,14,2360,2194.80,165.20,53,Member,168.57,Monday,6,2017,...,6.903948,583.575791,28295.05,8165.95,331.0,36461.0,12,6,0,2017
OD4,26,896,806.40,89.60,65,Member,34.46,Tuesday,10,2016,...,12.288206,800.412602,3203.00,1103.00,36.0,4306.0,11,10,1,2016
OD5,3,2355,1436.55,918.45,62,Normal,785.00,Tuesday,10,2016,...,12.288206,800.412602,3203.00,1103.00,36.0,4306.0,11,10,1,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OD9990,25,945,585.90,359.10,36,Normal,25.04,Thursday,12,2015,...,8.791525,698.189581,12019.23,5436.77,155.0,17456.0,24,12,3,2015
OD9991,21,1195,1123.30,71.70,76,Member,56.50,Sunday,7,2015,...,9.873058,614.314912,13987.79,4247.21,213.0,18235.0,12,7,6,2015
OD9992,5,1567,1065.56,501.44,51,Member,97.72,Tuesday,6,2017,...,8.112717,593.659104,19431.05,6075.95,242.0,25507.0,6,6,1,2017
OD9993,20,1659,1061.76,597.24,23,Member,165.00,Tuesday,10,2018,...,10.553041,315.335853,7435.33,2797.67,79.0,10233.0,16,10,1,2018
