# Capstone Project

Submitted by:
1. Ankit Sahu
2. Vikul Aggarwal

In [1]:
# importing essential python modules
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import holidays

In [2]:
# Notebook-wide settings.
# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')
# Show floats with two decimals and allow up to 50 columns when a frame is displayed.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', 50)

### Reading the data

In [3]:
# Load the order-level data; the tokens '\N', a single space and the empty string are read as missing values.
df = pd.read_csv(
    'ConsumerElectronics.csv',
    na_values=['\\N', ' ', ''],
    low_memory=False,
)
df.head()

Unnamed: 0,fsn_id,order_date,Year,Month,order_id,order_item_id,gmv,units,deliverybdays,deliverycdays,s1_fact.order_payment_type,sla,cust_id,pincode,product_analytic_super_category,product_analytic_category,product_analytic_sub_category,product_analytic_vertical,product_mrp,product_procurement_sla
0,ACCCX3S58G7B5F6P,2015-10-17 15:11:54,2015,10,3419300926147000.0,3419300926147000.0,6400.0,1,,,COD,5,-1.01299130778588e+18,-7.791755829057349e+18,CE,CameraAccessory,CameraAccessory,CameraTripod,7190,0
1,ACCCX3S58G7B5F6P,2015-10-19 10:07:22,2015,10,1420830839915200.0,1420830839915200.0,6900.0,1,,,COD,7,-8.99032457905512e+18,7.33541149097431e+18,CE,CameraAccessory,CameraAccessory,CameraTripod,7190,0
2,ACCCX3S5AHMF55FV,2015-10-20 15:45:56,2015,10,2421912925714800.0,2421912925714800.0,1990.0,1,,,COD,10,-1.0404429420466e+18,-7.477687762286569e+18,CE,CameraAccessory,CameraAccessory,CameraTripod,2099,3
3,ACCCX3S5AHMF55FV,2015-10-14 12:05:15,2015,10,4416592101738400.0,4416592101738400.0,1690.0,1,,,Prepaid,4,-7.604960843527139e+18,-5.83593163877661e+18,CE,CameraAccessory,CameraAccessory,CameraTripod,2099,3
4,ACCCX3S5AHMF55FV,2015-10-17 21:25:03,2015,10,4419525153426400.0,4419525153426400.0,1618.0,1,,,Prepaid,6,2.8945572083453e+18,5.34735360997242e+17,CE,CameraAccessory,CameraAccessory,CameraTripod,2099,3


### Inspecting the data

In [4]:
# Report the (rows, columns) dimensions of the freshly loaded frame.
print('The shape of the dataframe is ',df.shape)

The shape of the dataframe is  (1648824, 20)


In [5]:
# Column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1648824 entries, 0 to 1648823
Data columns (total 20 columns):
fsn_id                             1648824 non-null object
order_date                         1648824 non-null object
Year                               1648824 non-null int64
Month                              1648824 non-null int64
order_id                           1648824 non-null float64
order_item_id                      1648824 non-null float64
gmv                                1643920 non-null float64
units                              1648824 non-null int64
deliverybdays                      335852 non-null float64
deliverycdays                      335853 non-null float64
s1_fact.order_payment_type         1648824 non-null object
sla                                1648824 non-null int64
cust_id                            1643920 non-null float64
pincode                            1643920 non-null float64
product_analytic_super_category    1648824 non-null object
p

In [6]:
# Normalise the headers: trim surrounding whitespace first, then convert to lower case.
normalised_names = df.columns.str.strip()
normalised_names = normalised_names.str.lower()
df.columns = normalised_names
print(df.columns)

Index(['fsn_id', 'order_date', 'year', 'month', 'order_id', 'order_item_id',
       'gmv', 'units', 'deliverybdays', 'deliverycdays',
       's1_fact.order_payment_type', 'sla', 'cust_id', 'pincode',
       'product_analytic_super_category', 'product_analytic_category',
       'product_analytic_sub_category', 'product_analytic_vertical',
       'product_mrp', 'product_procurement_sla'],
      dtype='object')


### Data Cleaning and filtering

In [7]:
# Share of missing entries per column, expressed in percent of all rows.
null_counts = df.isna().sum()
null_counts * 100 / len(df)

fsn_id                             0.00
order_date                         0.00
year                               0.00
month                              0.00
order_id                           0.00
order_item_id                      0.00
gmv                                0.30
units                              0.00
deliverybdays                     79.63
deliverycdays                     79.63
s1_fact.order_payment_type         0.00
sla                                0.00
cust_id                            0.30
pincode                            0.30
product_analytic_super_category    0.00
product_analytic_category          0.00
product_analytic_sub_category      0.00
product_analytic_vertical          0.35
product_mrp                        0.00
product_procurement_sla            0.00
dtype: float64

In [8]:
# Assuming the null values in deliverybdays and deliverycdays correspond to 0, performing the imputation.
# The result is assigned back to the frame instead of calling fillna(inplace=True) on a column selection:
# with pandas Copy-on-Write the chained in-place form modifies a temporary and leaves `df` unchanged.
delivery_cols = ['deliverybdays', 'deliverycdays']
df[delivery_cols] = df[delivery_cols].fillna(0)

In [9]:
# The remaining columns have only a very small share of nulls, so every row
# that still contains a missing value is discarded.
df = df.dropna()

In [10]:
# Re-compute the per-column percentage of missing values after the treatment above.
remaining_nulls = df.isna().sum()
remaining_nulls * 100 / len(df)

fsn_id                            0.00
order_date                        0.00
year                              0.00
month                             0.00
order_id                          0.00
order_item_id                     0.00
gmv                               0.00
units                             0.00
deliverybdays                     0.00
deliverycdays                     0.00
s1_fact.order_payment_type        0.00
sla                               0.00
cust_id                           0.00
pincode                           0.00
product_analytic_super_category   0.00
product_analytic_category         0.00
product_analytic_sub_category     0.00
product_analytic_vertical         0.00
product_mrp                       0.00
product_procurement_sla           0.00
dtype: float64

There are no null values now.

In [11]:
# Row count that survives the missing-value treatment.
print('The number of records left after missing value treatment are', len(df.index))

The number of records left after missing value treatment are 1638106


In [12]:
# 1648824 is the row count of the raw frame (the shape printed right after loading);
# the difference to the current length is the number of rows removed so far.
print('The percentage of records dropped during null value treatment is',round((1648824-len(df.index))*100/1648824,4))

The percentage of records dropped during null value treatment is 0.65


In [13]:
# Restrict the data to the required period, July 2015 to June 2016:
# remove 2015 orders from before July and 2016 orders from after June.
before_window = (df['year'] == 2015) & (df['month'] < 7)
after_window = (df['year'] == 2016) & (df['month'] > 6)
df = df.loc[~(before_window | after_window)]
print('The number of records left are', len(df))

The number of records left are 1637498


In [14]:
# Keep only the three product sub-categories under study.
target_sub_categories = ['GamingAccessory', 'CameraAccessory', 'HomeAudio']
df = df[df['product_analytic_sub_category'].isin(target_sub_categories)]
print('The number of records left are', len(df))

The number of records left are 564395


In [15]:
# Rows that share order date, order id, order item id and units are treated as duplicates;
# the first occurrence of each is kept.
dedup_keys = ['order_date', 'order_id', 'order_item_id', 'units']
df = df.drop_duplicates(subset=dedup_keys)
print('The number of records left are', len(df))

The number of records left are 524114


In [16]:
# Discard rows whose product_mrp is exactly 0.
has_mrp = df['product_mrp'] != 0
df = df.loc[has_mrp]
print('The number of records left are', len(df))

The number of records left are 520915


In [17]:
# Discard rows whose gmv is exactly 0.
has_gmv = df['gmv'] != 0
df = df.loc[has_gmv]
print('The number of records left are', len(df))

The number of records left are 520668


In [18]:
# Discard rows whose units is exactly 0.
has_units = df['units'] != 0
df = df.loc[has_units]
print('The number of records left are', len(df))

The number of records left are 520668


There are no records where units is 0.

In [19]:
# Invalid GMV values: a row's GMV should not exceed MRP multiplied by the units sold.
# Rows that violate this are removed.
max_possible_gmv = df['product_mrp'] * df['units']
gmv_too_high = df['gmv'] > max_possible_gmv
df = df.loc[~gmv_too_high]
print('The number of records left are', len(df))

The number of records left are 507934


Invalid values for product_mrp, units and gmv have been treated.

In [20]:
# deliverybdays: list the rows that carry a negative value.
negative_bdays = df['deliverybdays'].lt(0)
df.loc[negative_bdays]

Unnamed: 0,fsn_id,order_date,year,month,order_id,order_item_id,gmv,units,deliverybdays,deliverycdays,s1_fact.order_payment_type,sla,cust_id,pincode,product_analytic_super_category,product_analytic_category,product_analytic_sub_category,product_analytic_vertical,product_mrp,product_procurement_sla
978985,ACCEB2GBYXXV8Y3Y,2016-04-16 13:08:37,2016,4,1570438645116600.0,3.00037347717573e+17,393.0,1,-71.0,-836.0,Prepaid,6,-6.00040470643675e+18,2.1210794528446697e+18,CE,EntertainmentSmall,HomeAudio,HomeAudioSpeaker,1250,3
1012953,ACCEG47YXBJBKNZZ,2016-04-11 13:01:01,2016,4,4568270260855700.0,3.00037347717509e+17,1349.0,1,-39.0,-466.0,Prepaid,3,6.87988512636699e+17,-7.008123264312e+18,CE,EntertainmentSmall,HomeAudio,HomeAudioSpeaker,2999,2
1013347,ACCEG6BFZCKUVGBE,2016-04-28 16:15:26,2016,4,3586953427570300.0,3586953427570300.0,2480.0,2,-72.0,-848.0,Prepaid,6,2.3577288637330493e+18,6.51333054134398e+18,CE,CameraAccessory,CameraAccessory,CameraBatteryCharger,1840,2
1020219,ACCEYX9WWGEZYDUU,2016-04-27 00:17:34,2016,4,3583309822353700.0,3583309822353700.0,2745.0,5,-72.0,-844.0,Prepaid,7,2.3577288637330493e+18,6.51333054134398e+18,CE,CameraAccessory,CameraAccessory,CameraBattery,849,3
1034307,BBXD7KJWYQGJWYRP,2016-04-27 14:26:28,2016,4,2586025370724800.0,2586025370724800.0,4850.0,1,-41.0,-482.0,Prepaid,4,1.72086036317008e+17,-7.008123264312e+18,CE,EntertainmentSmall,HomeAudio,BoomBox,5199,3
1072470,ACCCXGEQHJVZ8QEB,2016-05-27 11:03:00,2016,5,3611823881854700.0,3611823881854700.0,9950.0,1,-14.0,-16.0,Prepaid,10,-4.13555681519322e+18,-5.42875743641255e+18,CE,CameraAccessory,CameraAccessory,Lens,16959,3
1075097,ACCD6KRGFFSWFYFN,2016-05-16 11:40:30,2016,5,4602342668174000.0,4602342668174000.0,396.0,1,-45.0,-532.0,COD,3,6.17816781724858e+18,-3.03201165845424e+17,CE,CameraAccessory,CameraAccessory,CameraBatteryCharger,625,5
1101685,ACCE8FTBXFKBNZMY,2016-05-16 19:09:13,2016,5,3602609539861100.0,3602609539861100.0,155.0,1,-56.0,-655.0,COD,6,7.05765652417432e+18,8.96557551844073e+18,CE,CameraAccessory,CameraAccessory,CameraTripod,1599,1
1113136,ACCEAFFNP7RDKFQ7,2016-05-09 19:37:07,2016,5,1596580081411600.0,1596580081411600.0,208.0,1,-73.0,-859.0,COD,5,-1.9507564691994097e+18,-4.84769369350052e+18,CE,CameraAccessory,CameraAccessory,Flash,1299,2


In [21]:
# Remove the rows with a negative deliverybdays value.
negative_bdays = df['deliverybdays'] < 0
df = df.loc[~negative_bdays]
print('The number of records left are', len(df))

The number of records left are 507925


In [22]:
# deliverycdays: list any rows that carry a negative value.
negative_cdays = df['deliverycdays'].lt(0)
df.loc[negative_cdays]

Unnamed: 0,fsn_id,order_date,year,month,order_id,order_item_id,gmv,units,deliverybdays,deliverycdays,s1_fact.order_payment_type,sla,cust_id,pincode,product_analytic_super_category,product_analytic_category,product_analytic_sub_category,product_analytic_vertical,product_mrp,product_procurement_sla


In [23]:
# product_procurement_sla: list the rows that carry a negative value.
negative_sla = df['product_procurement_sla'].lt(0)
df.loc[negative_sla]

Unnamed: 0,fsn_id,order_date,year,month,order_id,order_item_id,gmv,units,deliverybdays,deliverycdays,s1_fact.order_payment_type,sla,cust_id,pincode,product_analytic_super_category,product_analytic_category,product_analytic_sub_category,product_analytic_vertical,product_mrp,product_procurement_sla
429,ACCCX3SGDZFGCGHX,2015-10-19 08:50:22,2015,10,2420795971167500.00,2420795971167500.00,21669.00,1,0.00,0.00,COD,4,3653244663466319872.00,2381383594627200000.00,CE,CameraAccessory,CameraAccessory,Lens,23700,-1
430,ACCCX3SGDZFGCGHX,2015-10-12 14:03:26,2015,10,4414939525658400.00,4414939525658400.00,21690.00,1,0.00,0.00,COD,3,-3724406161185070080.00,-8379280658621330432.00,CE,CameraAccessory,CameraAccessory,Lens,23700,-1
431,ACCCX3SGDZFGCGHX,2015-10-18 13:10:37,2015,10,3420092062764200.00,3420092062764200.00,21495.00,1,0.00,0.00,Prepaid,8,-6891927379305790464.00,5880641039672890368.00,CE,CameraAccessory,CameraAccessory,Lens,23700,-1
432,ACCCX3SGDZFGCGHX,2015-10-16 19:28:27,2015,10,2418590998298500.00,2418590998298500.00,19919.00,1,0.00,0.00,Prepaid,5,-5342522533301480448.00,-8271592535801180160.00,CE,CameraAccessory,CameraAccessory,Lens,23700,-1
433,ACCCX3SGDZFGCGHX,2015-10-16 21:16:25,2015,10,2418656047810100.00,2418656047810100.00,21370.00,1,0.00,0.00,Prepaid,6,7189102162087700480.00,-4383811865114310144.00,CE,CameraAccessory,CameraAccessory,Lens,23700,-1
434,ACCCX3SGDZFGCGHX,2015-10-17 11:03:54,2015,10,1419151259302000.00,1419151259302000.00,19919.00,1,0.00,0.00,Prepaid,7,-783121411427878016.00,-8372153889257870336.00,CE,CameraAccessory,CameraAccessory,Lens,23700,-1
435,ACCCX3SGDZFGCGHX,2015-10-15 18:42:18,2015,10,3417566413201200.00,3417566413201200.00,21669.00,1,0.00,0.00,COD,4,-8693637373675490304.00,8067096395579440128.00,CE,CameraAccessory,CameraAccessory,Lens,23700,-1
436,ACCCX3SGDZFGCGHX,2015-10-29 11:42:36,2015,10,1429536086418200.00,1429536086418200.00,21690.00,1,0.00,0.00,Prepaid,3,7529521755334330368.00,-5922363185279490048.00,CE,CameraAccessory,CameraAccessory,Lens,23700,-1
437,ACCCX3SGDZFGCGHX,2015-10-22 19:40:29,2015,10,4423782838868800.00,4423782838868800.00,21669.00,1,0.00,0.00,COD,4,1697893305906180096.00,-6676159501139730432.00,CE,CameraAccessory,CameraAccessory,Lens,23700,-1
438,ACCCX3SGDZFGCGHX,2015-10-16 16:20:05,2015,10,3418478074445800.00,3418478074445800.00,21685.00,1,0.00,0.00,Prepaid,2,-5971235422041490432.00,-8976206344846889984.00,CE,CameraAccessory,CameraAccessory,Lens,23700,-1


In [24]:
# A procurement time cannot be negative, so rows with product_procurement_sla < 0 are removed.
negative_sla = df['product_procurement_sla'] < 0
df = df.loc[~negative_sla]
print('The number of records left are', len(df))

The number of records left are 497887


In [25]:
# Parse the order_date strings ('YYYY-MM-DD HH:MM:SS') into a proper datetime column.
parsed_order_ts = pd.to_datetime(df['order_date'], format='%Y-%m-%d %H:%M:%S')
df['order_datetime'] = parsed_order_ts

In [26]:
# Overwrite order_date with the calendar date only (time of day stays in order_datetime).
order_timestamps = df['order_datetime']
df['order_date'] = order_timestamps.dt.date

### Dropping columns which aren't needed

In [27]:
# Candidate for removal: product_analytic_super_category appears to hold a single value.
# Count the occurrences of each distinct value to confirm.
df['product_analytic_super_category'].value_counts()

CE    497887
Name: product_analytic_super_category, dtype: int64

In [28]:
# Start the list of columns to remove with the single-valued super category.
cols_to_drop = [
    'product_analytic_super_category',
]

In [29]:
# Identifier-style columns (product, order, order item, customer, pincode) are not used in the analysis either.
identifier_cols = ['fsn_id', 'order_id', 'order_item_id', 'cust_id', 'pincode']
cols_to_drop += identifier_cols

In [30]:
# Remove every column collected in cols_to_drop.
df = df.drop(columns=cols_to_drop)

#### The data is clean now. We have ~5 lakh (about 500,000) records with which we can proceed to feature engineering, EDA, and model building.
#### Any other columns which aren't needed will be dropped in further analysis.

### Deriving new features

In [31]:
# Creating the ISO week number from the order timestamp.
# Series.dt.week is deprecated and was removed in pandas 2.0, so prefer dt.isocalendar().week
# when the installed pandas provides it and fall back to dt.week on older versions.
order_dt = df['order_datetime'].dt
if hasattr(order_dt, 'isocalendar'):
    # isocalendar() yields a nullable UInt32 column; cast to int64 to match what dt.week returned.
    # NOTE(review): the cast assumes order_datetime holds no NaT values.
    df['week'] = order_dt.isocalendar().week.astype('int64')
else:
    df['week'] = order_dt.week
df.head()

Unnamed: 0,order_date,year,month,gmv,units,deliverybdays,deliverycdays,s1_fact.order_payment_type,sla,product_analytic_category,product_analytic_sub_category,product_analytic_vertical,product_mrp,product_procurement_sla,order_datetime,week
0,2015-10-17,2015,10,6400.0,1,0.0,0.0,COD,5,CameraAccessory,CameraAccessory,CameraTripod,7190,0,2015-10-17 15:11:54,42
1,2015-10-19,2015,10,6900.0,1,0.0,0.0,COD,7,CameraAccessory,CameraAccessory,CameraTripod,7190,0,2015-10-19 10:07:22,43
2,2015-10-20,2015,10,1990.0,1,0.0,0.0,COD,10,CameraAccessory,CameraAccessory,CameraTripod,2099,3,2015-10-20 15:45:56,43
3,2015-10-14,2015,10,1690.0,1,0.0,0.0,Prepaid,4,CameraAccessory,CameraAccessory,CameraTripod,2099,3,2015-10-14 12:05:15,42
4,2015-10-17,2015,10,1618.0,1,0.0,0.0,Prepaid,6,CameraAccessory,CameraAccessory,CameraTripod,2099,3,2015-10-17 21:25:03,42


In [32]:
# Checking the unique values for week (before the 2016 weeks are shifted)
df.week.unique()

array([42, 43, 41, 44, 40, 45, 46, 16, 10,  8, 48, 47, 38, 17, 14,  2,  7,
        5, 11, 13, 39,  9, 35,  3,  4, 49, 53, 12, 18, 50, 51,  6, 52,  1,
       15, 21, 19, 22, 20, 23, 24, 25, 26, 29, 30, 28, 31, 37, 27, 34, 33,
       32, 36], dtype=int64)

In [33]:
# Week numbers restart at 1 in January 2016. Shifting weeks 1-26 by 53 makes the
# numbering run on continuously from the 2015 weeks.
def week(x):
    """Return ``x + 53`` when ``x`` lies in 1..26 (inclusive), otherwise ``x`` unchanged."""
    return x + 53 if 1 <= x <= 26 else x
# Apply the shift to every row of the week column.
df['week'] = df['week'].apply(week)

In [34]:
# checking the unique values again, after the shift applied to weeks 1-26
df.week.unique()

array([42, 43, 41, 44, 40, 45, 46, 69, 63, 61, 48, 47, 38, 70, 67, 55, 60,
       58, 64, 66, 39, 62, 35, 56, 57, 49, 53, 65, 71, 50, 51, 59, 52, 54,
       68, 74, 72, 75, 73, 76, 77, 78, 79, 29, 30, 28, 31, 37, 27, 34, 33,
       32, 36], dtype=int64)

The Week Numbers now range from 27 to 79

In [35]:
# list_price: the per-unit selling price, i.e. GMV divided by the number of units sold.
df['list_price'] = df['gmv'].div(df['units'])
df.head()

Unnamed: 0,order_date,year,month,gmv,units,deliverybdays,deliverycdays,s1_fact.order_payment_type,sla,product_analytic_category,product_analytic_sub_category,product_analytic_vertical,product_mrp,product_procurement_sla,order_datetime,week,list_price
0,2015-10-17,2015,10,6400.0,1,0.0,0.0,COD,5,CameraAccessory,CameraAccessory,CameraTripod,7190,0,2015-10-17 15:11:54,42,6400.0
1,2015-10-19,2015,10,6900.0,1,0.0,0.0,COD,7,CameraAccessory,CameraAccessory,CameraTripod,7190,0,2015-10-19 10:07:22,43,6900.0
2,2015-10-20,2015,10,1990.0,1,0.0,0.0,COD,10,CameraAccessory,CameraAccessory,CameraTripod,2099,3,2015-10-20 15:45:56,43,1990.0
3,2015-10-14,2015,10,1690.0,1,0.0,0.0,Prepaid,4,CameraAccessory,CameraAccessory,CameraTripod,2099,3,2015-10-14 12:05:15,42,1690.0
4,2015-10-17,2015,10,1618.0,1,0.0,0.0,Prepaid,6,CameraAccessory,CameraAccessory,CameraTripod,2099,3,2015-10-17 21:25:03,42,1618.0


In [36]:
# Discount percentage: how far the list price sits below the MRP, as a percentage of the MRP,
# rounded to two decimals.
mrp = df['product_mrp']
discount_fraction = (mrp - df['list_price']) / mrp
df['discount_percentage'] = (100 * discount_fraction).round(2)

In [37]:
# creating a payday flag
# if the order date is close (a difference of 1 day) to pay day in Ontario (1st and 15th), we mark the column as 1, else 0
# The integer day of month is compared directly. The earlier strftime('%d') comparison produced
# zero-padded strings ('01', '02'), so the '1' and '2' entries never matched and orders placed
# on the 1st and 2nd were never flagged.
# NOTE(review): 30 and 31 are both kept from the original list, so the 30th is flagged even in
# 31-day months and the end of February is never flagged - confirm this is intended.
PAYDAY_WINDOW_DAYS = {1, 2, 14, 15, 16, 30, 31}
df['payday_flag'] = df['order_date'].apply(lambda d: 1 if d.day in PAYDAY_WINDOW_DAYS else 0)
df.head()

Unnamed: 0,order_date,year,month,gmv,units,deliverybdays,deliverycdays,s1_fact.order_payment_type,sla,product_analytic_category,product_analytic_sub_category,product_analytic_vertical,product_mrp,product_procurement_sla,order_datetime,week,list_price,discount_percentage,payday_flag
0,2015-10-17,2015,10,6400.0,1,0.0,0.0,COD,5,CameraAccessory,CameraAccessory,CameraTripod,7190,0,2015-10-17 15:11:54,42,6400.0,10.99,0
1,2015-10-19,2015,10,6900.0,1,0.0,0.0,COD,7,CameraAccessory,CameraAccessory,CameraTripod,7190,0,2015-10-19 10:07:22,43,6900.0,4.03,0
2,2015-10-20,2015,10,1990.0,1,0.0,0.0,COD,10,CameraAccessory,CameraAccessory,CameraTripod,2099,3,2015-10-20 15:45:56,43,1990.0,5.19,0
3,2015-10-14,2015,10,1690.0,1,0.0,0.0,Prepaid,4,CameraAccessory,CameraAccessory,CameraTripod,2099,3,2015-10-14 12:05:15,42,1690.0,19.49,1
4,2015-10-17,2015,10,1618.0,1,0.0,0.0,Prepaid,6,CameraAccessory,CameraAccessory,CameraTripod,2099,3,2015-10-17 21:25:03,42,1618.0,22.92,0


In [38]:
# importing holiday data
# Importing Holidays for Canada ontario State for 2015-2016 Years
# The library is imported under an alias inside this cell because the name `holidays` is rebound
# below to the resulting DataFrame (later cells use it under that name). Without the alias the
# cell fails when it is executed a second time, since `holidays` no longer refers to the module.
import holidays as holidays_lib

ontario_holidays = holidays_lib.CA(prov='ON', years=[2015, 2016])
# One row per holiday date, stored as datetime64 in the column 'holiday_flag'.
holidays = pd.DataFrame({'holiday_flag': pd.to_datetime(list(ontario_holidays.keys()))})
holidays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 1 columns):
holiday_flag    21 non-null datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 296.0 bytes


In [39]:
# creating holiday flag
# Convert the holiday column to plain `date` objects so it is comparable with df['order_date'].
# Going through pd.to_datetime keeps this line valid when the cell is executed again.
holidays['holiday_flag'] = pd.to_datetime(holidays['holiday_flag']).dt.date
holiday_dates = set(holidays['holiday_flag'])

# The earlier left-merge renumbered the rows 0..n-1; that is preserved here for later cells.
df = df.reset_index(drop=True)
# 1 when the order date is a listed holiday, else 0. A membership test replaces the
# merge + whole-frame fillna(0), which zero-filled missing values in every column,
# and it makes the cell safe to re-run (no duplicated holiday_flag_x / _y columns).
df['holiday_flag'] = df['order_date'].isin(holiday_dates).astype('int64')

In [None]:
# creating a column product type
# if the gmv is at or above the 80th percentile, it can be qualified as luxury
# else it can be considered as mass market
# The percentile is computed once up front; evaluating df.gmv.quantile(.8) inside a per-row
# lambda recomputes it for every record, which is quadratic in the number of rows.
luxury_threshold = df['gmv'].quantile(.8)
df['product_type'] = np.where(df['gmv'] >= luxury_threshold, 'luxury', 'mass_market')