## This code is related to market basket analysis by dividing the data into two parts
## One contains weekdays data and second contains weekdends data only

#### importing the necessary modules

In [1]:
#importing the modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### importing the data and checking the first few columns

In [2]:
cols=['SHDR_IDL','PROD_IDL','PRTYPE_IDL','RPRICE','QUANTITY','DATEORDER']
dtl=pd.read_csv("SalesDTL.csv",usecols=cols,parse_dates=['DATEORDER'],
                dtype={'SHDR_IDL':int,'PROD_IDL':int,'PRTYPE_IDL':int,'QUANTITY':int})

#converting column names to lower case
dtl.columns=dtl.columns.str.lower()

#dtl=dtl.set_index("dateorder")
dtl['rprice']=dtl['rprice'].abs()
dtl['total_price']=dtl['rprice']*dtl['quantity']
#dtl=dtl.loc['2016-03-06':,]
dtl['prod_idl']=dtl['prod_idl'].astype('category')
dtl.head()

Unnamed: 0,shdr_idl,prod_idl,prtype_idl,rprice,quantity,dateorder,total_price
0,1,3,2,7.02,1,2005-02-13 03:35:52,7.02
1,1,4,2,3.17,1,2005-02-13 03:35:52,3.17
2,2,3,2,9.95,1,2005-03-08 09:18:41,9.95
3,3,3,2,9.95,1,2005-03-08 09:21:59,9.95
4,4,28,13,2.79,1,2008-12-19 09:21:48,2.79


In [3]:
dtl.shape

(565304, 7)

## Since the data is not consistant we sliced the data from 6th march 2016 to 5th november 2017

In [4]:
sliced_data=dtl[dtl['shdr_idl']>=31424]

In [5]:
sliced_data.head()

Unnamed: 0,shdr_idl,prod_idl,prtype_idl,rprice,quantity,dateorder,total_price
107838,31424,51,13,3.5,1,2016-03-06 14:33:35,3.5
107839,31425,163,26,39.8,4,2016-03-06 14:35:41,159.2
107840,31425,81,26,8.5,1,2016-03-06 14:35:41,8.5
107841,31426,163,26,19.9,2,2016-03-06 14:36:35,39.8
107842,31427,428,26,19.9,2,2016-03-06 14:37:43,39.8


In [6]:
sliced_data.shape

(457462, 7)

In [7]:
sliced_data['weekday_name']=sliced_data.dateorder.dt.weekday_name

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [8]:
sliced_data.head(8)

Unnamed: 0,shdr_idl,prod_idl,prtype_idl,rprice,quantity,dateorder,total_price,weekday_name
107838,31424,51,13,3.5,1,2016-03-06 14:33:35,3.5,Sunday
107839,31425,163,26,39.8,4,2016-03-06 14:35:41,159.2,Sunday
107840,31425,81,26,8.5,1,2016-03-06 14:35:41,8.5,Sunday
107841,31426,163,26,19.9,2,2016-03-06 14:36:35,39.8,Sunday
107842,31427,428,26,19.9,2,2016-03-06 14:37:43,39.8,Sunday
107843,31427,300,10,0.0,1,2016-03-06 14:37:43,0.0,Sunday
107844,31427,278,10,0.0,1,2016-03-06 14:37:43,0.0,Sunday
107845,31426,57,13,5.0,1,2016-03-06 14:39:41,5.0,Sunday


## Slicing the data for weekends

In [9]:
weekends=sliced_data[(sliced_data['weekday_name']=='Saturday') | (sliced_data['weekday_name']=='Sunday')]

In [10]:
weekends['weekday_name'].value_counts()

Saturday    96646
Sunday      80406
Name: weekday_name, dtype: int64

## Slicing the data for weekdays

In [11]:
weekdays=sliced_data[(sliced_data['weekday_name']=='Monday') | (sliced_data['weekday_name']=='Tuesday')
                    |(sliced_data['weekday_name']=='Wednesday')|(sliced_data['weekday_name']=='Thursday')
                    |(sliced_data['weekday_name']=='Friday')]

In [12]:
weekdays['weekday_name'].value_counts()

Friday       75994
Thursday     54334
Monday       51444
Tuesday      50103
Wednesday    48535
Name: weekday_name, dtype: int64

In [13]:
len(weekends)+len(weekdays)==len(sliced_data)

True

## arranging data for weekends(making basket)

In [14]:
basket_weekends=weekends.groupby(['shdr_idl', 'prod_idl'])['quantity'].sum().unstack().fillna(0)

In [15]:
# Convert the units to 1 hot encoded values
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1 

In [16]:
basket_sets_weekends = basket_weekends.applymap(encode_units)

In [17]:
basket_sets_weekends.head(8)

prod_idl,51,81,163,57,76,110,242,278,300,428,...,432,263,424,602,528,249,389,426,556,589
shdr_idl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31424,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31425,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31426,0,0,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31427,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
31428,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31429,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31430,0,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31431,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## importing apriori and association rules modules

In [18]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

### Building up frequent item sets on weekends

In [19]:
# Build up the frequent items
frequent_itemsets_weekends = apriori(basket_sets_weekends, min_support=0.04, use_colnames=True)

## The top most ordered items in weekends are 

In [20]:
frequent_itemsets_weekends.sort_values(by='support',ascending=False)

Unnamed: 0,support,itemsets
6,0.264641,(31)
1,0.169055,(163)
17,0.165899,(176)
19,0.144871,(178)
13,0.131307,(65)
2,0.11367,(110)
3,0.11045,(242)
16,0.096438,(239)
14,0.079868,(56)
21,0.073363,"(176, 31)"


## arranging data for weekdays(making basket for weekdays)

In [21]:
basket_weekdays=weekdays.groupby(['shdr_idl', 'prod_idl'])['quantity'].sum().unstack().fillna(0)

In [22]:
basket_sets_weekdays = basket_weekdays.applymap(encode_units)

In [23]:
basket_sets_weekdays.head(8)

prod_idl,242,244,252,51,245,248,260,279,253,42,...,613,614,285,590,263,405,621,612,424,268
shdr_idl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31622,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31623,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31624,0,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31625,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31626,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31627,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
31628,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
31629,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Building frequent item sets for weekdays

In [24]:
# Build up the frequent items
frequent_itemsets_weekdays = apriori(basket_sets_weekdays, min_support=0.04, use_colnames=True)

## the Top most ordered items in weekdays are

In [25]:
frequent_itemsets_weekdays.sort_values(by='support',ascending=False)

Unnamed: 0,support,itemsets
2,0.221014,(163)
11,0.187668,(31)
7,0.166206,(65)
0,0.099233,(242)
14,0.094189,(81)
5,0.093411,(110)
10,0.089132,(56)
1,0.087161,(42)
3,0.08476,(239)
16,0.069804,(74)
