In [1]:
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
url="https://raw.githubusercontent.com/sumeyyeozel/csv/main/Groceries_dataset.csv"
groceries=pd.read_csv(url)

In [3]:
groceries.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [4]:
groceries.shape

(38765, 3)

In [5]:
groceries.dtypes

Member_number       int64
Date               object
itemDescription    object
dtype: object

In [6]:
groceries.describe()

Unnamed: 0,Member_number
count,38765.0
mean,3003.641868
std,1153.611031
min,1000.0
25%,2002.0
50%,3005.0
75%,4007.0
max,5000.0


In [7]:
groceries.columns

Index(['Member_number', 'Date', 'itemDescription'], dtype='object')

In [8]:
groceries.index

RangeIndex(start=0, stop=38765, step=1)

In [9]:
groceries['Date']=pd.to_datetime(groceries['Date'])

In [10]:
groceries.dtypes

Member_number               int64
Date               datetime64[ns]
itemDescription            object
dtype: object

In [11]:
groceries.isna().sum()

Member_number      0
Date               0
itemDescription    0
dtype: int64

In [12]:
groceries.isnull().sum()

Member_number      0
Date               0
itemDescription    0
dtype: int64

In [13]:
groceries['Member_number'].value_counts()

3180    36
3737    33
3050    33
2051    33
2625    31
        ..
2503     2
3301     2
1775     2
3723     2
2417     2
Name: Member_number, Length: 3898, dtype: int64

In [14]:
groceries['Date'].value_counts()

2015-01-21    96
2015-07-21    93
2015-11-29    92
2015-08-08    92
2015-04-30    91
              ..
2014-06-29    26
2014-04-07    24
2015-03-16    23
2015-03-17    23
2015-01-09    22
Name: Date, Length: 728, dtype: int64

In [15]:
groceries['itemDescription'].value_counts()

whole milk               2502
other vegetables         1898
rolls/buns               1716
soda                     1514
yogurt                   1334
                         ... 
rubbing alcohol             5
bags                        4
baby cosmetics              3
kitchen utensil             1
preservation products       1
Name: itemDescription, Length: 167, dtype: int64

In [16]:
groceries.groupby('itemDescription').mean()

Unnamed: 0_level_0,Member_number
itemDescription,Unnamed: 1_level_1
Instant food products,3159.216667
UHT-milk,2998.207430
abrasive cleaner,3642.909091
artif. sweetener,3076.241379
baby cosmetics,2793.666667
...,...
white bread,2917.381215
white wine,3065.767045
whole milk,3014.649880
yogurt,2910.379310


In [17]:
topten = groceries['itemDescription'].value_counts().sort_values(ascending=False)[:10]
fig = px.bar(x= topten.index, y= topten.values,
            color=groceries['itemDescription'].value_counts().sort_values(ascending=False)[:10])
fig.update_layout(title_text= "Top 10 frequently sold products ", xaxis_title= "Products", yaxis_title="Number of item sold")
fig.show()

In [18]:
leastten = groceries['itemDescription'].value_counts().sort_values(ascending=True)[:10]
fig = px.bar(x= leastten.index, y= leastten.values, 
             color=groceries['itemDescription'].value_counts().sort_values(ascending=True)[:10])
fig.update_layout(title_text= "Least 10 frequently sold products ", xaxis_title= "Products", yaxis_title="Number of item sold")
fig.show()

In [19]:
pd.DataFrame(groceries['Member_number'].value_counts().sort_values(ascending=False))[:10]

Unnamed: 0,Member_number
3180,36
3050,33
2051,33
3737,33
2625,31
3915,31
2433,31
2271,31
3872,30
2394,29


**find the dates on which highest sale was made.**

In [20]:
fig1 = px.bar(groceries["Date"].value_counts(ascending=False), 
              orientation= "v", 
              color = groceries["Date"].value_counts(ascending=False),
              
               labels={'value':'Count', 'index':'Date','color':'Meter'})

fig1.update_layout(title_text="Exploring highest sales by  date")

fig1.show()

In [21]:
import datetime as dt
newdate = groceries['Date'].dt.strftime('%Y-%m')
print(newdate) 

0        2015-07
1        2015-05
2        2015-09
3        2015-12
4        2015-01
          ...   
38760    2014-08
38761    2014-02
38762    2014-04
38763    2014-03
38764    2014-12
Name: Date, Length: 38765, dtype: object


In [22]:
fig2 = px.bar(newdate.value_counts(ascending=False), 
              orientation= "v", 
              color = newdate.value_counts(ascending=False),
              
               labels={'value':'Count', 'index':'Date','color':'Meter'})

fig2.update_layout(title_text="Exploring highest sales by  date")

fig2.show()

In [23]:
products=groceries['itemDescription'].unique()

In [24]:
products[:10]

array(['tropical fruit', 'whole milk', 'pip fruit', 'other vegetables',
       'rolls/buns', 'pot plants', 'citrus fruit', 'beef', 'frankfurter',
       'chicken'], dtype=object)

**One Hot Encoder**

In [25]:
one_hot = pd.get_dummies(groceries['itemDescription'])
groceries1=groceries.copy()
groceries1.drop(['itemDescription'], inplace =True, axis=1)

groceries1 = groceries1.join(one_hot)

groceries1.head()

Unnamed: 0,Member_number,Date,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,1808,2015-07-21,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2552,2015-05-01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2300,2015-09-19,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1187,2015-12-12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3037,2015-01-02,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [26]:
groceries2 = groceries1.groupby(['Member_number', 'Date'])[products[:]].sum()

groceries2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
Member_number,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1000,2014-06-24,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,2015-03-15,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,2015-05-27,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,2015-07-24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,2015-11-25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
#Reset the index of the newly formed dataset.
groceries2 = groceries2.reset_index()[products]
groceries2.head()

Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
def product_names(x):
    for product in products:
        if x[product] >0:
            x[product] = product
    return x
#Apply the created function on data2 dataset.
groceries2 = groceries2.apply(product_names, axis=1)
groceries2.head()

Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
0,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**The bottom line is required for apriori algorithm.**

In [29]:
#Filter out the values from the groceries frame groceries2
x = groceries2.values
#Convert into list values in each row if value is not zero
x = [sub[~(sub==0)].tolist() for sub in x if sub [sub != 0].tolist()]
transactions = x
transactions[0:10] 

[['whole milk', 'pastry', 'salty snack'],
 ['whole milk', 'yogurt', 'sausage', 'semi-finished bread'],
 ['soda', 'pickled vegetables'],
 ['canned beer', 'misc. beverages'],
 ['sausage', 'hygiene articles'],
 ['whole milk', 'rolls/buns', 'sausage'],
 ['whole milk', 'soda'],
 ['frankfurter', 'soda', 'whipped/sour cream'],
 ['frankfurter', 'curd'],
 ['beef', 'white bread']]

**Apriori Algorithm**

Apriori is an algorithm for frequent itemset mining and association rule learning over relational databases. It proceeds by identifying the frequent individual items in the database and extending them to larger and larger item sets as long as those item sets appear sufficiently often in the database. The frequent itemsets determined by Apriori can be used to determine association rules which highlight general trends in the database: this has applications in domains such as market basket analysis.

In [30]:
!pip install apyori

Collecting apyori
  Downloading apyori-1.1.2.tar.gz (8.6 kB)
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25l[?25hdone
  Created wheel for apyori: filename=apyori-1.1.2-py3-none-any.whl size=5974 sha256=a36ed03ba1b778f1e4863b7a2137feabe15b37dcee43ff0d3270f366ece96fbf
  Stored in directory: /root/.cache/pip/wheels/cb/f6/e1/57973c631d27efd1a2f375bd6a83b2a616c4021f24aab84080
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


In [31]:
import apyori
from apyori import apriori

**Association rules** is used to find relationships between attributes in large databases. An association rule, A=> B, will be of the form” for a set of transactions, some value of itemset A determines the values of itemset B under the condition in which minimum support and confidence are met”.

In [32]:
associations = apriori(transactions, min_support = 0.00030, min_confidence = 0.06, min_lift = 3, max_length = 2, target = "associations")
association_results = list(associations)
association_results[:5]

[RelationRecord(items=frozenset({'liver loaf', 'fruit/vegetable juice'}), support=0.00040098910646260775, ordered_statistics=[OrderedStatistic(items_base=frozenset({'liver loaf'}), items_add=frozenset({'fruit/vegetable juice'}), confidence=0.12, lift=3.5276227897838903)]),
 RelationRecord(items=frozenset({'meat', 'roll products '}), support=0.0003341575887188398, ordered_statistics=[OrderedStatistic(items_base=frozenset({'roll products '}), items_add=frozenset({'meat'}), confidence=0.06097560975609757, lift=3.620547812620984)]),
 RelationRecord(items=frozenset({'seasonal products', 'soups'}), support=0.0003341575887188398, ordered_statistics=[OrderedStatistic(items_base=frozenset({'soups'}), items_add=frozenset({'seasonal products'}), confidence=0.10416666666666667, lift=14.704205974842768)]),
 RelationRecord(items=frozenset({'sugar', 'spread cheese'}), support=0.00040098910646260775, ordered_statistics=[OrderedStatistic(items_base=frozenset({'spread cheese'}), items_add=frozenset({'su

In [33]:
#iterate through the list of associations and for each item
for item in association_results:
    
    #for each item filter out the item pair and create item list containing individual items in the itemset
    itemset = item[0]
    items = [x for x in itemset]
    
    #Print the relationship( First value in items to second value in items)
    print("Rule : ", items[0], " -> " + items[1])
    
    #Print support,confidence and lift value of each itemset
    print("Support : ", str(item[1]))
    print("Confidence : ",str(item[2][0][2]))
    print("Lift : ", str(item[2][0][3]))
    print("===================") 

Rule :  liver loaf  -> fruit/vegetable juice
Support :  0.00040098910646260775
Confidence :  0.12
Lift :  3.5276227897838903
Rule :  meat  -> roll products 
Support :  0.0003341575887188398
Confidence :  0.06097560975609757
Lift :  3.620547812620984
Rule :  seasonal products  -> soups
Support :  0.0003341575887188398
Confidence :  0.10416666666666667
Lift :  14.704205974842768
Rule :  sugar  -> spread cheese
Support :  0.00040098910646260775
Confidence :  0.06
Lift :  3.3878490566037733
