# Install and ensure we have the libraries needed.
!pip install numpy
!pip install scipy
!pip install sklearn
!pip install matplotlib
!pip install seaborn
!pip install pandas
!pip install apyori
!pip install mlxtend

In [1]:
# Import libraries needed.
import numpy as np
import pandas as pd
import scipy.stats as stats
from pandas import DataFrame

import matplotlib.pyplot as plt
import seaborn as sns


from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder


# Data Cleaning and Preparation

In [2]:
# Load the raw market-basket CSV into a pandas DataFrame and preview its first rows.
# Each row holds one transaction spread across the columns Item01..Item20.
df=pd.read_csv('teleco_market_basket.csv')
df.head()

Unnamed: 0,Item01,Item02,Item03,Item04,Item05,Item06,Item07,Item08,Item09,Item10,Item11,Item12,Item13,Item14,Item15,Item16,Item17,Item18,Item19,Item20
0,,,,,,,,,,,,,,,,,,,,
1,Logitech M510 Wireless mouse,HP 63 Ink,HP 65 ink,nonda USB C to USB Adapter,10ft iPHone Charger Cable,HP 902XL ink,Creative Pebble 2.0 Speakers,Cleaning Gel Universal Dust Cleaner,Micro Center 32GB Memory card,YUNSONG 3pack 6ft Nylon Lightning Cable,TopMate C5 Laptop Cooler pad,Apple USB-C Charger cable,HyperX Cloud Stinger Headset,TONOR USB Gaming Microphone,Dust-Off Compressed Gas 2 pack,3A USB Type C Cable 3 pack 6FT,HOVAMP iPhone charger,SanDisk Ultra 128GB card,FEEL2NICE 5 pack 10ft Lighning cable,FEIYOLD Blue light Blocking Glasses
2,,,,,,,,,,,,,,,,,,,,
3,Apple Lightning to Digital AV Adapter,TP-Link AC1750 Smart WiFi Router,Apple Pencil,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,


In [3]:
# Print the dimensions of the raw dataset as (rows, columns).
print(df.shape)

(15002, 20)


In [4]:
# Display the data type of each column.
print(df.dtypes)

Item01    object
Item02    object
Item03    object
Item04    object
Item05    object
Item06    object
Item07    object
Item08    object
Item09    object
Item10    object
Item11    object
Item12    object
Item13    object
Item14    object
Item15    object
Item16    object
Item17    object
Item18    object
Item19    object
Item20    object
dtype: object


In [5]:
# Summarize each column of the raw dataset (count, unique, top, freq).
df.describe()

Unnamed: 0,Item01,Item02,Item03,Item04,Item05,Item06,Item07,Item08,Item09,Item10,Item11,Item12,Item13,Item14,Item15,Item16,Item17,Item18,Item19,Item20
count,7501,5747,4389,3345,2529,1864,1369,981,654,395,256,154,87,47,25,8,4,4,3,1
unique,115,117,115,114,110,106,102,97,88,80,66,50,43,28,19,8,3,3,3,1
top,Dust-Off Compressed Gas 2 pack,Dust-Off Compressed Gas 2 pack,Dust-Off Compressed Gas 2 pack,Dust-Off Compressed Gas 2 pack,Apple USB-C Charger cable,USB 2.0 Printer cable,Apple USB-C Charger cable,Apple USB-C Charger cable,Apple USB-C Charger cable,Apple USB-C Charger cable,TopMate C5 Laptop Cooler pad,Apple USB-C Charger cable,Apple USB-C Charger cable,Apple USB-C Charger cable,ARRIS SURFboard SB8200 Cable Modem,3A USB Type C Cable 3 pack 6FT,SanDisk Ultra 128GB card,Brother Genuine High Yield Toner Cartridge,FEEL2NICE 5 pack 10ft Lighning cable,FEIYOLD Blue light Blocking Glasses
freq,577,484,375,201,153,107,96,67,57,31,22,15,8,4,3,1,2,2,1,1


In [6]:
# Count the null values in each column of the raw dataset.
print(df.isna().sum())


Item01     7501
Item02     9255
Item03    10613
Item04    11657
Item05    12473
Item06    13138
Item07    13633
Item08    14021
Item09    14348
Item10    14607
Item11    14746
Item12    14848
Item13    14915
Item14    14955
Item15    14977
Item16    14994
Item17    14998
Item18    14998
Item19    14999
Item20    15001
dtype: int64


In [7]:
# Drop only the rows in which every column is null (the blank separator rows).
# Rows with at least one item are kept, so partially filled rows still contain nulls.
df.dropna(how='all', inplace=True)


In [8]:
# Print the dimensions (rows, columns) after removing the all-null rows.
print(df.shape)

(7501, 20)


In [9]:
# Re-count the null values per column after removing the all-null rows.
print(df.isna().sum())


Item01       0
Item02    1754
Item03    3112
Item04    4156
Item05    4972
Item06    5637
Item07    6132
Item08    6520
Item09    6847
Item10    7106
Item11    7245
Item12    7347
Item13    7414
Item14    7454
Item15    7476
Item16    7493
Item17    7497
Item18    7497
Item19    7498
Item20    7500
dtype: int64


In [10]:
# Replace the remaining nulls (unused item slots in shorter transactions) with
# empty strings. A scalar fill value is applied to every null cell, so no `axis`
# argument is needed.
df.fillna("", inplace=True)


In [11]:
# Confirm that no null values remain in any column after the fill.
print(df.isna().sum())

Item01    0
Item02    0
Item03    0
Item04    0
Item05    0
Item06    0
Item07    0
Item08    0
Item09    0
Item10    0
Item11    0
Item12    0
Item13    0
Item14    0
Item15    0
Item16    0
Item17    0
Item18    0
Item19    0
Item20    0
dtype: int64


In [12]:
# Print the dimensions (rows, columns) of the cleaned dataset.
print(df.shape)

(7501, 20)


In [13]:
# Summarize each column of the cleaned dataset (count, unique, top, freq).
df.describe()

Unnamed: 0,Item01,Item02,Item03,Item04,Item05,Item06,Item07,Item08,Item09,Item10,Item11,Item12,Item13,Item14,Item15,Item16,Item17,Item18,Item19,Item20
count,7501,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0,7501.0
unique,115,118.0,116.0,115.0,111.0,107.0,103.0,98.0,89.0,81.0,67.0,51.0,44.0,29.0,20.0,9.0,4.0,4.0,4.0,2.0
top,Dust-Off Compressed Gas 2 pack,,,,,,,,,,,,,,,,,,,
freq,577,1754.0,3112.0,4156.0,4972.0,5637.0,6132.0,6520.0,6847.0,7106.0,7245.0,7347.0,7414.0,7454.0,7476.0,7493.0,7497.0,7497.0,7498.0,7500.0


In [14]:
# Preview the first rows of the cleaned DataFrame.
df.head()

Unnamed: 0,Item01,Item02,Item03,Item04,Item05,Item06,Item07,Item08,Item09,Item10,Item11,Item12,Item13,Item14,Item15,Item16,Item17,Item18,Item19,Item20
1,Logitech M510 Wireless mouse,HP 63 Ink,HP 65 ink,nonda USB C to USB Adapter,10ft iPHone Charger Cable,HP 902XL ink,Creative Pebble 2.0 Speakers,Cleaning Gel Universal Dust Cleaner,Micro Center 32GB Memory card,YUNSONG 3pack 6ft Nylon Lightning Cable,TopMate C5 Laptop Cooler pad,Apple USB-C Charger cable,HyperX Cloud Stinger Headset,TONOR USB Gaming Microphone,Dust-Off Compressed Gas 2 pack,3A USB Type C Cable 3 pack 6FT,HOVAMP iPhone charger,SanDisk Ultra 128GB card,FEEL2NICE 5 pack 10ft Lighning cable,FEIYOLD Blue light Blocking Glasses
3,Apple Lightning to Digital AV Adapter,TP-Link AC1750 Smart WiFi Router,Apple Pencil,,,,,,,,,,,,,,,,,
5,UNEN Mfi Certified 5-pack Lightning Cable,,,,,,,,,,,,,,,,,,,
7,Cat8 Ethernet Cable,HP 65 ink,,,,,,,,,,,,,,,,,,
9,Dust-Off Compressed Gas 2 pack,Screen Mom Screen Cleaner kit,Moread HDMI to VGA Adapter,HP 62XL Tri-Color ink,Apple USB-C Charger cable,,,,,,,,,,,,,,,


# Export Data

In [15]:
# Export the cleaned dataset to CSV.
# NOTE: to_csv writes the row index as the first (unnamed) column by default.
df.to_csv('D212_Cleaned_Market_Basket_T3.csv')

# Data Preprocessing

In [16]:
# Convert the DataFrame into a list of transactions: one inner list of item
# values per row.
# NOTE: nulls were replaced with "" earlier in the notebook, so the notna()
# filter no longer removes anything; the "" placeholders stay in each
# transaction and are dropped later as the '' column of the encoded frame.
list_df = df.to_numpy().tolist()
dataset = [[item for item in row if pd.notna(item)] for row in list_df]

In [17]:
# One-hot encode the transactions with a TransactionEncoder: fitting learns the
# distinct item labels and transforming yields a boolean array of True and False
# (rows are transactions, columns are the learned labels in te.columns_).
te = TransactionEncoder()
te_array = te.fit_transform(dataset)
te_array

array([[False,  True, False, ..., False,  True, False],
       [ True, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       ...,
       [ True, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False]])

In [18]:
# Wrap the encoded array in a DataFrame labelled with the encoder's item names,
# then remove the '' column produced by the empty-string placeholders.
encoded_frame = pd.DataFrame(te_array, columns=te.columns_)
df_model = encoded_frame.drop(columns=[''])

# Apriori Algorithm

In [19]:
# Mine frequent itemsets from the one-hot encoded transactions in df_model,
# using a minimum support of 0.03. use_colnames=True reports item names
# instead of column indices.
frequent_itemsets = apriori(df_model, min_support= 0.03 , use_colnames=True)

In [20]:
# Derive association rules from the frequent itemsets, filtering on the
# confidence metric with a minimum threshold of 0.2.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

In [21]:
# Rank the association rules from highest to lowest lift and display them.
result = rules.sort_values(by='lift', ascending=False)
result

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
23,(VIVO Dual LCD Monitor Desk mount),(SanDisk Ultra 64GB card),0.17411,0.098254,0.039195,0.225115,2.291162,0.022088,1.163716
24,(SanDisk Ultra 64GB card),(VIVO Dual LCD Monitor Desk mount),0.098254,0.17411,0.039195,0.398915,2.291162,0.022088,1.373997
12,(SanDisk Ultra 64GB card),(Dust-Off Compressed Gas 2 pack),0.098254,0.238368,0.040928,0.416554,1.747522,0.017507,1.305401
11,(Nylon Braided Lightning to USB cable),(Dust-Off Compressed Gas 2 pack),0.095321,0.238368,0.035729,0.374825,1.572463,0.013007,1.21827
25,(VIVO Dual LCD Monitor Desk mount),(Screen Mom Screen Cleaner kit),0.17411,0.129583,0.035462,0.203675,1.571779,0.0129,1.093043
26,(Screen Mom Screen Cleaner kit),(VIVO Dual LCD Monitor Desk mount),0.129583,0.17411,0.035462,0.273663,1.571779,0.0129,1.137061
14,(Dust-Off Compressed Gas 2 pack),(Screen Mom Screen Cleaner kit),0.238368,0.129583,0.047994,0.201342,1.553774,0.017105,1.08985
13,(Screen Mom Screen Cleaner kit),(Dust-Off Compressed Gas 2 pack),0.129583,0.238368,0.047994,0.37037,1.553774,0.017105,1.20965
18,(Screen Mom Screen Cleaner kit),(HP 61 ink),0.129583,0.163845,0.032129,0.247942,1.513276,0.010898,1.111823
15,(Stylus Pen for iPad),(Dust-Off Compressed Gas 2 pack),0.095054,0.238368,0.033729,0.354839,1.488616,0.011071,1.180529
