In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori           # to obtain the products that fulfill the support threshold
from mlxtend.preprocessing import TransactionEncoder    # to encode data before passing in to apriori algorithm
from mlxtend.frequent_patterns import association_rules # to get the association rules

dat = pd.read_csv('website_link.csv')
dat = dat.drop(columns = ['Unnamed: 0'])   # drop redundant column

In [2]:
# Getting the unique id for each of the url
alist = []             # contains all the unique url
unique_id = []         # contains the index of the urls in alist
for i in range(len(dat['links'])):
    url = dat['links'][i].replace(" ", "")        # to remove all the blank space in url
    if str(url) not in alist:                     # if-else statement to check if url already appended
        alist.append(str(url))
        unique_id.append(len(alist)-1)
    else:
        val = alist.index(str(url))
        unique_id.append(val)
        
df_id = pd.DataFrame({'id': unique_id})
df = pd.concat([dat, df_id], axis= 1)   # attach the unique id for column 'links'
df.tail(10)

Unnamed: 0,links,product,id
48664,http://www.amazon.co.uk/Star-Wars-Costume-Sto...,"Star Wars Costume, Kids Stormtrooper Costume S...",28287
48665,http://www.amazon.co.uk/Rubies-Costume-Star-W...,"Star Wars Costume, Kids Stormtrooper Costume S...",28288
48666,http://www.amazon.co.uk/STAR-WARS-Stormtrooper,"Star Wars Costume, Kids Stormtrooper Costume S...",28289
48667,http://www.amazon.co.uk/Deluxe-Stormtrooper,"Star Wars Costume, Kids Stormtrooper Costume S...",10801
48668,http://www.amazon.co.uk/Bandai-Star-Wars-Fight...,Star Wars 1/72 Y-Wing Starfighter,28290
48669,http://www.amazon.co.uk/Bandai-Star-Wars-Star...,Star Wars 1/72 Y-Wing Starfighter,28291
48670,http://www.amazon.co.uk/Star-Wars-48-Snow-Spe...,Star Wars 1/72 Y-Wing Starfighter,28292
48671,http://www.amazon.co.uk/Bandai-Advanced-Starf...,Star Wars 1/72 Y-Wing Starfighter,28293
48672,http://www.amazon.co.uk/Star-Wars-fighter-res...,Star Wars 1/72 Y-Wing Starfighter,28294
48673,http://www.amazon.co.uk/Millennium,Star Wars 1/72 Y-Wing Starfighter,3455


In [3]:
# convert elements of series to string object
df['id'] = df['id'].apply(str)

In [4]:
# Same product concatenate it into one record, required format for apriori
new_df = df[['product', 'id']].groupby(['product']).transform(lambda x: ','.join(x))
new_df['product'] = df['product']         # attach product according to new dataframe

In [5]:
# Drop all the rows with duplicate values
new_df.drop_duplicates(subset='id', inplace=True)     # id contains concatenated id
new_df.reset_index(drop=True, inplace=True)
print(len(new_df))                                    # to make sure the result is as expected

8711


In [7]:
# Rename the column
new_df.rename(columns={'id': "linked_prod_list"}, inplace=True)

In [8]:
# Rearrange the format into list
for t in range(len(new_df)):
    result = new_df['linked_prod_list'][t].split(",")
    new_df['linked_prod_list'][t] = result

new_df.head(10)

Unnamed: 0,linked_prod_list,product
0,"[0, 1, 2, 3, 4, 5]",Hornby 2014 Catalogue
1,"[6, 7, 8, 9, 10, 11]",FunkyBuys® Large Christmas Holiday Express Fes...
2,"[12, 13, 14, 15, 16, 17]",CLASSIC TOY TRAIN SET TRACK CARRIAGES LIGHT EN...
3,"[18, 19, 20, 21, 22, 23]",Hornby 00 Gauge 0-4-0 Gildenlow Salt Co. Steam...
4,"[24, 25, 26, 27, 28]",20pcs Model Garden Light Double Heads Lamppost...
5,"[29, 30]",Hornby 00 Gauge 230mm BR Bogie Passenger Brake...
6,"[31, 20, 23, 32, 33, 34]",Hornby Santa's Express Train Set
7,"[35, 4, 36, 37, 38, 39]",Hornby Gauge Western Express Digital Train Set...
8,"[40, 41, 40, 42]",Learning Curve Chuggington Interactive Chatsworth
9,"[43, 20, 44, 45, 46, 47]",Hornby Gauge Railroad Mosley Tarmacadam Locomo...


In [9]:
# Pass the url represented by unique IDs into the apriori algorithm
# Encode the data into a sparse dataframe
te = TransactionEncoder()
te_array = te.fit(new_df['linked_prod_list']).transform(new_df['linked_prod_list'], sparse=True)
convert = pd.SparseDataFrame(te_array, columns=te.columns_, default_fill_value=False)

In [10]:
# Apriori algorithm to obtain the frequent itemset
freq_itemsets = apriori(convert, min_support=0.001, use_colnames= True)
freq_itemsets['length'] = freq_itemsets['itemsets'].apply(lambda x: len(x))
freq_itemsets

Unnamed: 0,support,itemsets,length
0,0.001148,(103),1
1,0.004362,(1038),1
2,0.002181,(1040),1
3,0.001378,(1044),1
4,0.001263,(10668),1
5,0.001033,(11087),1
6,0.001033,(1125),1
7,0.001263,(11266),1
8,0.001722,(11314),1
9,0.001148,(11336),1


In [11]:
# Obtain the rules that fulfills the confidence threshold
rules = association_rules(freq_itemsets, metric="confidence", min_threshold = 0.8)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1301),(1300),0.00287,0.005625,0.002411,0.84,149.331429,0.002395,6.214843
1,(1302),(1300),0.001378,0.005625,0.001378,1.0,177.77551,0.00137,inf
2,(1303),(1300),0.003559,0.005625,0.003214,0.903226,160.571429,0.003194,10.275208
3,(1304),(1300),0.004133,0.005625,0.003788,0.916667,162.960884,0.003765,11.932499
4,(1302),(1303),0.001378,0.003559,0.001148,0.833333,234.166667,0.001143,5.978648
5,(1302),(1304),0.001378,0.004133,0.001263,0.916667,221.80787,0.001257,11.950408
6,(1303),(1304),0.003559,0.004133,0.00287,0.806452,195.138889,0.002855,5.145314
7,(1507),(1353),0.001263,0.002181,0.001148,0.909091,416.794258,0.001145,10.976007
8,(14564),(14507),0.001263,0.001607,0.001033,0.818182,509.084416,0.001031,5.491161
9,(14522),(14517),0.001148,0.001607,0.001033,0.9,559.992857,0.001031,9.983928


In [12]:
# Get the important information into a dataframe for further processing
rules_ant = pd.DataFrame({'antecedents': rules['antecedents']})
rules_con = pd.DataFrame({'consequents': rules['consequents']})
rules_support = pd.DataFrame({'support': rules['support']})
rules_conf = pd.DataFrame({'confidence': rules['confidence']})
df_rules = pd.concat([rules_ant, rules_con,rules_support, rules_conf], axis= 1)

In [13]:
# Loop through both 'antecedents' and 'consquents' column to get the list of unique ids in the rules
new_list = []
for i in df_rules['antecedents'].unique():
    x = i
    string = ""
    for t in x:
        if len(string) < 4:
            string += t
    if string not in new_list:
        new_list.append(string)
        
for j in df_rules['consequents'].unique():
    x = j
    string = ""
    for t in x:
        if len(string) < 4:
            string += t
    if string not in new_list:
        new_list.append(string)

print(new_list)

['1301', '1302', '1303', '1304', '1507', '14564', '14522', '14594', '1489', '1491', '16896', '1865', '43', '25267', '25614', '3782', '3783', '6062', '6083', '6094', '1300', '1353', '14507', '14517', '1492', '16895', '1864', '20', '25250', '25615', '6063']


In [14]:
# Based on the unique ids, get the associated urls 
url_list = []
for i in new_list:
    val = int(i)
    url_list.append(alist[val])   # alist contains the urls, can be access through the index value

In [15]:
# Requests for the product name through web scraping
import requests                           # library to make request, necessary for web scraping
from bs4 import BeautifulSoup             # library for pulling data out of HTML and XML files.
import time

def get_product(url):
    """
    This function is to configure and send request to the server based on each url
    :param url: website link where web scraping is done
    :returns: return the paragraph that contains the desired result
    """
    time.sleep(2)   # to avoid being recognize as spam
    with requests.Session() as res:
        headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
        response = res.get(url, timeout=5, headers = headers)               # timeout if it takes more than 5 seconds
    soup = BeautifulSoup(response.text, "html.parser")
    if response.status_code == requests.codes.ok:
        result = soup.find('title')
        return result
    else:
        if response.status_code == '503':
            return

def proc_result(result):
    """
    This function is mainly to process the response obtained
    :param result: the input is the result extracted from the response of the request
    :returns: None if there is no result and product name if there is a response
    """
    if result == None:
        return 
    else:
        string = ''
        removed = result.text.strip()
        name = removed.split(":")
        string += name[0]
        if name[1] == ' Amazon.co.uk':   # to make sure that it takes the entire product name
            return name[0]
        else:
            string += name[1]            # to avoid getting only part of the product name
            return string
        
prod = []
for i in range(len(url_list)):
    result = get_product(url_list[i])
    if result != 'Robot Check':
        prod_name = proc_result(result)
        prod.append(prod_name)
    else:
        assert result == 'Robot Check',"Robot Check error encountered"
       

In [16]:
# Get the item in frozenset into a list, for easy access
df_rules['ant_list'] = np.nan
df_rules['cons_list'] = np.nan
for k in range(len(df_rules['antecedents'])):
    x = df_rules['antecedents'][k]
    string = ""
    temp = []
    for t in x:
        if len(string) < 4:
            string += t
        else:
            string = ""
            string += t
        
        temp.append(string)
    df_rules['ant_list'][k] = temp
    
for j in range(len(df_rules['consequents'])):
    x = df_rules['consequents'][j]
    string = ""
    temp = []
    for t in x:
        if len(string) < 4:
            string += t
        else:
            string = ""
            string += t
        
        temp.append(string)
    df_rules['cons_list'][j] = temp   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [17]:
# Get the product name for each of the unique id and assign it into their respective row
df_rules['ant_prodName'] = np.nan
df_rules['con_prodName'] = np.nan
for t in range(len(df_rules['ant_list'])):
    if len(df_rules['ant_list'][t]) >= 2:       # if there are more than one product, create a list
        temp = []
        for j in df_rules['ant_list'][t]: 
            val = new_list.index(str(j))
            prod_name = prod[val]
            if prod_name != None:
                temp.append(prod_name)
            else:
                temp.append(np.nan)             # append nan if there is no product name
            df_rules['ant_prodName'][t] = temp
    else:
        for j in df_rules['ant_list'][t]: 
            val = new_list.index(str(j))
            prod_name = prod[val]
            if prod_name != None:
                df_rules['ant_prodName'][t] = prod[val]
            else:
                df_rules['ant_prodName'][t] = np.nan

for t in range(len(df_rules['cons_list'])):
    if len(df_rules['cons_list'][t]) >= 2:      # if there are more than one product, create a list
        temp = []
        for j in df_rules['cons_list'][t]: 
            val = new_list.index(str(j))
            prod_name = prod[val]
            if prod_name != None:
                temp.append(prod_name)
            else:
                temp.append(np.nan)             # append nan if there is no product name
            df_rules['con_prodName'][t] = temp
    else:
        for j in df_rules['cons_list'][t]: 
            val = new_list.index(str(j))
            prod_name = prod[val]
            if prod_name != None:
                df_rules['con_prodName'][t] = prod[val]
            else:
                df_rules['con_prodName'][t] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice fro

In [18]:
# Drop if the product name is nan, and then sort it in descending order
df_rules = df_rules.dropna(subset=['ant_prodName', 'con_prodName'])
df_rules = df_rules.sort_values(by = ['confidence'], ascending = False)
df_rules.reset_index(drop=True, inplace=True)     # nan product name means wrong website link or item no longer on amazon
df_rules

Unnamed: 0,antecedents,consequents,support,confidence,ant_list,cons_list,ant_prodName,con_prodName
0,(1302),(1300),0.001378,1.0,[1302],[1300],Dungeons & Dragons Starter Box (D&d Boxed Game),Dungeons & Dragons Player's Handbook (Dungeons...
1,(3782),(3783),0.001148,1.0,[3782],[3783],Schleich Dragon Knight with Sword,Schleich Griffin Knight with Sword
2,"(1302, 1303)","(1304, 1300)",0.001148,1.0,"[1302, 1303]","[1304, 1300]",[Dungeons & Dragons Starter Box (D&d Boxed Gam...,[Dungeon Master's Guide (Dungeons & Dragons Co...
3,"(1300, 1302, 1303)",(1304),0.001148,1.0,"[1300, 1302, 1303]",[1304],[Dungeons & Dragons Player's Handbook (Dungeon...,Dungeon Master's Guide (Dungeons & Dragons Cor...
4,"(1304, 1302, 1303)",(1300),0.001148,1.0,"[1304, 1302, 1303]",[1300],[Dungeon Master's Guide (Dungeons & Dragons Co...,Dungeons & Dragons Player's Handbook (Dungeons...
5,"(1304, 1301, 1303)",(1300),0.001492,1.0,"[1304, 1301, 1303]",[1300],[Dungeon Master's Guide (Dungeons & Dragons Co...,Dungeons & Dragons Player's Handbook (Dungeons...
6,"(1302, 1303)",(1304),0.001148,1.0,"[1302, 1303]",[1304],[Dungeons & Dragons Starter Box (D&d Boxed Gam...,Dungeon Master's Guide (Dungeons & Dragons Cor...
7,"(1304, 1302)",(1300),0.001263,1.0,"[1304, 1302]",[1300],[Dungeon Master's Guide (Dungeons & Dragons Co...,Dungeons & Dragons Player's Handbook (Dungeons...
8,"(1304, 1301)",(1300),0.002066,1.0,"[1304, 1301]",[1300],[Dungeon Master's Guide (Dungeons & Dragons Co...,Dungeons & Dragons Player's Handbook (Dungeons...
9,(6094),(6063),0.001148,1.0,[6094],[6063],Pack Of 24 Boys Junior Glitter Tattoo Stencils...,Girls 10ml Pot Of Glitter Tattoo Glue - Cosmet...


In [22]:
# Uncomment if prefer printing it out
"""
for i in range(len(df_rules)):
    print('antecedent: ', df_rules['ant_prodName'][i])
    print('consequent: ', df_rules['con_prodName'][i])
    print('confidence: ', df_rules['confidence'][i])
    print("\n")
"""

antecedent:  Dungeons & Dragons Starter Box (D&d Boxed Game)
consequent:  Dungeons & Dragons Player's Handbook (Dungeons & Dragons Core Rulebooks)
confidence:  1.0


antecedent:  ['Monster Manual A Dungeons & Dragons Core Rulebook', "Dungeons & Dragons Player's Handbook (Dungeons & Dragons Core Rulebooks)", 'Dungeons & Dragons Starter Box (D&d Boxed Game)']
consequent:  Dungeon Master's Guide (Dungeons & Dragons Core Rulebooks)
confidence:  1.0


antecedent:  ['Monster Manual A Dungeons & Dragons Core Rulebook', 'Dungeons & Dragons Starter Box (D&d Boxed Game)']
consequent:  Dungeon Master's Guide (Dungeons & Dragons Core Rulebooks)
confidence:  1.0


antecedent:  ["Dungeon Master's Guide (Dungeons & Dragons Core Rulebooks)", 'Dungeons & Dragons Starter Box (D&d Boxed Game)']
consequent:  Dungeons & Dragons Player's Handbook (Dungeons & Dragons Core Rulebooks)
confidence:  1.0


antecedent:  ['Monster Manual A Dungeons & Dragons Core Rulebook', "Dungeon Master's Guide (Dungeons & Drago