In [1]:
import os
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import time
import operator
import itertools
import spacy
from spacy.tokenizer import Tokenizer

from sentence_transformers import SentenceTransformer
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device = device)

In [2]:
def word_embedding_func(df):
    '''
    word embedding function
    '''
    embedded_keywords = model.encode(df['First_H1_lemmatized'].tolist(), show_progress_bar = False)
    embedding_dict = dict(zip(df['First_H1_lemmatized'].tolist(), embedded_keywords))
    
    return embedding_dict

In [3]:
def clustering_sklearn_func(n_clusters, 
                            embedded_vectors_array_dict, 
                            affinity = 'cosine', 
                            linkage = 'average', 
                            distance_threshold = 0.01):
    '''
    fit agglomerative clustering - sklearn
    '''
    cluster = AgglomerativeClustering(n_clusters = n_clusters, 
                                      affinity = affinity, 
                                      linkage = linkage, 
                                      distance_threshold = distance_threshold)
    labels = cluster.fit_predict(np.array(list(embedded_vectors_array_dict.values())))
    return labels

In [4]:
def similar_ranking_func(pn, df):
    test_df = df[df['cluster'] == pn]

    # get the embedding for H1s
    embedded_dict = {}
    for H1 in test_df['First H1'].tolist():
        embedded_dict[H1] = model.encode(H1, show_progress_bar=False)

    # calculate the similarity matrix
    similarity_matrix = pd.DataFrame(index = list(embedded_dict.keys()), 
                                     columns = list(embedded_dict.keys()))
    for a, b in itertools.combinations(embedded_dict.keys(), 2):
        # a is column name, b is index name
        similarity_matrix[a][b] = cosine(embedded_dict[a], embedded_dict[b])

    # loop all the H1s and append the ranking
    test_df['similar ranking'] = ''
    for i, row in test_df.iterrows():
        similarity_list = similarity_matrix[row['First H1']].sort_values(ascending=False)
        test_df.loc[i, 'similar ranking'] = ', '.join(similarity_list.index.tolist())
    return test_df

In [5]:
def cosine(u, v):
    if len(v) > 0:
        return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
    else:
        return np.nan

### Data Processing

In [6]:
# only use the non-dp page for clustering
data = pd.read_csv('unique_h1s.csv')
data = data.drop_duplicates(subset=['First_H1_lemmatized'])
data = data[data['dp_page'] == False]

In [7]:
# remove the holiday related Bread Crumb 1
breadcrumb_1 = ['Small Pet', 'New Pet', 'Bird',
               'Dog', 'Cat', 'Pharmacy', 'Personalized', 'Pet Parents',
               'Birthday Shop', 'Reptile', 'Horse', 'Holiday Shop',
               'Fish', 'Farm Animal', 'Flea & Tick',
               'Game Day Essentials',
               'Cold Weather Essentials', 'The Playtime Shop',
               'Pet Healthcare', 'Disney', "While You're Out Essentials",
               'Science Shop', 'Fall Shop', 'Woman Owned Brands', 'Goody Box',
               'Pet Food', 'Pet Treats', 'Pet Toys', 'Pet Vitamins & Supplements',
               'Chewy Home Collection', 'Pick of the Month',
               'Select New Arrivals', 'Warm Weather Styles', 'Water Fountains',
               'Funny Graphics', 'Cooling Beds', 'Formal Wear',
               'Connect With a Vet', 'Gift Guide', 'Donate', 'Summer Shop',
               'Senior Pet Shop']
# holiday_related = ['Cyber Week', 'Black Friday', 'Halloween Shop', "Valentine's Day Shop", "Easter Shop", np.nan]
data = data[data['Breadcrumb Depth 1'].isin(breadcrumb_1)]

In [8]:
# get all the h1 and keyword mapping data
h1_kw_1 = pd.read_excel("Category and Facet PLP Total SV and Keyword Mapping.xlsx", sheet_name='category keywords')
h1_kw_2 = pd.read_excel("Category and Facet PLP Total SV and Keyword Mapping.xlsx", sheet_name='Sheet2')
h1_kw = pd.concat([h1_kw_1, h1_kw_2]).sort_values('Total SV').groupby('URL').head(1)

In [10]:
# merge (6263 out of 10897)
merged_df = data.merge(h1_kw, left_on='Full URL', right_on='URL', how='left')
merged_df['URL'].isna().sum(), merged_df.shape[0]

(4634, 10897)

In [12]:
# map the h1s with the highest search volumn keywords
h1_kw_dict = {}
kw_sv_map = {
    'Search Volume KW 1' : 'Keyword 1',
    'Search Volume KW 2' : 'KW2', 
    'Search Volume KW 3' : 'KW3',
    'Search Volume KW 4' : 'KW4',
    'Search Volume KW 5' : 'KW5',
    'Search Volume KW 6' : 'KW6'
}
for i, row in merged_df.iterrows():
    # URL - Highest Search Volume KW
    x = {row[v]:row[k] for k,v in list(kw_sv_map.items())}
    top_keywords = sorted(x.items(), key=lambda kv: kv[1], reverse=True)[0]
    h1_kw_dict[row['First H1']] = top_keywords

### Bread Crumb Level 1 Clustering (Apply clustering for within each cluster)

In [13]:
%%time
start_label = 0
cluster_res = pd.DataFrame()
for bc1 in breadcrumb_1:
    category_data = data[data['Breadcrumb Depth 1'] == bc1]
    if category_data.shape[0] > 1:
        category_data['cluster'] = ''
        embedding_dict = word_embedding_func(category_data)
        # print(len(embedding_dict), category_data.shape[0])
        cluster_labels = clustering_sklearn_func(n_clusters = None, 
                                             embedded_vectors_array_dict = embedding_dict, 
                                             distance_threshold = 0.4)
        # print(len(cluster_labels))
        category_data['cluster'] = [labels + start_label for labels in cluster_labels]
    else:
        category_data['cluster'] = start_label + 1
    start_label += category_data['cluster'].nunique()
    cluster_res = pd.concat([cluster_res, category_data])

Wall time: 25.7 s


In [17]:
%%time
# calculate the similarity matrix and provide a ranked list for each cluster
cluster_res_with_similar_ranking = pd.DataFrame()
for cluster_number in cluster_res['cluster'].unique().tolist():
    cluster_w_rank = similar_ranking_func(cluster_number, cluster_res)
    cluster_res_with_similar_ranking = pd.concat([cluster_res_with_similar_ranking, cluster_w_rank])

Wall time: 4min 37s


In [19]:
target_col = ['First H1', 'Full URL', 'cluster', 'Breadcrumb Depth 1', 'Breadcrumb Depth 2', 
              'Breadcrumb Depth 3', 'similar ranking']
res_df = cluster_res_with_similar_ranking[target_col]

In [22]:
# format - unpivot on the similar ranking column
formatted_df = res_df.assign(related_H1=lambda df: df['similar ranking'].str.split(', ')).explode("related_H1")

In [23]:
# mapping the keywords on related_H1
formatted_df['related_KW'] = formatted_df['related_H1'].apply(lambda x: h1_kw_dict[x][0] if x in h1_kw_dict else np.nan)

In [25]:
formatted_df.head()

Unnamed: 0,First H1,Full URL,cluster,Breadcrumb Depth 1,Breadcrumb Depth 2,Breadcrumb Depth 3,similar ranking,related_H1,related_KW
32,Chinchilla Chew Toys,https://www.chewy.com/b/chew-toys-11047,14,Small Pet,Shop by Small Pet,Chinchilla,"Chinchilla Toys, Chinchilla Treats, Chinchilla...",Chinchilla Toys,chinchilla toys
32,Chinchilla Chew Toys,https://www.chewy.com/b/chew-toys-11047,14,Small Pet,Shop by Small Pet,Chinchilla,"Chinchilla Toys, Chinchilla Treats, Chinchilla...",Chinchilla Treats,
32,Chinchilla Chew Toys,https://www.chewy.com/b/chew-toys-11047,14,Small Pet,Shop by Small Pet,Chinchilla,"Chinchilla Toys, Chinchilla Treats, Chinchilla...",Chinchilla Feeding Accessories,
32,Chinchilla Chew Toys,https://www.chewy.com/b/chew-toys-11047,14,Small Pet,Shop by Small Pet,Chinchilla,"Chinchilla Toys, Chinchilla Treats, Chinchilla...",Chinchilla Supplies & Accessories,chinchilla accessories
32,Chinchilla Chew Toys,https://www.chewy.com/b/chew-toys-11047,14,Small Pet,Shop by Small Pet,Chinchilla,"Chinchilla Toys, Chinchilla Treats, Chinchilla...",Chinchilla Grooming Supplies,chinchilla grooming


In [24]:
# SAVE FILE 
# formatted_df[['First H1', 'Full URL', 'cluster', 'Breadcrumb Depth 1',
#                'Breadcrumb Depth 2',
#                'related_H1','related_KW']].to_csv('Formatted Related Search Result - R1.csv', index=False)

In [27]:
# Reset Index (if needed)
formatted_df.set_index(['First H1', 'Full URL', 'cluster', 'Breadcrumb Depth 1', 'Breadcrumb Depth 2'])#.to_excel('test.xlsx')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Breadcrumb Depth 3,similar ranking,related_H1,related_KW
First H1,Full URL,cluster,Breadcrumb Depth 1,Breadcrumb Depth 2,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Chinchilla Chew Toys,https://www.chewy.com/b/chew-toys-11047,14,Small Pet,Shop by Small Pet,Chinchilla,"Chinchilla Toys, Chinchilla Treats, Chinchilla...",Chinchilla Toys,chinchilla toys
Chinchilla Chew Toys,https://www.chewy.com/b/chew-toys-11047,14,Small Pet,Shop by Small Pet,Chinchilla,"Chinchilla Toys, Chinchilla Treats, Chinchilla...",Chinchilla Treats,
Chinchilla Chew Toys,https://www.chewy.com/b/chew-toys-11047,14,Small Pet,Shop by Small Pet,Chinchilla,"Chinchilla Toys, Chinchilla Treats, Chinchilla...",Chinchilla Feeding Accessories,
Chinchilla Chew Toys,https://www.chewy.com/b/chew-toys-11047,14,Small Pet,Shop by Small Pet,Chinchilla,"Chinchilla Toys, Chinchilla Treats, Chinchilla...",Chinchilla Supplies & Accessories,chinchilla accessories
Chinchilla Chew Toys,https://www.chewy.com/b/chew-toys-11047,14,Small Pet,Shop by Small Pet,Chinchilla,"Chinchilla Toys, Chinchilla Treats, Chinchilla...",Chinchilla Grooming Supplies,chinchilla grooming
...,...,...,...,...,...,...,...,...
Connect With a Vet,https://www.chewy.com/b/connect-vet-16616?utm_source=chewy&utm_medium=ELP&utm_campaign=dentalcare&utm_audience=dentalcarebanner,928,Connect With a Vet,,,Connect With a Vet,Connect With a Vet,
Chewy Picks,https://www.chewy.com/b/gift-guide-2766?,929,Gift Guide,,,Chewy Picks,Chewy Picks,
Chewy Gives Back,https://www.chewy.com/b/donate-101502,930,Donate,,,Chewy Gives Back,Chewy Gives Back,
Summer Shop,https://www.chewy.com/b/summer-shop-2338,931,Summer Shop,,,Summer Shop,Summer Shop,
