In [80]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import re
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import networkx as nx
from sklearn.neighbors import NearestNeighbors

In [50]:
df = pd.read_csv('shopee_ph.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,URL,Page,Preferred,Mall,Product Name,Main Category,Sub Category 1,Sub Category 2,Current Rating,...,Lowest Price Guarantee,Whole Sale,Five Star,Four Star,Three Star,Two Star,One Star,With Comments,With Media,Description
0,0,https://shopee.ph/Kids-Everyday-Leggings-Sizes...,0.0,True,False,Kids Everyday Leggings Sizes for Infant-10 Yea...,Babies & Kids,Girls' Fashion,Bottom,4.9,...,True,True,4400,103,42,8,9,1300,853,
1,1,https://shopee.ph/CUTE-PAJAMA-TERNO-SLEEPWEAR-...,0.0,False,False,CUTE PAJAMA TERNO SLEEPWEAR 1-2 yrs old wholes...,Babies & Kids,Babies' Fashion,Set,4.9,...,True,True,7200,188,73,23,20,3100,2000,
2,2,https://shopee.ph/TERNO-PAJAMA-FOR-KIDS-1-T0-1...,0.0,False,False,TERNO PAJAMA FOR KIDS 1 T0 10Y.O,Babies & Kids,Girls' Fashion,Pajama,4.8,...,True,True,5100,349,193,59,105,1800,913,


In [51]:
list(df)

['Unnamed: 0',
 'URL',
 'Page',
 'Preferred',
 'Mall',
 'Product Name',
 'Main Category',
 'Sub Category 1',
 'Sub Category 2',
 'Current Rating',
 'Total Rating',
 'Total Sold',
 'Favorite',
 'Discount Range',
 'Price Range',
 'Discount Percentage',
 'Free Shipping',
 'Free Shipping Info',
 'Shipping Location',
 'Shipping Price Range',
 'Brand Name',
 'Store Name',
 'Store Ratings',
 'Store Products Count',
 'Store Response Rate',
 'Store Response Time',
 'Store Joined',
 'Store Followers',
 'Shipping From',
 'Vouchers Available',
 'Bundle Details',
 'Coins Available',
 'Product Variation List',
 'Lowest Price Guarantee',
 'Whole Sale',
 'Five Star',
 'Four Star',
 'Three Star',
 'Two Star',
 'One Star',
 'With Comments',
 'With Media',
 'Description']

In [52]:
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

In [53]:
def get_tfidf(product_details):
    clean_product = []
    product_name = list(product_details)
    for i in range(len(product_name)):
        words = ""

        doc = nlp(product_name[i].lower())
        for token in doc:
            token.lemma_ = re.sub(r'\W',' ',token.lemma_)
            token.lemma_ = token.lemma_.strip()
            if not token.lemma_.endswith("ml") and not token.lemma_.endswith("ms") and not token.lemma_.isdigit() and not token.lemma_ in stop_words:
                if len(token.lemma_) > 2 or token.lemma_ == 'uv': 
                    words += token.lemma_.lower() + " "
                    

        if len(words) > 0:
            clean_product.append(str(words.strip()))

    tfidf_vectorizer=TfidfVectorizer(use_idf=True) 
    tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(clean_product)
    first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]

    df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), 
                      columns=["tfidf"]) 
    df = df.sort_values(by=["tfidf"], ascending=False).reset_index()
    
    return df

In [59]:
G = nx.Graph()
# entries = os.listdir('dataset/updated/') ## Please Change Directory
main_category_list = []
sub_category_list = []
sub_category_list_2 = []
product_name_list = []

df['Main Category'] = df['Main Category'].str.lower()
df['Sub Category 1'] = df['Sub Category 1'].str.lower()
df['Sub Category 2'] = df['Sub Category 2'].str.lower()

main_category = df['Main Category'].unique()
for _main_category in main_category:
    if type(_main_category) == str:
        print(_main_category)
        main_category_data = df[df['Main Category'] == _main_category]
        G.add_node(_main_category)
        main_category_list.append(_main_category)
        
        for row in main_category_data['Sub Category 1'].unique():
            if type(row) == str:
                sub_category_list.append(row)
                G.add_edge(row, _main_category, weight=1.0)
                
                for row2 in main_category_data.loc[main_category_data['Sub Category 1'] == row]['Sub Category 2'].unique():
                    if type(row2) == str:
                        G.add_edge(row2.strip(), row.strip(), weight=1.0)
                        sub_category_list_2.append(row2.strip())
                        
                        tfidf_result = get_tfidf(main_category_data.loc[main_category_data['Sub Category 2'] == row2]['Product Name'])
                        
                        index =  tfidf_result['index']
                        tfidf = tfidf_result['tfidf']
                        counter = 0
                        for _tfidf_result in tfidf_result['index']:
                            if float(tfidf[counter]) > 0.0:                    
                                if index[counter] not in main_category_list:
                                    product_name_list.append(index[counter].lower())
                                    G.add_edge(index[counter].lower(), row2.strip(), weight=tfidf[counter])
    
    
                            counter +=1

babies & kids




cameras




gaming




home & living




mobiles & gadgets




health & personal care




sports & travel




hobbies & stationery




home appliances




home entertainment




laptops & computers




makeup & fragrances




men's apparel




men's bags & accessories




men's shoes




women's shoes




mobile accessories
motors




pet care




women's apparel




women's accessories




women's bags




In [60]:
import pickle

with open('network_theory.pickle','wb') as fe_data_file:
     pickle.dump(G, fe_data_file)

In [61]:
BETWEENNESS_CENTRALITY = nx.betweenness_centrality(G)

with open('betweeness_centrality.pickle','wb') as fe_data_file:
     pickle.dump(G, fe_data_file)

In [62]:
def recommend_products(given_wishlist, G, BETWEENNESS_CENTRALITY, overall_data):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(given_wishlist.strip())
    result_categories = []
    
    for token in reversed(doc):
        if token.text in list(G.nodes()):
            closeness_centrality_list = []
            betweness_centrality_list = []
            neighbor_list = []
            shortest_path_list = []
            length_list = []
            
            for neighbor in list(G.neighbors(token.text)):
                if neighbor in sub_category_list_2:
                    neighbor_list.append(neighbor)
                    betweness_centrality_list.append(BETWEENNESS_CENTRALITY[neighbor])
                    shortest_path = nx.shortest_path(G, source=neighbor, target=token.lemma_)
                    shortest_path_list.append(len(shortest_path))
                    length_list.append(overall_data[overall_data['Sub Category 2'] == neighbor].shape[0])
                    
            network_result = pd.DataFrame(neighbor_list, columns=['neighbor'])
            network_result['betweenness_centrality'] = betweness_centrality_list
            network_result['shortest_path'] = shortest_path_list
            
            if betweness_centrality_list:
                if network_result[network_result['shortest_path'] == min(shortest_path_list)]['neighbor'].shape[0] < 2:
                    if list(network_result[network_result['shortest_path'] == min(shortest_path_list)]['neighbor'])[0] not in result_categories:
                        result_categories.append(list(network_result[network_result['shortest_path'] == min(shortest_path_list)]['neighbor'])[0])
                else:
                    if list(network_result[network_result['betweenness_centrality'] == min(betweness_centrality_list)]['neighbor'])[0] not in result_categories:
                        result_categories.append(list(network_result[network_result['betweenness_centrality'] == min(betweness_centrality_list)]['neighbor'])[0])
    
    merge_products = []
    for result_category in result_categories:
        merge_products.append(overall_data[overall_data['Sub Category 2'] == result_category.title()])
    
    selected_category = pd.concat(merge_products).reset_index()
    
    return selected_category

In [96]:
#Example usage
given_wishlist = 'jogger pants'
overall_data = pd.read_csv('shopee_ph.csv')  # Load your overall data
selected_category = recommend_products(given_wishlist, G, BETWEENNESS_CENTRALITY, overall_data)
selected_category

Unnamed: 0.1,index,Unnamed: 0,URL,Page,Preferred,Mall,Product Name,Main Category,Sub Category 1,Sub Category 2,...,Lowest Price Guarantee,Whole Sale,Five Star,Four Star,Three Star,Two Star,One Star,With Comments,With Media,Description
0,9953,10001,https://shopee.ph/Cotton-Plain-Jogger-Pants-Un...,0.0,False,False,Cotton Plain Jogger Pants Unisex,Men's Apparel,Pants,Jogger Pants,...,True,True,5900,341,228,47,78,1500,815,New ArrivalCotton Good QualityUnisexColors: Li...
1,9956,10004,https://shopee.ph/Unisex-Plain-Cotton-Jogger-P...,0.0,False,False,Unisex Plain Cotton Jogger Pants(Makapal Tela)...,Men's Apparel,Pants,Jogger Pants,...,False,False,3700,194,85,17,36,1300,683,Makapal telaWith anti theft zipperColor: Black...
2,9960,10008,https://shopee.ph/COTTON-JOGGER-PANTS-FOR-MEN'...,0.0,False,False,COTTON JOGGER PANTS FOR MEN'S(WITH ZIPPERS),Men's Apparel,Pants,Jogger Pants,...,False,False,7300,553,321,121,188,2700,1300,SIZE:M:28 29L:30 31XL:32 33XXL:34 36COLOR:BLAC...
3,9969,10017,https://shopee.ph/Unisex-Plain-Cotton-Jogger-P...,0.0,False,False,Unisex Plain Cotton Jogger Pants Makpal tela S...,Men's Apparel,Pants,Jogger Pants,...,False,False,2900,236,205,97,217,1200,496,#Joggerpants #sports #shopeeph#highquality #co...
4,9978,10026,https://shopee.ph/Unisex-Palie-Jogger-Pants-Ma...,0.0,False,False,Unisex Palie Jogger Pants Makapal Quality JF10,Men's Apparel,Pants,Jogger Pants,...,True,True,4100,313,270,133,324,1700,792,WELCOME TO [Hello Jeans]SELLER SAIDAccura...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,10831,10881,https://shopee.ph/Unisex-Plain-Cotton-Jogger-P...,17.0,False,False,Unisex Plain Cotton Jogger Pants Track Jagger ...,Men's Apparel,Pants,Jogger Pants,...,True,True,891,77,41,16,29,262,111,Unisex Plain Cotton Jogger Pants Track Jagger ...
70,10850,10900,https://shopee.ph/New-design-4-pockets-jogger-...,18.0,False,False,New design 4 pockets jogger pants denim unisex...,Men's Apparel,Pants,Jogger Pants,...,False,False,232,13,14,5,19,93,38,Jogger pantssix pocketsskinny plainskinny Fash...
71,10857,10907,https://shopee.ph/Unisex-jogging-pants-Adidas*...,18.0,False,False,Unisex jogging pants Adidas*,Men's Apparel,Pants,Jogger Pants,...,True,True,611,52,40,19,31,209,86,CottonM size： 25-29L size： 30-32XL size: 33-...
72,10872,10922,https://shopee.ph/hot-sale-cotton-jogger-pants...,18.0,False,False,hot sale cotton jogger pants(WITH ZIPPER),Men's Apparel,Pants,Jogger Pants,...,False,False,691,36,23,9,8,173,95,style number:18031size: M--27.28 L...


In [97]:
def calculate_relevance(sample_wishlist, selected_category):
    vectorize = TfidfVectorizer(stop_words='english')
    tfidf_response = vectorize.fit_transform(selected_category['Product Name'])
    dtm = pd.DataFrame(tfidf_response.todense(), columns=vectorize.get_feature_names())
    
    nn = NearestNeighbors(n_neighbors=selected_category.shape[0])
    nn.fit(dtm)
    
    new = vectorize.transform(sample_wishlist)
    knn_model_result = nn.kneighbors(new.todense())
    
    knn_result = pd.DataFrame(knn_model_result[0][0], columns=['Distance'])
    knn_result["Product Name"] = selected_category['Product Name'].iloc[knn_model_result[1][0]]
    
    merged_result = pd.merge(selected_category, knn_result, on='Product Name', how='inner')
    merged_result = merged_result.drop_duplicates(subset='Product Name', keep="first")
    
    return merged_result

In [98]:
# Example usage
sample_wishlist = ['jogger pants']
final_recommendations = calculate_relevance(sample_wishlist, selected_category)
final_recommendations



Unnamed: 0.1,index,Unnamed: 0,URL,Page,Preferred,Mall,Product Name,Main Category,Sub Category 1,Sub Category 2,...,Whole Sale,Five Star,Four Star,Three Star,Two Star,One Star,With Comments,With Media,Description,Distance
0,9953,10001,https://shopee.ph/Cotton-Plain-Jogger-Pants-Un...,0.0,False,False,Cotton Plain Jogger Pants Unisex,Men's Apparel,Pants,Jogger Pants,...,True,5900,341,228,47,78,1500,815,New ArrivalCotton Good QualityUnisexColors: Li...,0.951206
1,9956,10004,https://shopee.ph/Unisex-Plain-Cotton-Jogger-P...,0.0,False,False,Unisex Plain Cotton Jogger Pants(Makapal Tela)...,Men's Apparel,Pants,Jogger Pants,...,False,3700,194,85,17,36,1300,683,Makapal telaWith anti theft zipperColor: Black...,1.019461
2,9960,10008,https://shopee.ph/COTTON-JOGGER-PANTS-FOR-MEN'...,0.0,False,False,COTTON JOGGER PANTS FOR MEN'S(WITH ZIPPERS),Men's Apparel,Pants,Jogger Pants,...,False,7300,553,321,121,188,2700,1300,SIZE:M:28 29L:30 31XL:32 33XXL:34 36COLOR:BLAC...,1.021324
3,9969,10017,https://shopee.ph/Unisex-Plain-Cotton-Jogger-P...,0.0,False,False,Unisex Plain Cotton Jogger Pants Makpal tela S...,Men's Apparel,Pants,Jogger Pants,...,False,2900,236,205,97,217,1200,496,#Joggerpants #sports #shopeeph#highquality #co...,1.024725
4,9978,10026,https://shopee.ph/Unisex-Palie-Jogger-Pants-Ma...,0.0,False,False,Unisex Palie Jogger Pants Makapal Quality JF10,Men's Apparel,Pants,Jogger Pants,...,True,4100,313,270,133,324,1700,792,WELCOME TO [Hello Jeans]SELLER SAIDAccura...,1.056625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,10831,10881,https://shopee.ph/Unisex-Plain-Cotton-Jogger-P...,17.0,False,False,Unisex Plain Cotton Jogger Pants Track Jagger ...,Men's Apparel,Pants,Jogger Pants,...,True,891,77,41,16,29,262,111,Unisex Plain Cotton Jogger Pants Track Jagger ...,1.338954
72,10850,10900,https://shopee.ph/New-design-4-pockets-jogger-...,18.0,False,False,New design 4 pockets jogger pants denim unisex...,Men's Apparel,Pants,Jogger Pants,...,False,232,13,14,5,19,93,38,Jogger pantssix pocketsskinny plainskinny Fash...,1.339380
73,10857,10907,https://shopee.ph/Unisex-jogging-pants-Adidas*...,18.0,False,False,Unisex jogging pants Adidas*,Men's Apparel,Pants,Jogger Pants,...,True,611,52,40,19,31,209,86,CottonM size： 25-29L size： 30-32XL size: 33-...,1.345050
74,10872,10922,https://shopee.ph/hot-sale-cotton-jogger-pants...,18.0,False,False,hot sale cotton jogger pants(WITH ZIPPER),Men's Apparel,Pants,Jogger Pants,...,False,691,36,23,9,8,173,95,style number:18031size: M--27.28 L...,1.358009
