In [2]:
import pandas as pd
import numpy as np
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import TreebankWordTokenizer, casual_tokenize
from collections import Counter
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.fasttext import FastText

In [3]:
data = pd.read_excel("ShopifyCategoryNames.xlsx", sheet_name='Shopify Categories')
ali_cats = pd.read_excel("ShopifyCategoryNames.xlsx", sheet_name='Aliexpress Categories')

In [4]:
data.head()

Unnamed: 0,Categories
0,Cat_all Headphone Amplifiers & Dac
1,Printed Range 210 Tc
2,Lanyards &amp; Key Straps
3,Casual Button-ups
4,Zange


In [5]:
ali_cats.head()

Unnamed: 0,Categories
0,Women's Fashion
1,Men's Fashion
2,Phones & Telecommunications
3,Computer
4,Consumer Electronics


In [6]:
data['Categories'] = data['Categories'].astype(str)
all_categories = data['Categories'].tolist()
joined_categories = "\n".join(str(e) for e in all_categories)

In [7]:
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(joined_categories)

In [8]:
bag_of_words = Counter(tokens)

In [9]:
bag_of_words.most_common(10)

[('&', 1576),
 (',', 1515),
 ('>', 1273),
 ('-', 1023),
 ("'s", 500),
 ('Accessories', 459),
 ('Women', 327),
 ('And', 312),
 ('Men', 252),
 ('Mens', 245)]

In [10]:
# https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
word_vectors = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [11]:
def top_words_to_string(positive, negative=None, topn=1000):
    top_words = word_vectors.most_similar(positive=positive, negative=negative, topn=topn)
    top_words_to_list = [x[0] for x in top_words]
    string_mung = [x.replace("_", " ").lower() for x in top_words_to_list]
    categories_string = ", ".join(string_mung)
    return categories_string

In [28]:
cat_0 = top_words_to_string(positive=['women', 'fashion'], negative=['man', 'men'])
cat_1 = top_words_to_string(positive=['men', 'fashion'], negative=['woman', 'women'])
cat_2 = top_words_to_string(positive=['telephone', 'smartphone', 'mobile', 'gadget'], negative=['computer'])
cat_3 = top_words_to_string(positive=['computer', 'laptop', 'pc'])
cat_4 = top_words_to_string(positive=['consumer', 'electronics'])
cat_5 = top_words_to_string(positive=['jewerly', 'rings'])
cat_6 = top_words_to_string(positive=['goods', 'home'])
cat_7 = top_words_to_string(positive=['bags', 'backpack', 'handbag', 'wallet'], negative=['clothes'])
cat_8 = top_words_to_string(positive=['toys', 'games'])
cat_9 = top_words_to_string(positive=['outdoor', 'fun', 'sports'])
cat_10 = top_words_to_string(positive=['beauty', 'health'])
cat_11 = top_words_to_string(positive=['automobile', 'car'])
cat_12 = top_words_to_string(positive=['home', 'improvement', 'furniture'])

In [13]:
for_mapping = {ali_cats['Categories'][0]: cat_0, ali_cats['Categories'][1]: cat_1, 
               ali_cats['Categories'][2]: cat_2, ali_cats['Categories'][3]: cat_3, 
               ali_cats['Categories'][4]: cat_4, ali_cats['Categories'][5]: cat_5, 
               ali_cats['Categories'][6]: cat_6, ali_cats['Categories'][7]: cat_7,
               ali_cats['Categories'][8]: cat_8, ali_cats['Categories'][9]: cat_9,
               ali_cats['Categories'][10]: cat_10, ali_cats['Categories'][11]: cat_11,
               ali_cats['Categories'][12]: cat_12}

In [14]:
ali_cats['Words'] = ali_cats['Categories'].map(for_mapping)
ali_cats.head()

Unnamed: 0,Categories,Words
0,Women's Fashion,"fashions, couture, fashion, moda operandi, raf..."
1,Men's Fashion,"fasion, menswear, men's wear, streetwear, wool..."
2,Phones & Telecommunications,"handset, phone, smartphones, handsets, phones,..."
3,Computer,"computers, laptop computer, laptops, pc, pcs, ..."
4,Consumer Electronics,"consumer electronics, maker aeroflex holding, ..."


In [15]:
vectorizer = TfidfVectorizer(min_df=1)

In [16]:
model = vectorizer.fit_transform(ali_cats['Words'])
target = model.todense().round(2)

In [17]:
matching = vectorizer.transform(data['Categories']).todense().round(2)

In [18]:
shopify_categ = pd.DataFrame(matching, columns=vectorizer.get_feature_names())
shopify_categ.shape

(23556, 12118)

In [19]:
aliexpress_categ = pd.DataFrame(target, columns=vectorizer.get_feature_names())
aliexpress_categ.shape

(13, 12118)

In [20]:
cos_sim = cosine_similarity(shopify_categ.values, aliexpress_categ.values)

In [21]:
cos_sim_df = pd.DataFrame(cos_sim).round(2)
cos_sim_df.columns = ali_cats['Categories'].values
cos_sim_df.index = data['Categories'].values
cos_sim_df.head()

Unnamed: 0,Women's Fashion,Men's Fashion,Phones & Telecommunications,Computer,Consumer Electronics,Jewelry,Home,Bags,Toys,Outdoor Fun & Sports,"Beauty, Health",Automobiles & Motorcycles,Home Improvement
Cat_all Headphone Amplifiers & Dac,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Printed Range 210 Tc,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
Lanyards &amp; Key Straps,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
Casual Button-ups,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
Zange,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
for_other_category = pd.DataFrame()
for_other_category = np.where(cos_sim_df.sum(axis=1) == 0.0, 1, 0)
for_other_category = pd.DataFrame(for_other_category)
for_other_category.head()

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,1


In [29]:
match_arg_max = cos_sim_df.idxmax(axis=1)
final_match = match_arg_max.to_frame().reset_index()
final_match = final_match.merge(for_other_category, left_index=True, right_index=True)
final_match.columns = ['Shopify', 'Ali_temp', 'Indicator']
final_match.head()

Unnamed: 0,Shopify,Ali_temp,Indicator
0,Cat_all Headphone Amplifiers & Dac,Women's Fashion,1
1,Printed Range 210 Tc,Phones & Telecommunications,0
2,Lanyards &amp; Key Straps,Bags,0
3,Casual Button-ups,Women's Fashion,0
4,Zange,Women's Fashion,1


In [30]:
final_match['Aliexpress'] = np.where(final_match['Indicator'] == 1, "Other", final_match['Ali_temp'])
final_match = final_match[['Shopify', 'Aliexpress']]
final_match.head()

Unnamed: 0,Shopify,Aliexpress
0,Cat_all Headphone Amplifiers & Dac,Other
1,Printed Range 210 Tc,Phones & Telecommunications
2,Lanyards &amp; Key Straps,Bags
3,Casual Button-ups,Women's Fashion
4,Zange,Other


In [25]:
final_match.head()

Unnamed: 0,Shopify,Aliexpress
0,Cat_all Headphone Amplifiers & Dac,Other
1,Printed Range 210 Tc,Phones & Telecommunications
2,Lanyards &amp; Key Straps,Bags
3,Casual Button-ups,Women's Fashion
4,Zange,Other


In [32]:
final_match[final_match.Aliexpress == "Phones & Telecommunications"].sample(10)

Unnamed: 0,Shopify,Aliexpress
1937,Galaxy M20,Phones & Telecommunications
4879,Glitter Ii,Phones & Telecommunications
4316,Marine Perpetual Calendar Limited Edition,Phones & Telecommunications
8865,Sony Xperia 10 Plus,Phones & Telecommunications
19291,Lg K10,Phones & Telecommunications
4916,Galaxy S10 Lite Spartan,Phones & Telecommunications
13825,Tattoo Needles,Phones & Telecommunications
20759,Cricket Bat,Phones & Telecommunications
3351,Samsung S10+ Plus Atomic Slim 2,Phones & Telecommunications
10267,Mobile Device Chargers,Phones & Telecommunications


In [37]:
final_match[final_match.Aliexpress == "Home"].sample(10)

Unnamed: 0,Shopify,Aliexpress
15325,Exec 3 Note 9,Home
23189,Back Support,Home
11523,Bird-baths & Houses,Home
22096,Bolder Borders® – Sparkle Plus,Home
11973,Loose Setting Powder,Home
21320,Foot Support,Home
20353,Food Storage Containers,Home
12932,Travel Organizer,Home
5949,Small Animal-treats,Home
7687,Car Strip Light,Home


In [26]:
final_match.to_csv("final_match.csv")

In [27]:
cos_sim_df.to_csv("cos_sim.csv")