In [1]:
import pandas as pd
import numpy as np
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import TreebankWordTokenizer, casual_tokenize
from collections import Counter
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.fasttext import FastText

In [149]:
data = pd.read_csv("shopifyProducts.csv")
data.drop_duplicates()
ali_cats = pd.read_csv("aliCategories.csv")

In [151]:
data["title_orig"] = data["title"]
data["title"] = data["title"] + ' ' + data["product_type"]
data.head()

Unnamed: 0,product_type,title,title_orig
0,Sweater,Homebody Sweater Sweater Sweater,Homebody Sweater Sweater
1,,,
2,Top,Subliminal Top Top Top,Subliminal Top Top
3,,,
4,Dress,Peek-A-Vu Dress Dress Dress,Peek-A-Vu Dress Dress


In [152]:
ali_cats.head()

Unnamed: 0,categoryId,text
0,100003109,Women's Fashion
1,100003070,Men's Fashion
2,509,Phones & Telecommunications
3,7,Computer
4,44,Consumer Electronics


In [153]:
# https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
word_vectors = KeyedVectors.load_word2vec_format("F:/SU/ScientificPython/GoogleNews-vectors-negative300.bin.gz", binary=True)

In [154]:
def top_words_to_string(positive, negative=None, topn=10000):
    top_words = word_vectors.most_similar(positive=positive, negative=negative, topn=topn)
    top_words_to_list = [x[0] for x in top_words]
    string_mung = [x.replace("_", " ").lower() for x in top_words_to_list]
    categories_string = ", ".join(string_mung)
    return categories_string

In [155]:
cat_0 = top_words_to_string(positive=['women', 'fashion'], negative=['man', 'men'])
cat_1 = top_words_to_string(positive=['men', 'fashion'], negative=['woman', 'women'])
cat_2 = top_words_to_string(positive=['telephone', 'phone', 'mobile', 'gadget'], negative=['computer'])
cat_3 = top_words_to_string(positive=['computer', 'laptop', 'pc'])
cat_4 = top_words_to_string(positive=['consumer', 'electronics'])
cat_5 = top_words_to_string(positive=['jewerly', 'rings'])
cat_6 = top_words_to_string(positive=['goods', 'home'])
cat_7 = top_words_to_string(positive=['bags', 'backpack', 'handbag', 'wallet'], negative=['clothes'])
cat_8 = top_words_to_string(positive=['toys', 'games'])
cat_9 = top_words_to_string(positive=['outdoor', 'fun', 'sports'])
cat_10 = top_words_to_string(positive=['beauty', 'health'])
cat_11 = top_words_to_string(positive=['automobile', 'car'])
cat_12 = top_words_to_string(positive=['home', 'improvement', 'furniture'])

In [156]:
for_mapping = {ali_cats['text'][0]: cat_0, ali_cats['text'][1]: cat_1, 
               ali_cats['text'][2]: cat_2, ali_cats['text'][3]: cat_3, 
               ali_cats['text'][4]: cat_4, ali_cats['text'][5]: cat_5, 
               ali_cats['text'][6]: cat_6, ali_cats['text'][7]: cat_7,
               ali_cats['text'][8]: cat_8, ali_cats['text'][9]: cat_9,
               ali_cats['text'][10]: cat_10, ali_cats['text'][11]: cat_11,
               ali_cats['text'][12]: cat_12}

In [157]:
ali_cats['Words'] = ali_cats['text'].map(for_mapping)
ali_cats.head()

Unnamed: 0,categoryId,text,Words
0,100003109,Women's Fashion,"fashions, couture, fashion, moda operandi, raf..."
1,100003070,Men's Fashion,"fasion, menswear, men's wear, streetwear, wool..."
2,509,Phones & Telecommunications,"cellphone, cell phone, phones, phone, handset,..."
3,7,Computer,"computers, laptop computer, laptops, pc, pcs, ..."
4,44,Consumer Electronics,"consumer electronics, maker aeroflex holding, ..."


In [158]:
vectorizer = TfidfVectorizer(min_df=1)

In [159]:
model = vectorizer.fit_transform(ali_cats['Words'])
target = model.todense().round(2)

In [160]:
model.shape

(13, 74430)

In [161]:
s = len(sample_shopify_products)
type(s)

int

In [167]:
sample_shopify_products = data.sample(1000)
sample_shopify_products = sample_shopify_products.drop_duplicates()
sample_shopify_products = sample_shopify_products.dropna()
sample_shopify_products['productId'] = np.arange(len(sample_shopify_products))
sample_shopify_products.head()
sample_shopify_products[0:5]

Unnamed: 0,product_type,title,title_orig,productId
487458,bong,Slyme Accented Cone Perc Bong bong bong,Slyme Accented Cone Perc Bong bong,0
394781,Accessories,Little Birds High Socks Accessories Accessories,Little Birds High Socks Accessories,1
147002,NOSE RING L-shape,20 Gauge 5/16 Spring Flower L Shaped Nose Ring...,20 Gauge 5/16 Spring Flower L Shaped Nose Ring...,2
424806,Flowers,Hare's Ear Bupleurum Rotundifolium Griffithii ...,Hare's Ear Bupleurum Rotundifolium Griffithii ...,3
261984,Headphones,Blue Lola High-Fidelity Headphones Headphones ...,Blue Lola High-Fidelity Headphones Headphones,4


In [168]:
matching = vectorizer.transform(sample_shopify_products['title']).todense().round(2)

In [170]:
shopify_products = pd.DataFrame(matching, columns=vectorizer.get_feature_names())
shopify_products.shape

(881, 74430)

In [171]:
aliexpress_categ = pd.DataFrame(target, columns=vectorizer.get_feature_names())
aliexpress_categ.shape

(13, 74430)

In [173]:
cos_sim = cosine_similarity(shopify_products.values, aliexpress_categ.values)

In [174]:
cos_sim_df = pd.DataFrame(cos_sim).round(2)
cos_sim_df.columns = ali_cats['text'].values
cos_sim_df.head()

Unnamed: 0,Women's Fashion,Men's Fashion,Phones & Telecommunications,Computer,Consumer Electronics,Jewelry,Home,Bags,Toys,Outdoor Fun & Sports,"Beauty, Health",Automobiles & Motorcycles,Home Improvement
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0
1,0.02,0.01,0.0,0.0,0.01,0.02,0.01,0.0,0.02,0.01,0.0,0.0,0.01
2,0.02,0.0,0.0,0.0,0.0,0.03,0.0,0.02,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.0,0.0
4,0.0,0.01,0.01,0.02,0.01,0.01,0.0,0.03,0.0,0.0,0.0,0.01,0.0


In [175]:
for_other_category = pd.DataFrame()
for_other_category = np.where(cos_sim_df.sum(axis=1) == 0.0, 1, 0)
for_other_category = pd.DataFrame(for_other_category)
for_other_category.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [176]:
match_arg_max = cos_sim_df.idxmax(axis=1)
pre_final_match = match_arg_max.to_frame().reset_index()
pre_final_match = pre_final_match.merge(for_other_category, left_index=True, right_index=True)
pre_final_match.columns = ['productId', 'Ali_temp', 'Indicator']
pre_final_match.head()

Unnamed: 0,productId,Ali_temp,Indicator
0,0,Bags,0
1,1,Women's Fashion,0
2,2,Jewelry,0
3,3,Jewelry,0
4,4,Bags,0


In [177]:
final_match = pre_final_match.merge(sample_shopify_products, how='inner', on='productId')
final_match.head()

Unnamed: 0,productId,Ali_temp,Indicator,product_type,title,title_orig
0,0,Bags,0,bong,Slyme Accented Cone Perc Bong bong bong,Slyme Accented Cone Perc Bong bong
1,1,Women's Fashion,0,Accessories,Little Birds High Socks Accessories Accessories,Little Birds High Socks Accessories
2,2,Jewelry,0,NOSE RING L-shape,20 Gauge 5/16 Spring Flower L Shaped Nose Ring...,20 Gauge 5/16 Spring Flower L Shaped Nose Ring...
3,3,Jewelry,0,Flowers,Hare's Ear Bupleurum Rotundifolium Griffithii ...,Hare's Ear Bupleurum Rotundifolium Griffithii ...
4,4,Bags,0,Headphones,Blue Lola High-Fidelity Headphones Headphones ...,Blue Lola High-Fidelity Headphones Headphones


In [178]:
final_match['Aliexpress'] = np.where(final_match['Indicator'] == 1, "Other", final_match['Ali_temp'])
final_match = final_match[['title', 'Aliexpress']]
final_match.head()

Unnamed: 0,title,Aliexpress
0,Slyme Accented Cone Perc Bong bong bong,Bags
1,Little Birds High Socks Accessories Accessories,Women's Fashion
2,20 Gauge 5/16 Spring Flower L Shaped Nose Ring...,Jewelry
3,Hare's Ear Bupleurum Rotundifolium Griffithii ...,Jewelry
4,Blue Lola High-Fidelity Headphones Headphones ...,Bags


In [179]:
final_match[final_match.Aliexpress == "Phones & Telecommunications"].sample(10)

Unnamed: 0,title,Aliexpress
200,MGZ Black Phone Case Phone & Tablet Cases Phon...,Phones & Telecommunications
404,iPhone 11 Case Thin Fit 360 Case Case,Phones & Telecommunications
463,Pink Gameboy iPhone Case - 10 games included! ...,Phones & Telecommunications
683,mellow biodegradable case for Galaxy S10e acce...,Phones & Telecommunications
6,Dual Color Leather + Natural Cloth Texture Cas...,Phones & Telecommunications
572,Google Pixel 2 or 2 XL 64GB - Unlocked Cellpho...,Phones & Telecommunications
645,2G + 3G + 4G + LTE Paddle Antenna with SMA Plu...,Phones & Telecommunications
166,*Closeout* STM Relocated Coolant Overflow Rese...,Phones & Telecommunications
233,Call Me Emoji [Free Download iPhone Emojis] Fr...,Phones & Telecommunications
213,Sago Palm Sago Palm Sago Palm,Phones & Telecommunications


In [188]:
final_match[final_match.Aliexpress == "Home"].sample(10)

Unnamed: 0,title,Aliexpress
72,10L Roll Top Waterproof Stuff Sack R-PVC By Ma...,Home
727,Northcott Shiver Me Whiskers 23027 74 Green Pi...,Home
31,Traeger Pure kosher Sea Salt By Jacobsen SP...,Home
480,Harry Potter Books 1-7 Special Edition Boxed S...,Home
184,Kalita Wave Kettle Merchandise Merchandise,Home
848,Bold Brocade Wallpaper in Blue and Grey from t...,Home
245,The Autumn Collection Crafted by Ohm Slaw - Pr...,Home
455,Green And Golden Leaves One-piece Swimsuit One...,Home
409,Hosta Fire Island PLANT PLANT,Home
24,"SMOKEA 7"" Can Rig w/ Honeycomb Dip Perc Rigs Rigs",Home


In [181]:
final_match.to_csv("final_match.csv")

In [182]:
cos_sim_df.to_csv("cos_sim.csv")