In [37]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
import pickle
import joblib
import xgboost as xgb # conda install -c anaconda py-xgboost

In [39]:
shopify_prod = pd.read_csv("shopifyProducts.csv")
ali_prod = pd.read_csv("aliProducts.csv")
ali_prod.drop_duplicates()
ali_dict = pd.read_csv("aliCategories.csv")

In [40]:
ali_dict

Unnamed: 0,categoryId,text
0,100003109,Women's Fashion
1,100003070,Men's Fashion
2,509,Phones & Telecommunications
3,7,Computer
4,44,Consumer Electronics
5,1509,Jewelry
6,15,Home
7,1524,Bags
8,26,Toys
9,18,Outdoor Fun & Sports


In [41]:
ali_prod = ali_prod.merge(ali_dict, left_on='categoryId', right_on='categoryId')
ali_prod = ali_prod[['text', 'titleModule.subject']]
ali_prod.columns = ['category', 'products']
ali_prod.head()

Unnamed: 0,category,products
0,Home,Superhero Girls Theme Party Favor Bags Candy B...
1,Home,Colored Mini Love Heart Wooden Clothespin Offi...
2,Home,Rose Gold 18th 21th 30th 40th 50th Birthday La...
3,Home,100 skeins embroidery floss embroidery thread ...
4,Home,2018 hot sale Lace accessories rice white blac...


In [42]:
consumer_electronic = ali_prod[ali_prod['category'] == "Consumer Electronics"]

In [43]:
consumer_electronic[consumer_electronic['products'].str.contains("sony")]

Unnamed: 0,category,products
666604,Consumer Electronics,Hotsell 2x bateria NP-BN1 np bn1 NPBN1 battery...
667677,Consumer Electronics,Ear pads 45mm 50mm 55mm 60mm 65mm 70mm 75mm 80...
667782,Consumer Electronics,Panoramic 360 Degree Rotation Swivel dslr Came...
668708,Consumer Electronics,TV Remote Controller control For RM-L1098+8 hu...
668994,Consumer Electronics,Micro single camera wide plate wrist band hand...
...,...,...
707838,Consumer Electronics,Universal Remote Control 1PCS RM-88E TV/VCD/DV...
707880,Consumer Electronics,wired Earphone for iphone 5 s 6s SE xiaomi bas...
707953,Consumer Electronics,100% Original SONY MH750 in Ear earphone BASS ...
708413,Consumer Electronics,Universal Remote Control For LG LCD LED HDTV 3...


In [44]:
shopify_prod.head()

Unnamed: 0,product_type,title
0,Sweater,Homebody Sweater
1,,Single Stone Zip Hoodie - Sulfur Black
2,Top,Subliminal Top
3,,2X4 Denim - Rinse
4,Dress,Peek-A-Vu Dress


In [45]:
shopify_prod.shape

(615943, 2)

In [46]:
# Filter missing. Not necessary though.
shopify_prod = shopify_prod[shopify_prod.product_type.notnull()]
shopify_prod.shape

(545980, 2)

In [47]:
ali_prod['category'].value_counts()

Home                           107307
Men's Fashion                   80847
Automobiles & Motorcycles       76090
Women's Fashion                 74405
Computer                        67782
Outdoor Fun & Sports            66664
Jewelry                         61206
Beauty, Health                  60290
Bags                            51846
Home Improvement                47495
Phones & Telecommunications     44301
Consumer Electronics            42178
Toys                            41394
Name: category, dtype: int64

In [48]:
map_for_target = {"Home": 0, "Men's Fashion": 1, "Automobiles & Motorcycles": 2, "Women's Fashion": 3, "Computer": 4,
                  "Outdoor Fun & Sports": 5, "Jewelry": 6, "Beauty, Health": 7, "Bags": 8, "Home Improvement": 9, 
                  "Phones & Telecommunications": 10, "Consumer Electronics": 11, "Toys": 12}
inv_map = {v: k for k, v in map_for_target.items()}

In [49]:
inv_map

{0: 'Home',
 1: "Men's Fashion",
 2: 'Automobiles & Motorcycles',
 3: "Women's Fashion",
 4: 'Computer',
 5: 'Outdoor Fun & Sports',
 6: 'Jewelry',
 7: 'Beauty, Health',
 8: 'Bags',
 9: 'Home Improvement',
 10: 'Phones & Telecommunications',
 11: 'Consumer Electronics',
 12: 'Toys'}

In [50]:
inv_map = {0: 15,
           1: 100003070,
           2: 34,
           3: 100003109,
           4: 7,
           5: 18,
           6: 1509,
           7: 66,
           8: 1524,
           9: 13,
           10: 509,
           11: 44,
           12: 26
}

In [51]:
with open("categories.pickle", "wb") as handle:
    pickle.dump(inv_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [52]:
ali_prod['target'] = ali_prod['category'].map(map_for_target)
ali_prod.head()

Unnamed: 0,category,products,target
0,Home,Superhero Girls Theme Party Favor Bags Candy B...,0
1,Home,Colored Mini Love Heart Wooden Clothespin Offi...,0
2,Home,Rose Gold 18th 21th 30th 40th 50th Birthday La...,0
3,Home,100 skeins embroidery floss embroidery thread ...,0
4,Home,2018 hot sale Lace accessories rice white blac...,0


In [54]:
X = ali_prod['products'].values
y = ali_prod['target'].values

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [56]:
X_train[:10]

array(['Adomaner Cosmetic Brush Cleaning Tool Egg Makeup Brushes Washing Tool Foundation Powder Seiketsu Outil Silicone Scrubber Cleaner',
       'Women Summer Boho Dress Formal One Shoulder Party Bodycon Ladies Clubwear Sundress',
       'Electric Hot Stone Needle Scraping Board Scraping Knife Heating Scraper Rib GuaSha Instrument for Massage Meridian Gua Sha Tool',
       '21cm Dragon Ball Super Saiyan Vegeta Trunks PVC Action Figures dramatic showcase Dragonball Model Toy Doll Figuras',
       'KMC Chain HP1L K-Z7 K-Z99 chain super light chain mtb road bicycle',
       '2019 Women Cotton Socks Animal Cat Art Animation Character Cute Gift Dress Sock happy funny socks fancy crazy sox L311',
       'Smokeless Moxibustion + Aromatherapy Moxa Red LED Light Device Foot Massager With Heat For Health Care Pain Relief Therapy',
       'Yomay women Sports Bras Fitness Sports Bra Top Shockproof Shapes Quick Dry Running Gym Adjustable Underwear push up Yoga Bra Top',
       'Black/White/Gold Fo

In [57]:
vectorizer_train = TfidfVectorizer(min_df=1)
vec_form = vectorizer_train.fit_transform(X_train)

In [58]:
svd = TruncatedSVD(n_components=50, n_iter=100)
svd_train_topic = svd.fit_transform(vec_form)

In [61]:
# Random Forest as initial choice.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=5, n_jobs=-1)

In [62]:
rf_classifier.fit(svd_train_topic, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)

In [63]:
vec_form_test = vectorizer_train.transform(X_test)
svd_test_topic = svd.transform(vec_form_test)

In [64]:
xgb_class = xgb.XGBClassifier(random_state=12, learning_rate=0.01,
                              n_estimators=100, n_jobs=6)
xgb_class.fit(svd_train_topic, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.01, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=6,
              nthread=None, objective='multi:softprob', random_state=12,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [65]:
y_pred_xgb = xgb_class.predict(svd_test_topic)

In [67]:
y_pred_rf = rf_classifier.predict(svd_test_topic)

In [68]:
accuracy_score(y_test, y_pred_xgb)

0.7207123344345678

In [69]:
accuracy_score(y_test, y_pred_rf)

0.869561513984461

In [70]:
# All dataset
vectorizer_all = TfidfVectorizer(min_df=1)
vec_all_data = vectorizer_all.fit_transform(X)

In [71]:
joblib.dump(vectorizer_all, "vectorize.pkl")

['vectorize.pkl']

In [72]:
with open("tf_idf.pickle", "wb") as handle:
    pickle.dump(vectorizer_all, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [73]:
svd_all = TruncatedSVD(n_components=50, n_iter=100)
svd_all_topic = svd_all.fit_transform(vec_all_data)

In [74]:
joblib.dump(svd_all, "svd.pkl")

['svd.pkl']

In [75]:
with open("svd.pickle", "wb") as handle:
    pickle.dump(svd_all, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [76]:
clf = RandomForestClassifier(n_estimators=100, random_state=5, n_jobs=-1)
clf.fit(svd_all_topic, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)

In [77]:
joblib.dump(clf, "clf.pkl")

['clf.pkl']

In [78]:
with open("rf_clf.pickle", "wb") as handle:
    pickle.dump(clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [79]:
with open('rf_clf.pickle', 'rb') as handle:
    clf_2 = pickle.load(handle)

In [80]:
vec_for_pred = vectorizer_all.transform(shopify_prod.title)

In [81]:
svd_for_pred = svd_all.transform(vec_for_pred)

In [82]:
target_pred = clf.predict(svd_for_pred)

In [83]:
target_pred[:10]

array([3, 3, 3, 3, 3, 1, 1, 4, 1, 0], dtype=int64)

In [84]:
shopify_prod['Cat'] = target_pred
shopify_prod['Aliexpress_category'] = shopify_prod['Cat'].map(inv_map)
shopify_prod[:5]

Unnamed: 0,product_type,title,Cat,Aliexpress_category
0,Sweater,Homebody Sweater,3,100003109
2,Top,Subliminal Top,3,100003109
4,Dress,Peek-A-Vu Dress,3,100003109
6,Dress,Pop Quiz Dress,3,100003109
8,Top,Leo Rising Top,3,100003109


In [85]:
shopify_prod = shopify_prod[['product_type', 'title', 'Aliexpress_category']]

In [86]:
shopify_prod.to_csv("ml_match.csv", header=True, index=False)

In [87]:
shopify_prod[477330:477338]

Unnamed: 0,product_type,title,Aliexpress_category
534967,"Women,Women's Clothing,Women's_Lingerie, Sleep...",SHEIN | Plus Applique Floral Lace Bralette,100003109
534968,Fashion Accessories,Monogrammed Hip Flask,100003070
534969,Sweats,Sweat Homme Gizmo,100003070
534970,"Women,Women's Clothing,Women's_Dresses,Women,C...",SHEIN | Plus Cat Print Cami Dress,100003109
534971,Apparel,Cut Out Bralette,100003109
534972,Bath & Beauty,Essential Oil Set,15
534973,Apparel,Breeze Strappy Tank,44
534974,"Women,Women's Clothing,Women's_Pants,Women,Clo...",adidas by Stella McCartney | Performance Essen...,18


In [88]:
shopify_prod['Aliexpress_category'].value_counts()

15           97985
18           74783
100003070    65871
100003109    52650
44           50914
13           39564
66           36712
1509         32091
7            31622
26           20771
34           20675
1524         11475
509          10867
Name: Aliexpress_category, dtype: int64

In [98]:
shopify_prod.head()

Unnamed: 0,product_type,title,Aliexpress_category
0,Sweater,Homebody Sweater,100003109
2,Top,Subliminal Top,100003109
4,Dress,Peek-A-Vu Dress,100003109
6,Dress,Pop Quiz Dress,100003109
8,Top,Leo Rising Top,100003109


In [122]:
def getNumericCategoryId(categoryName):
    return list(ali_dict[ali_dict.text == categoryName].categoryId)[0]

In [123]:
getNumericCategoryId("Outdoor Fun & Sports")

18

In [129]:
shopify_prod = shopify_prod[['Aliexpress_category', 'title']]

In [131]:
shopify_prod[shopify_prod.Aliexpress_category == getNumericCategoryId("Home") ].sample(n = 10)

Unnamed: 0,Aliexpress_category,title
90451,15,ADP Cirno - Touhou Project Anime Dakimakura Ja...
355987,15,Kitchen Chef Knife Set 2 - Olive wood Handle a...
88951,15,Pirate Treasure Party Cup 16 fl oz | 1ct
58517,15,Logo Colorblock Sweater
264806,15,Gamma Phase 2 Butterfly Knife 2.0
14052,15,Buddz New cute smiley face purse
444239,15,Le Specs Revolution Gold
215819,15,Cozy Habits Ivory Multi Stripe Eyelash Sweater
522343,15,Striped Shawl Collar Pocket Cardigan in Toasty...
236755,15,Prosecco Scented Gold Glitter Bath Bombs


In [125]:
shopify_prod[shopify_prod.Aliexpress_category == getNumericCategoryId("Outdoor Fun & Sports")].sample(10)

Unnamed: 0,product_type,title,Aliexpress_category
341689,Socks,Army Trainer Socks Unisex Crew Sock,18
35618,underwear,Bam(Bare) Pimaluxe Boxer Brief 2-Pack Bundle,18
113809,ACC_Decorative Object,"Lys Outdoor Flameless Candle, Set of 3 - Ivory",18
130950,Belt,Slay Yes Corset Belt,18
233926,Bras,Fantasie Illusion Underwire Side Support Bra N...,18
522536,Light,Pelican 7070R Tactical Flashlight,18
154343,Hand Tools,21-Piece Precision Electronics Driver Set,18
563438,Bottom,HORIZON STRIPE RACHEL BIKINI BOTTOM,18
302956,Sunglasses,Willow Black Hex Sunglasses,18
184999,Sillones Individuales,Columbus Sillón Individual de Tela Quantum - V...,18


In [126]:
shopify_prod[shopify_prod.Aliexpress_category == getNumericCategoryId("Men's Fashion")].sample(10)

Unnamed: 0,product_type,title,Aliexpress_category
378781,Long Sleeve T-Shirt,Scherzday Long Sleeve T-Shirt,100003070
427798,Soft Shell Jacket,Arc'teryx LEAF Patrol Jacket AR,100003070
278459,Jackets & Sweatshirts,Gremlins Shadow Long-Sleeve T-shirt,100003070
278797,SHIRTS,"""THE EVAN DENIM"" T-SHIRT",100003070
378884,Long Sleeve T-Shirt,Pitchers&Catchers Long Sleeve T-Shirt,100003070
394812,Long Sleeve Shirt,Jordan Retro 1 SBB Shattered Backboard - TALK ...,100003070
25188,Watches,Ice-Watch Ice Slim Black Matte Women's Watch 0...,100003070
65803,Silicone Rings,Men's Strata Realtree Antler Silicone Ring - O...,100003070
500034,HOODIE,Diagonal Vintage Over Hoodie - Anthracite Mult...,100003070
586240,TOPS,Secret Lasagna X Garfield X The Hundreds T-Shirt,100003070


In [128]:
shopify_prod[shopify_prod.Aliexpress_category == getNumericCategoryId("Phones & Telecommunications")].sample(10)

Unnamed: 0,product_type,title,Aliexpress_category
399509,iPhone 11 Pro Max Case,Smiley Flowers iPhone 11 Pro Max Case,509
399928,iPhone Xr Case,Neon Flames iPhone Xr Case,509
322558,Camo Cases,Navy Camo iPhone Case,509
579306,Supplement,Max Pump,509
528829,Phone Cases,3D Retro Camera Lanyard Phone holder Case for ...,509
567549,Screen Protector,"LK for Samsung Galaxy Note 9 Screen Protector,...",509
281811,Book,Genesys RPG: Expanded Player's Guide Hardcover...,509
418892,Galaxy A7 2018,Luxury Artistic Marble Glass Case for Galaxy A...,509
207558,Phone Cases,Korean Cartoon Drinks iPhone Case,509
100074,Projector Lamps,SAMSUNG HLR5667 (BP96-00826A) Projection TV As...,509
