In [66]:
# http://localhost:8080/notebooks/git/product-category/notebooks/prep_20210302A1.ipynb
# Prefix of the prep run; it is interpolated into the file name of the CSV loaded below.
# NOTE(review): the URL above names prep_20210302A1 while this prefix is prep_20210302B1 — confirm which one is intended.
prfx_prp = 'prep_20210302B1'

In [67]:
import ast
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score




In [68]:
# Count threshold constant.
# NOTE(review): presumably the minimum number of occurrences a category needs to be kept as a label — confirm it is the value the category-vocabulary filter uses.
MIN_CNT = 50

In [69]:
%%time
# Read the sample CSV whose file name embeds prfx_prp; the path is relative to the current working directory.
df = pd.read_csv(f'../data/data_sample__{prfx_prp}.csv')

CPU times: user 83.1 ms, sys: 16 ms, total: 99.1 ms
Wall time: 135 ms


In [70]:
%%time
def try2eval(x):
    """Parse the text of a Python literal (e.g. a stringified list) into an object.

    Uses ``ast.literal_eval`` rather than ``eval``: the input comes from a CSV
    file, and ``eval`` would execute any expression stored in a cell.
    ``literal_eval`` only accepts literals (strings, numbers, tuples, lists,
    dicts, sets, booleans, ``None``).

    Parameters
    ----------
    x : str
        Text to parse, typically something like ``"['Clothing', 'Watches']"``.

    Returns
    -------
    object
        The parsed literal, or ``[]`` when ``x`` is empty, malformed, or not a
        plain literal (for example the bare word ``nan``).
    """
    # Strip surrounding whitespace so leading spaces do not become an
    # IndentationError on older Python versions.
    text = x.strip() if isinstance(x, str) else x
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # SyntaxError: empty or truncated text; ValueError: not a literal
        # (names, calls, ...); TypeError: e.g. unhashable dict key or non-string input.
        return []

# Blank out missing values before the string cast so they become "" instead of "nan".
df = df.fillna("").astype(str)
# After the cast the category column holds text; try2eval turns each cell back into a Python object
# (unparseable cells, including the empty string, become []).
df["category"] = df["category"].apply(try2eval)
print("df.shape:", df.shape)
# One space-separated text field per row, built from the four descriptive columns.
df["txt"] = df["title"] + " " + df["brand"] + " " + df["description"] + " " + df["feature"]

df.shape: (10000, 7)
CPU times: user 169 ms, sys: 0 ns, total: 169 ms
Wall time: 167 ms


In [75]:
# Domain vocabulary: occurrence count per domain, sorted names, and name -> index lookup.
dmn2cnt = Counter(df.domain.value_counts().to_dict())
i2dmn = sorted(dmn2cnt.keys())
dmn2i = {v:k for k,v in enumerate(i2dmn)}
# Category vocabulary: each row carries a list of categories, so flatten before counting.
cat2cnt = Counter([j for i in df.category for j in i])
# Keep only categories seen more than MIN_CNT times (strict '>'), using the
# notebook-level constant instead of a repeated magic number.
i2cat = sorted(k for k,v in cat2cnt.items() if v>MIN_CNT)
cat2i = {v:k for k,v in enumerate(i2cat)}

len(i2dmn), len(i2cat)

(23, 111)

## Split into train and validation sets

In [76]:
# Seed NumPy's global RNG so the random mask below is the same on every run.
np.random.seed(101)
# One uniform draw per row; rows whose draw exceeds 0.85 form the validation set.
msk_val = np.random.rand(len(df)) > 0.85
dftrn, dfval = df.loc[~msk_val], df.loc[msk_val]
dftrn.shape, dfval.shape

((8466, 8), (1534, 8))

## TF-IDF features and random forest model

In [77]:
%%time
# Unigram + bigram TF-IDF features. Terms must appear in at least 3 documents
# and in at most 90% of them. The three IDF/TF switches are boolean parameters,
# so pass real booleans: recent scikit-learn releases reject ints here.
vec = TfidfVectorizer(ngram_range=(1,2),
                      min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
                      smooth_idf=True, sublinear_tf=True)
# Fit the vocabulary and IDF weights on the training text only, then reuse them for validation.
Xtrn = vec.fit_transform(dftrn.txt)
Xval = vec.transform(dfval.txt)

Xtrn.shape, Xval.shape

CPU times: user 3.19 s, sys: 68.2 ms, total: 3.26 s
Wall time: 3.27 s


((8466, 57858), (1534, 57858))

In [78]:
# Multi-label indicator matrix: one row per product, one column per kept category.
n_rows, n_labels = len(df), len(i2cat)
ys = np.zeros((n_rows, n_labels))

for i, cats in enumerate(df["category"]):
    # Categories that were filtered out of the vocabulary have no column and are skipped.
    idx_pos = [cat2i[c] for c in cats if c in cat2i]
    ys[i, idx_pos] = 1

# Slice the targets with the same boolean mask that produced dftrn / dfval.
ys_trn, ys_val = ys[~msk_val], ys[msk_val]

In [79]:
# Small forest: 5 trees, at least 2 samples per leaf, half of the features
# considered at each split, trained on all available cores.
# NOTE(review): no random_state is passed, so the trees depend on NumPy's global RNG state — confirm this is intended.
rf_params = dict(
    n_estimators=5,
    min_samples_leaf=2,
    max_features=0.5,
    n_jobs=-1,
)
clf = RandomForestClassifier(**rf_params)

In [80]:
%%time
# Fit the forest on the TF-IDF training matrix against the multi-label indicator targets (one output per column of ys_trn).
clf.fit(Xtrn, ys_trn)

CPU times: user 2min 14s, sys: 50.3 ms, total: 2min 14s
Wall time: 30.3 s


RandomForestClassifier(max_features=0.5, min_samples_leaf=2, n_estimators=5,
                       n_jobs=-1)

In [81]:
# Sanity check: number of outputs the fitted model predicts; expected to equal the number of label columns in ys_trn.
clf.n_outputs_

111

In [82]:
# Hard 0/1 predictions and per-label class probabilities on the training set.
prds_trn = clf.predict(Xtrn)
prds_prob_trn = clf.predict_proba(Xtrn)

# First value: fraction of individual (row, label) cells predicted correctly.
# Second value: accuracy_score on the 2-D indicator arrays, i.e. the fraction of rows whose whole label vector matches.
(prds_trn == ys_trn).mean(), accuracy_score(y_true=ys_trn, y_pred=prds_trn)

(0.9940620989522477, 0.5922513583746751)

In [83]:
# Hard 0/1 predictions and per-label class probabilities on the validation set.
prds_val = clf.predict(Xval)
prds_prob_val = clf.predict_proba(Xval)

# First value: fraction of individual (row, label) cells predicted correctly.
# Second value: accuracy_score on the 2-D indicator arrays, i.e. the fraction of rows whose whole label vector matches.
(prds_val == ys_val).mean(), accuracy_score(y_true=ys_val, y_pred=prds_val)

(0.9859696724103504, 0.2255541069100391)

In [108]:
# Validation accuracy of each label column taken on its own, printed on one line in column order.
per_label_acc = (
    accuracy_score(ys_val[:, col], prds_val[:, col])
    for col in range(ys_val.shape[1])
)
print("".join(f"{acc:.2f}; " for acc in per_label_acc), end="")

0.99; 1.00; 0.99; 0.95; 0.99; 1.00; 0.99; 0.99; 0.99; 0.99; 0.99; 0.96; 1.00; 1.00; 1.00; 0.99; 0.99; 0.99; 0.99; 0.93; 0.99; 0.99; 1.00; 1.00; 0.97; 0.99; 0.99; 0.99; 0.98; 0.98; 0.99; 0.95; 0.97; 0.99; 0.98; 0.99; 1.00; 0.99; 1.00; 0.95; 0.99; 0.99; 0.99; 0.99; 0.99; 0.99; 0.99; 0.99; 0.99; 1.00; 1.00; 1.00; 0.99; 0.93; 0.97; 0.98; 0.99; 0.99; 0.98; 0.99; 1.00; 0.99; 0.98; 0.98; 0.99; 1.00; 0.99; 0.98; 0.97; 0.99; 0.96; 0.99; 0.99; 0.99; 0.99; 0.99; 0.99; 0.98; 0.98; 0.99; 0.98; 0.99; 0.98; 0.99; 0.98; 0.99; 1.00; 0.99; 0.99; 0.98; 0.99; 0.99; 1.00; 1.00; 0.99; 0.99; 0.97; 0.94; 0.99; 1.00; 0.98; 1.00; 0.96; 0.99; 0.99; 0.95; 0.99; 1.00; 1.00; 0.96; 1.00; 

## demo

In [145]:
# Pick one validation row at random and show its text next to the true and predicted labels.
i = np.random.choice(range(len(ys_val)))
dct = dict(dfval.iloc[i])
print(dct['txt'])

# Column indices of the positive labels; np.where on a 1-D row returns a 1-tuple.
(y,) = np.where(ys_val[i])
(prd,) = np.where(prds_val[i])
print('---')
# "Truth" is the raw category list; "y" is what survived the vocabulary filter.
print("Truth:", sorted(dct['category']))
print("y:", [i2cat[k] for k in y])
print("Prediction:", [i2cat[k] for k in prd])

Geneva Women's AMZ1004 Ceramic Mother-of-Pearl Enamel Bezel Watch Geneva Moderate A sleek and fashionable timepiece dressed in white ceramic, the Geneva Moderate Women's Custom Ceramic Mother-of-Pearl Enamel Bezel Watch will quickly grab the attention of any passersby with an eye for style. This exceptional timepiece begins with a round 34mm silver-toned all-alloy case with a fixed white enamel bezel and a textured crown. These elements hug a pristine white mother-of-pearl dial, which is protected by a hardened mineral crystal and features silver-toned index and Arabic numeral hour markers and luminous silver-toned hands powered by analog quartz movement. The dial also features three sub-dials offering 60-second, 30-minute and 1/10th-of-a-second displays. A solid all-alloy bracelet with white ceramic center links straps this watch to the wrist, while a double push-button fold-over safety clasp ensures its secure placement. Finally, this watch boasts water resistance up to 33 feet (10 m