In [1]:
import pandas as pd

In [2]:
# Load the product listings from the zipped CSV that sits next to the notebook folder.
# Note: later cells address several columns with a leading space in the name
# (e.g. " Category Label"), so the header is used exactly as it appears in the file.
DATASET_PATH = "../data/dataset2.zip"
df = pd.read_csv(DATASET_PATH)

# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,Product ID,Product Title,Merchant ID,Cluster ID,Cluster Label,Category ID,Category Label
0,1,apple iphone 8 plus 64gb silver,1,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
1,2,apple iphone 8 plus 64 gb spacegrau,2,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
3,4,apple iphone 8 plus 64gb space grey,4,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
4,5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones


In [3]:
# Number of listings per category label, most frequent first.
category_labels = df[" Category Label"]
category_labels.value_counts()

Fridge Freezers     5501
Mobile Phones       4081
Washing Machines    4044
CPUs                3862
Fridges             3584
TVs                 3564
Dishwashers         3424
Digital Cameras     2697
Microwaves          2342
Freezers            2212
Name:  Category Label, dtype: int64

In [4]:
# Number of listings per cluster label, most frequent first.
cluster_labels = df[" Cluster Label"]
cluster_labels.value_counts()

Canon IXUS 185              27
Samsung UE49NU7100          24
Canon PowerShot SX730 HS    24
Samsung UE65NU7100          23
Canon IXUS 285 HS           23
                            ..
Beko WDX8543130W             1
LG F1496TDA                  1
Logik L612WM16               1
Bosch WAN28280GB             1
Siemens WM14T470GB           1
Name:  Cluster Label, Length: 12849, dtype: int64

# Random forest on the first word of the cluster label

In [5]:
# Derive a coarse feature from the cluster label: its first whitespace-separated
# token. Only that token is needed, so splitting stops after the first separator.
first_tokens = df[" Cluster Label"].str.split(n=1).str.get(0)
df["first_word"] = first_tokens

# How many distinct first tokens are there?
df["first_word"].nunique()

284

In [15]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Single feature (first token of the cluster label) and the category id as target.
X, y = df["first_word"], df[" Category ID"]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Fit the token -> integer mapping on the training rows only and replace the
# training tokens with their integer codes (X_train is rebound to the codes).
le = LabelEncoder()
X_train = le.fit_transform(X_train)

In [16]:
# Display the distinct first-word tokens the encoder learned from the training split.
le.classes_

array(['AEG', 'AMD', 'ASUS', 'Acer', 'Adelberg', 'AgfaPhoto', 'Akai',
       'Alcatel', 'Aligator', 'Allcall', 'Allview', 'Amazon', 'Amica',
       'Amplicomms', 'Apple', 'Archos', 'Ariston', 'Audioline', 'Avtex',
       'Bauknecht', 'Baumatic', 'Bea-fon', 'Beafon', 'Beko', 'Belling',
       'Benq', 'Binatone', 'Blackberry', 'Blackview', 'Blaupunkt',
       'Blizzard', 'Blomberg', 'Blu', 'Bluboo', 'Bomann', 'Bosch', 'Bq',
       'Breville', 'Britannia', 'Bush', 'CASO', 'CAT', 'CDA', 'Candy',
       'Canon', 'Caple', 'Casio', 'Cello', 'Changhong', 'Cisco',
       'Clatronic', 'Constructa', 'Cookology', 'Cookworks', 'Crosscall',
       'Cubot', 'Cylinda', 'Cyrus', 'Daewoo', 'De', 'DeLonghi', 'Dell',
       'Denver', 'Dewalt', 'DigiHome', 'Digiquest', 'Disney', 'Dometic',
       'Doogee', 'Doro', 'Dyon', 'Dörr', 'E98CWW', 'Easypix', 'Ebac',
       'Elari', 'ElectrIQ', 'Electra', 'Electrolux', 'Elephone',
       'Emporia', 'Energizer', 'Energy', 'Essentials', 'Estar',
       'Exquisit', 'F

In [17]:
# Test tokens that the encoder did not learn from the training split cannot be
# encoded, so each of them is swapped for the placeholder "Apple" first.
def _known_or_placeholder(token):
    """Return `token` when the fitted encoder knows it, else the "Apple" placeholder."""
    return token if token in le.classes_ else "Apple"


X_test = [_known_or_placeholder(token) for token in X_test]

In [18]:
# Encode the test tokens with the mapping fitted on the training split.
# X_test is rebound from tokens to integer codes here, so this cell is meant to
# run exactly once, after the placeholder substitution.
X_test = le.transform(X_test)

In [19]:
# Small, shallow random forest on the single integer-coded token feature.
# NOTE(review): the codes follow the sorted order of le.classes_, so a depth-3
# tree can only cut that ordering into a few ranges - presumably a large part of
# why this baseline scores low; confirm with a one-hot encoding.
forest_params = {"n_estimators": 10, "max_depth": 3, "random_state": 42}
forest = RandomForestClassifier(**forest_params)

# The encoded features are one-dimensional; wrapping them in a DataFrame gives
# the estimator a single-column table.
train_frame = pd.DataFrame(X_train)
test_frame = pd.DataFrame(X_test)

forest.fit(train_frame, y_train)

# Accuracy on the held-out rows (displayed as the cell output).
forest.score(test_frame, y_test)

0.2790598895653405

In [20]:
# Per-class precision / recall / F1 on the held-out rows.
# Several categories are never predicted by this model, which leaves their
# precision undefined; zero_division=0 reports those scores as 0.0 explicitly
# instead of emitting UndefinedMetricWarning messages under the table.
test_predictions = forest.predict(pd.DataFrame(X_test))
print(classification_report(y_test, test_predictions, zero_division=0))

              precision    recall  f1-score   support

        2612       0.00      0.00      0.00       818
        2614       0.21      0.91      0.34       723
        2615       1.00      0.54      0.70       771
        2617       0.51      0.03      0.06       542
        2618       0.00      0.00      0.00       469
        2619       0.00      0.00      0.00       662
        2620       0.41      0.33      0.36       796
        2621       0.00      0.00      0.00       439
        2622       0.22      0.55      0.31      1115
        2623       0.00      0.00      0.00       728

    accuracy                           0.28      7063
   macro avg       0.23      0.24      0.18      7063
weighted avg       0.25      0.28      0.21      7063



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# TF-IDF + logistic regression on cluster label

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [23]:
# Features: the full cluster label text; target: the category id.
X, y = df[" Cluster Label"], df[" Category ID"]

# Hold out 20% of the rows with a fixed seed.
# NOTE(review): many cluster labels occur on several rows, and this split is
# per row, so identical label text can presumably end up in both train and
# test - consider a group-aware split if generalisation to new clusters matters.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
# Two-stage text classifier: TF-IDF vectorisation of the label text followed by
# logistic regression on the resulting features.
steps = [
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(random_state=42)),
]
pipe = Pipeline(steps)

# Fit vectoriser and classifier together on the training rows only.
pipe.fit(X_train, y_train)

In [25]:
# Predict the category of every held-out row with the fitted pipeline and
# summarise per-class precision / recall / F1.
predictions = pipe.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        2612       1.00      0.97      0.99       818
        2614       0.93      0.99      0.96       723
        2615       1.00      1.00      1.00       771
        2617       1.00      0.99      1.00       542
        2618       0.89      0.89      0.89       469
        2619       0.80      0.80      0.80       662
        2620       0.93      0.96      0.95       796
        2621       0.91      0.78      0.84       439
        2622       0.82      0.87      0.84      1115
        2623       0.88      0.82      0.85       728

    accuracy                           0.91      7063
   macro avg       0.91      0.91      0.91      7063
weighted avg       0.91      0.91      0.91      7063



# CatBoost on IDs

In [26]:
from catboost import CatBoostClassifier

In [40]:
# Imported here so this cell can decide on the training device by itself.
from catboost.utils import get_gpu_device_count

# Features: the two integer id columns; target: the category id.
# NOTE(review): if every Cluster ID belongs to exactly one category, a per-row
# split lets the model see each cluster during training, so the score below
# would mostly reflect an id -> category lookup - confirm before relying on it.
X = df[[" Merchant ID", " Cluster ID"]]
y = df[" Category ID"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train on the GPU when one is available, otherwise fall back to the CPU so the
# notebook also runs on machines without a CUDA device.
task_type = "GPU" if get_gpu_device_count() > 0 else "CPU"

# cat_features is deliberately not passed, so both id columns are used as plain
# numeric features; pass cat_features=[" Merchant ID", " Cluster ID"] to treat
# them as categorical instead.
cat = CatBoostClassifier(
    iterations=10,
    learning_rate=0.1,
    random_seed=42,
    logging_level="Silent",
    task_type=task_type,
)

cat.fit(X_train, y_train)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


<catboost.core.CatBoostClassifier at 0x7f7eb720c280>

In [41]:
# Predict categories for the held-out rows and print the per-class metrics.
pred = cat.predict(X_test)

report = classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

        2612       0.98      1.00      0.99       818
        2614       0.97      0.98      0.98       723
        2615       0.98      0.97      0.98       771
        2617       1.00      0.93      0.97       542
        2618       0.95      1.00      0.97       469
        2619       1.00      0.96      0.98       662
        2620       0.97      0.99      0.98       796
        2621       0.98      0.98      0.98       439
        2622       0.99      1.00      0.99      1115
        2623       0.99      1.00      1.00       728

    accuracy                           0.98      7063
   macro avg       0.98      0.98      0.98      7063
weighted avg       0.98      0.98      0.98      7063

