# import Package

In [1]:
import numpy as np
import os
import nltk
import torch
import torch.nn
import torchtext.vocab as vocab
from sklearn import metrics
import pandas as pd
import warnings
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.linear_model import LinearRegression
from datetime import datetime
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
warnings.filterwarnings("ignore")


# Function definition

In [2]:
def load_data():
    df = pd.read_csv("./data/markets.csv", encoding='utf_8_sig')
    # print(df.columns)
    texts = df['name'].values
    df = df.fillna(-1)
    # labels = np.argmax(df[df.columns[-4:]].values, axis=1)
    labels = df['cat'].values
    label2id = {l: i for i, l in enumerate(set(labels))}
    id2label = {i: l for l, i in label2id.items()}
    labels = [label2id[l] for l in labels]
    return texts, labels, id2label

In [3]:
def text_preprocess(text):
    text = str(text)
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '\'', '/']
    text = "".join([(a if a not in english_punctuations else " ") for a in text])
    text = " ".join(nltk.tokenize.word_tokenize(text.lower()))
    return text

In [4]:
def evaluation(predictions, labels, id2label, model_name=None):
    acc = accuracy_score(labels, predictions)
    recall = recall_score(labels, predictions, average="macro")
    f1 = f1_score(labels, predictions, average="macro")
    report = metrics.classification_report(labels, predictions,
                                           target_names=[id2label[i] for i in range(len(id2label))])
    info = "acc:%s, recall:%s, f1 score:%s" % (acc, recall, f1)
    if model_name is not None:
        info = "%s: %s" % (model_name, info)
    print(info)
    print(report)

# Start Train

In [5]:
texts, labels, id2label = load_data()  
texts = [text_preprocess(t) for t in texts]

# TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer 
vectorizer = TfidfVectorizer() 
features=vectorizer.fit_transform(texts)
print("data len:", len(texts))
print("id2label",id2label)

data len: 13492
id2label {0: 'Frozen foods', 1: 'Meal ingredients', 2: 'Drinks - hot  cold', 3: 'Baking  cooking', 4: 'Baby care', 5: 'Pharmacy', 6: 'Personal care', 7: 'Breakfast foods', 8: 'Deli  chilled foods', 9: 'Biscuits  crackers', 10: 'Toys  party needs', 11: 'Meat', 12: 'Liquor - wine', 13: 'Fruit  vegetables', 14: 'Liquor - beer  cider', 15: 'Health  wellness', 16: 'Cleaning  homecare', 17: 'Pet care', 18: 'Chocolate, sweets  snacks', 19: 'Seafood', 20: 'Bakery', 21: 'Clothing  manchester', 22: 'Home  kitchenware', 23: 'Canned  prepared foods', 24: 'Office  entertainment'}


# Segmentation Data

In [7]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25,
                                                                            random_state=3)

# KNN

In [8]:
# knn
k = len(id2label)
model = KNeighborsClassifier(n_neighbors=k)
model.fit(x_train, y_train)
knn_predict_labels = model.predict(x_test)
evaluation(knn_predict_labels, y_test, id2label, "knn")

knn: acc:0.866587607471094, recall:0.8082734545361214, f1 score:0.8151671521848803
                           precision    recall  f1-score   support

             Frozen foods       0.80      0.89      0.84       210
         Meal ingredients       0.79      0.90      0.84       214
       Drinks - hot  cold       0.80      0.93      0.86       183
          Baking  cooking       0.79      0.75      0.77       197
                Baby care       0.97      0.93      0.95       125
                 Pharmacy       0.00      0.00      0.00         1
            Personal care       0.88      0.94      0.91       213
          Breakfast foods       0.85      0.85      0.85       136
      Deli  chilled foods       0.87      0.84      0.85       230
       Biscuits  crackers       0.90      0.93      0.91       108
        Toys  party needs       0.80      0.76      0.78        49
                     Meat       0.77      0.83      0.80        60
            Liquor - wine       0.96      0.9

# SVM

In [9]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
model = OneVsRestClassifier(SVC())
model.fit(x_train, y_train) 
svm_predict_labels = model.predict(x_test)
evaluation(svm_predict_labels, y_test, id2label, "svm")

svm: acc:0.9276608360509931, recall:0.8771445682314636, f1 score:0.8803862572150645
                           precision    recall  f1-score   support

             Frozen foods       0.88      0.91      0.90       210
         Meal ingredients       0.88      0.93      0.91       214
       Drinks - hot  cold       0.93      0.98      0.95       183
          Baking  cooking       0.88      0.85      0.87       197
                Baby care       0.96      0.98      0.97       125
                 Pharmacy       0.00      0.00      0.00         1
            Personal care       0.96      0.98      0.97       213
          Breakfast foods       0.92      0.97      0.94       136
      Deli  chilled foods       0.90      0.90      0.90       230
       Biscuits  crackers       0.97      0.97      0.97       108
        Toys  party needs       0.89      0.82      0.85        49
                     Meat       0.86      0.93      0.90        60
            Liquor - wine       1.00      1.

# LightGBM with GBDT

In [10]:
import lightgbm as lgb

def lgb_model(x_train, x_test, y_train, y_test, verbose):
    params = {'num_leaves': 60,
    'min_data_in_leaf': 30,
    'objective': 'multiclass',
    'num_class': 33,
    'max_depth': -1,
    'learning_rate': 0.03,
    "min_sum_hessian_in_leaf": 6,
    "boosting": "gbdt",
    "feature_fraction": 0.9,
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "bagging_seed": 11,
    "lambda_l1": 0.1,
    "verbosity": -1,
    "nthread": 15,
    'metric': 'multi_error',
    "random_state": 2020
          }

    model = lgb.train(params
                      , lgb.Dataset(x_train, y_train)
                      , num_boost_round = 100000
                      , valid_sets = [lgb.Dataset(x_test, y_test)]
                      , verbose_eval = verbose
                      , early_stopping_rounds = 500
                     )

    print('Predicting...')
    y_prob = model.predict(x_test, num_iteration=model.best_iteration)
    lgb_predict_labels = [list(x).index(max(x)) for x in y_prob]
    print("AUC score: {:<8.5f}".format(metrics.accuracy_score(lgb_predict_labels, y_test)))
    report = metrics.classification_report(y_test, lgb_predict_labels, target_names=[id2label[i] for i in range(len(id2label))])
    print(report)
    
    return lgb_predict_labels

In [11]:

lgb_predict_labels = lgb_model(x_train, x_test, y_train, y_test, verbose=False)

Predicting...
AUC score: 0.69819 
                           precision    recall  f1-score   support

             Frozen foods       0.79      0.68      0.73       210
         Meal ingredients       0.76      0.72      0.74       214
       Drinks - hot  cold       0.81      0.87      0.84       183
          Baking  cooking       0.62      0.47      0.54       197
                Baby care       0.92      0.88      0.90       125
                 Pharmacy       0.00      0.00      0.00         1
            Personal care       0.75      0.73      0.74       213
          Breakfast foods       0.68      0.59      0.63       136
      Deli  chilled foods       0.80      0.71      0.76       230
       Biscuits  crackers       0.93      0.81      0.86       108
        Toys  party needs       1.00      0.35      0.52        49
                     Meat       0.58      0.72      0.64        60
            Liquor - wine       0.97      0.88      0.93       241
        Fruit  vegetables  