# import Package

In [1]:
import numpy as np
import os
import nltk
import torch
import torch.nn
import torchtext.vocab as vocab
from sklearn import metrics
import pandas as pd
import warnings
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.linear_model import LinearRegression
from datetime import datetime
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
warnings.filterwarnings("ignore")


# Function definition

In [2]:
def load_data():
    df = pd.read_csv("./data/markets.csv", encoding='utf_8_sig')
    # print(df.columns)
    texts = df['name'].values
    df = df.fillna(-1)
    # labels = np.argmax(df[df.columns[-4:]].values, axis=1)
    labels = df['cat'].values
    label2id = {l: i for i, l in enumerate(set(labels))}
    id2label = {i: l for l, i in label2id.items()}
    labels = [label2id[l] for l in labels]
    return texts, labels, id2label

In [3]:
def text_preprocess(text):
    text = str(text)
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '\'', '/']
    text = "".join([(a if a not in english_punctuations else " ") for a in text])
    text = " ".join(nltk.tokenize.word_tokenize(text.lower()))
    return text

In [4]:
def load_embeddings():
    
    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)

    word2vec = vocab.Vectors(name=r'./word2vec/word2vec_300dim.txt', cache=cache)

    return word2vec

In [5]:
def encode_text_to_features(vector, text):
    vectors = vector.get_vecs_by_tokens(text.split())
    sentence_vector = torch.mean(vectors, dim=0)
    return sentence_vector.tolist()

In [6]:
def evaluation(predictions, labels, id2label, model_name=None):
    acc = accuracy_score(labels, predictions)
    recall = recall_score(labels, predictions, average="macro")
    f1 = f1_score(labels, predictions, average="macro")
    report = metrics.classification_report(labels, predictions,
                                           target_names=[id2label[i] for i in range(len(id2label))])
    info = "acc:%s, recall:%s, f1 score:%s" % (acc, recall, f1)
    if model_name is not None:
        info = "%s: %s" % (model_name, info)
    print(info)
    print(report)

# Prepare corpus

In [7]:
data = pd.read_csv('./data/markets.csv', encoding='utf_8_sig')
data.head()

Unnamed: 0.1,Unnamed: 0,name,cat
0,0,little ones baby wipes unscented,Baby care
1,1,essentials baby wipes fragrance free,Baby care
2,2,huggies baby wipes unscented,Baby care
3,3,huggies essentials walker nappies unisex 13-18...,Baby care
4,4,silk baby wipes fragrance free,Baby care


In [8]:
data = data.drop(['Unnamed: 0'], axis=1)
data.head()

Unnamed: 0,name,cat
0,little ones baby wipes unscented,Baby care
1,essentials baby wipes fragrance free,Baby care
2,huggies baby wipes unscented,Baby care
3,huggies essentials walker nappies unisex 13-18...,Baby care
4,silk baby wipes fragrance free,Baby care


In [9]:
data = data.apply([text_preprocess])

In [10]:
data.columns = ['name', 'cat']

In [11]:
data

Unnamed: 0,name,cat
0,little ones baby wipes unscented,baby care
1,essentials baby wipes fragrance free,baby care
2,huggies baby wipes unscented,baby care
3,huggies essentials walker nappies unisex 13-18...,baby care
4,silk baby wipes fragrance free,baby care
...,...,...
13487,daim chocolate bar,chocolate sweets snacks
13488,romanos supreme pizza minis 450g,deli chilled foods
13489,san remo gluten free pasta fettuccine,health wellness
13490,watties wok creations stir fry sauce malaysian...,meal ingredients


In [12]:
newfile = open('./data/corpus.txt', 'w', encoding='utf_8_sig')
for i in range(len(data['name'])):
    newline = data['name'][i] + ' ' + data['cat'][i] + '\n'
    newfile.write(newline)

# Train Word2vec

In [13]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing

In [14]:
readlines = open('./data/corpus.txt', 'r', encoding='utf_8_sig')

model = Word2Vec(LineSentence(readlines),
                 size=300, window=5, min_count=10, sample=1e-5,
                 workers=multiprocessing.cpu_count(), iter=5,)

model.wv.save_word2vec_format(r'./word2vec/word2vec_300dim.txt', binary=False)

# Start Train

In [15]:
texts, labels, id2label = load_data()
texts = [text_preprocess(t) for t in texts]
vector = load_embeddings()
features = [encode_text_to_features(vector, text) for text in texts]
print("data len:", len(texts))
print("id2label",id2label)

data len: 13492
id2label {0: 'Breakfast foods', 1: 'Bakery', 2: 'Office  entertainment', 3: 'Canned  prepared foods', 4: 'Meat', 5: 'Deli  chilled foods', 6: 'Baby care', 7: 'Seafood', 8: 'Liquor - beer  cider', 9: 'Frozen foods', 10: 'Meal ingredients', 11: 'Pharmacy', 12: 'Health  wellness', 13: 'Fruit  vegetables', 14: 'Clothing  manchester', 15: 'Biscuits  crackers', 16: 'Home  kitchenware', 17: 'Personal care', 18: 'Chocolate, sweets  snacks', 19: 'Drinks - hot  cold', 20: 'Toys  party needs', 21: 'Pet care', 22: 'Baking  cooking', 23: 'Cleaning  homecare', 24: 'Liquor - wine'}


# Segmentation Data

In [16]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25,
                                                                            random_state=3)

# SVM

In [26]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
model = OneVsRestClassifier(SVC())#C=1, gamma=20, decision_function_shape='ovr'))
model.fit(x_train, y_train) 
predict_labels = model.predict(x_test)
evaluation(predict_labels, y_test, id2label, "svm")

svm: acc:0.8920841980432849, recall:0.8389959213418703, f1 score:0.8394722611556472
                           precision    recall  f1-score   support

          Breakfast foods       0.87      0.91      0.89       136
                   Bakery       0.94      0.86      0.90       118
    Office  entertainment       0.61      0.77      0.68        52
   Canned  prepared foods       0.92      0.89      0.91       162
                     Meat       0.81      0.90      0.85        60
      Deli  chilled foods       0.83      0.87      0.85       230
                Baby care       0.94      0.96      0.95       125
                  Seafood       0.71      0.46      0.56        26
     Liquor - beer  cider       0.94      1.00      0.97        59
             Frozen foods       0.91      0.93      0.92       210
         Meal ingredients       0.86      0.86      0.86       214
                 Pharmacy       0.00      0.00      0.00         1
         Health  wellness       0.79      0.

# KNN

In [18]:
# knn
k = len(id2label)
model = KNeighborsClassifier(n_neighbors=k)
model.fit(x_train, y_train)
predict_labels = model.predict(x_test)
evaluation(predict_labels, y_test, id2label, "knn")

knn: acc:0.7512600059294396, recall:0.6973786987764538, f1 score:0.7231536041824554
                           precision    recall  f1-score   support

          Breakfast foods       0.86      0.67      0.75       136
                   Bakery       0.88      0.71      0.79       118
    Office  entertainment       0.74      0.50      0.60        52
   Canned  prepared foods       0.90      0.71      0.79       162
                     Meat       0.75      0.73      0.74        60
      Deli  chilled foods       0.93      0.71      0.81       230
                Baby care       0.96      0.82      0.88       125
                  Seafood       0.60      0.35      0.44        26
     Liquor - beer  cider       0.93      0.90      0.91        59
             Frozen foods       0.85      0.74      0.79       210
         Meal ingredients       0.85      0.77      0.81       214
                 Pharmacy       0.00      0.00      0.00         1
         Health  wellness       0.66      0.