# import Package

In [1]:
import numpy as np
import os
import nltk
import torch
import torch.nn
import torchtext.vocab as vocab
from sklearn import metrics
import pandas as pd
import warnings
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.linear_model import LinearRegression
from datetime import datetime
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
warnings.filterwarnings("ignore")


# Function definition

In [2]:
def load_data():
    df = pd.read_csv("./data/data_augumented.csv", encoding='utf_8_sig')
    # print(df.columns)
    texts = df['name'].values
    df = df.fillna(-1)
    # labels = np.argmax(df[df.columns[-4:]].values, axis=1)
    labels = df['cat'].values
    label2id = {l: i for i, l in enumerate(set(labels))}
    id2label = {i: l for l, i in label2id.items()}
    labels = [label2id[l] for l in labels]
    return texts, labels, id2label

In [3]:
def text_preprocess(text):
    text = str(text)
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '\'', '/']
    text = "".join([(a if a not in english_punctuations else " ") for a in text])
    text = " ".join(nltk.tokenize.word_tokenize(text.lower()))
    return text

In [4]:
def load_embeddings():
    
    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)

    word2vec = vocab.Vectors(name=r'./word2vec/word2vec_300dim.txt', cache=cache)

    return word2vec

In [5]:
def encode_text_to_features(vector, text):
    vectors = vector.get_vecs_by_tokens(text.split())
    sentence_vector = torch.mean(vectors, dim=0)
    return sentence_vector.tolist()

In [6]:
def evaluation(predictions, labels, id2label, model_name=None):
    acc = accuracy_score(labels, predictions)
    recall = recall_score(labels, predictions, average="macro")
    f1 = f1_score(labels, predictions, average="macro")
    report = metrics.classification_report(labels, predictions,
                                           target_names=[id2label[i] for i in range(len(id2label))])
    info = "acc:%s, recall:%s, f1 score:%s" % (acc, recall, f1)
    if model_name is not None:
        info = "%s: %s" % (model_name, info)
    print(info)
    print(report)

# Prepare corpus

In [7]:
data = pd.read_csv('./data/data_augumented.csv', encoding='utf_8_sig')
data.head()

Unnamed: 0,name,cat
0,ones baby wipes unscented,Baby care
1,unscented ones baby wipes little,﻿Baby care
2,baby ones little wipes unscented,﻿Baby care
3,little ones baby wipes rub unscented,﻿Baby care
4,little ones baby wipes unscented,﻿Baby care


In [8]:
data = data.apply([text_preprocess])

In [9]:
data.columns = ['name', 'cat']

In [10]:
data

Unnamed: 0,name,cat
0,ones baby wipes unscented,baby care
1,unscented ones baby wipes little,﻿baby care
2,baby ones little wipes unscented,﻿baby care
3,little ones baby wipes rub unscented,﻿baby care
4,little ones baby wipes unscented,﻿baby care
...,...,...
121423,ceres organics lentils organic fertilizer orga...,meal ingredients
121424,ceres organic fertilizer organics lentils orga...,meal ingredients
121425,ceres organics organic lentils brown,meal ingredients
121426,ceres brown lentils organic organics,meal ingredients


In [11]:
newfile = open('./data/corpus.txt', 'w', encoding='utf_8_sig')
for i in range(len(data['name'])):
    newline = data['name'][i] + ' ' + data['cat'][i] + '\n'
    newfile.write(newline)

# Train Word2vec

In [12]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing

In [13]:
readlines = open('./data/corpus.txt', 'r', encoding='utf_8_sig')

model = Word2Vec(LineSentence(readlines),
                 size=300, window=5, min_count=10, sample=1e-5,
                 workers=multiprocessing.cpu_count(), iter=5,)

model.wv.save_word2vec_format(r'./word2vec/word2vec_300dim.txt', binary=False)

# Start Train

In [14]:
texts, labels, id2label = load_data()
texts = [text_preprocess(t) for t in texts]
vector = load_embeddings()
features = [encode_text_to_features(vector, text) for text in texts]
print("data len:", len(texts))
print("id2label",id2label)

data len: 121428
id2label {0: 'Home  kitchenware', 1: 'Pet care', 2: 'Clothing  manchester', 3: 'Frozen foods', 4: 'Bakery', 5: 'Baking  cooking', 6: 'Baby care', 7: 'Health  wellness', 8: 'Pharmacy', 9: 'Personal care', 10: 'Biscuits  crackers', 11: 'Toys  party needs', 12: 'Breakfast foods', 13: 'Canned  prepared foods', 14: 'Cleaning  homecare', 15: 'Liquor - wine', 16: 'Fruit  vegetables', 17: 'Deli  chilled foods', 18: 'Meat', 19: 'Seafood', 20: 'Office  entertainment', 21: '\ufeffBaby care', 22: 'Liquor - beer  cider', 23: 'Drinks - hot  cold', 24: 'Meal ingredients', 25: 'Chocolate, sweets  snacks'}


# Segmentation Data

In [15]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25,
                                                                            random_state=3)

# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
model = OneVsRestClassifier(SVC()) #C=1, gamma=20, decision_function_shape='ovr')) 
model.fit(x_train, y_train) 
predict_labels = model.predict(x_test)
evaluation(predict_labels, y_test, id2label, "svm")

# KNN

In [None]:
# knn
k = len(id2label)
model = KNeighborsClassifier(n_neighbors=k)
model.fit(x_train, y_train)
predict_labels = model.predict(x_test)
evaluation(predict_labels, y_test, id2label, "knn")