In [1]:
import lxml
import json
import copy
import string
import warnings
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.svm import SVC
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings("ignore")

In [2]:
Labels = ['country', 'id', 'title', 'c1', 'c2', 'c3', 'description', 'price', 'type']

In [3]:
df = pd.read_csv('data_train.csv',names = Labels)

In [4]:
def cleaning_data(df, tfidf_vectorizer, tfidf_vectorizer_train=None):

    df.drop(['country','id','price','type'],inplace=True,axis=1)

    def replace_all(text, sym=['<ul>', '</ul>', '<li>', '</li>']):
        for i in sym:
            text = text.replace(i, ' ')

        textSec = []
        for i in text.split(' '):
            textTh = []
            for j in i:
                textTh.append(j if ord(j) < 128 else ' ')
            textTh = ''.join(textTh).strip().split(' ')

            for t in textTh:
                if t != '':
                    textSec.append(t)

        return ' '.join(textSec)

    def remove_puncs(text):
        punctuations = '''!()–+=-[]{};:'"\,<>./?@#$%^&*_~'''
        no_punct = ""
        for char in text:
            if char not in punctuations:
                no_punct += char

        return no_punct

    def PreProcess(df, cols=['title','description']):
        for col in cols:
            df[col].fillna('', inplace=True)
            df[col] = df[col].apply(lambda x: replace_all(x))
            if col == 'description':
                df[col] = df[col].apply(lambda x: BeautifulSoup(x, "html.parser").text)
            df[col] = df[col].apply(lambda x: remove_puncs(x))

    PreProcess(df)

    df['c3'] = df['c3'].astype(str)

    Y1 = df['c1']
#     y1_le = LabelEncoder()
#     Y1 = y1_le.fit_transform(Y1)
    
#     print(y1_le)

    Y2 = df['c2']
    Y2 = LabelEncoder().fit_transform(Y2)

    Y3 = df['c3']
    Y3 = LabelEncoder().fit_transform(Y3)

    X = df.drop(['c1','c2','c3'],axis=1)

    X = X['title'] +' '+ X['description']
    
    X_tfidf = tfidf_vectorizer.fit_transform(X)
    
    if tfidf_vectorizer_train:
    
        corpus_vocabulary = defaultdict(None, copy.deepcopy(tfidf_vectorizer_train.vocabulary_))
        corpus_vocabulary.default_factory = corpus_vocabulary.__len__
    
        for word in tfidf_vectorizer.vocabulary_.keys():
            if word in tfidf_vectorizer_train.vocabulary_:
                corpus_vocabulary[word]
                
        tfidf_vectorizer = TfidfVectorizer(vocabulary=corpus_vocabulary)
        X_tfidf = tfidf_vectorizer.fit_transform(X)
    
    return X_tfidf, Y1, Y2, Y3
    
# c_vec = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# X_train_tfidf, Y1, Y2, Y3 = cleaning_data(df, c_vec, tfidf_transformer)
X_train_tfidf, Y1, Y2, Y3 = cleaning_data(df, tfidf_vectorizer)

LabelEncoder()


In [5]:
X_train_tfidf.size

1197340

# For Catergory 1

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, Y1, test_size=0.3, random_state=42)
clf_svm_cat_one = svm.LinearSVC().fit(X_train, y_train)

In [7]:
y_pred_one = clf_svm_cat_one.predict(X_test)

In [8]:
y_pred_one, y_test

(array([8, 0, 8, ..., 8, 6, 3]), array([8, 0, 4, ..., 8, 6, 3]))

In [9]:
accuracy_score(y_test, y_pred_one)

0.9382636655948553

In [10]:
print(classification_report(y_test, y_pred_one))

              precision    recall  f1-score   support

           0       0.94      0.93      0.93       610
           1       0.93      0.90      0.91       897
           2       0.97      0.97      0.97      1756
           3       0.95      0.95      0.95      1144
           4       0.90      0.91      0.91      1847
           5       0.92      0.88      0.90       452
           6       0.95      0.97      0.96      2139
           7       0.87      0.87      0.87       733
           8       0.96      0.96      0.96      1307

    accuracy                           0.94     10885
   macro avg       0.93      0.93      0.93     10885
weighted avg       0.94      0.94      0.94     10885



# For Catergory 2

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, Y2, test_size=0.3, random_state=42)
clf_svm = svm.LinearSVC().fit(X_train, y_train)

In [12]:
y_pred = clf_svm.predict(X_test)

In [13]:
accuracy_score(y_test, y_pred)

0.8741387230133211

In [14]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2027
           1       0.84      0.84      0.84       343
           2       0.77      0.72      0.74        68
           3       0.69      0.59      0.64        95
           4       0.76      0.62      0.68       119
           5       0.92      0.93      0.93       104
           6       0.78      0.39      0.52        18
           7       0.82      0.86      0.84       223
           8       0.86      0.83      0.85       449
           9       0.94      0.91      0.93        34
          10       0.90      0.90      0.90       106
          11       0.80      0.60      0.69        20
          12       1.00      0.50      0.67        34
          13       0.78      0.78      0.78        23
          14       0.95      0.99      0.97       147
          15       0.88      0.95      0.91       119
          16       0.94      0.96      0.95        95
          17       0.79    

# For Catergory 3

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, Y3, test_size=0.3, random_state=42)
clf_svm = svm.LinearSVC().fit(X_train, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [36283, 11838]

In [16]:
y_pred = clf_svm.predict(X_test)

[179 184 179 ... 106 184  41]


In [17]:
accuracy_score(y_test, y_pred)

0.7841065686724851

In [18]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.41      0.50      0.45        14
           1       0.79      0.83      0.81       170
           2       0.67      0.72      0.69        25
           3       0.32      0.41      0.36        27
           4       0.95      1.00      0.98        20
           5       0.77      0.87      0.82        23
           6       0.40      0.24      0.30        17
           7       0.60      0.35      0.44        17
           8       0.65      0.48      0.55        46
           9       0.90      0.95      0.92       271
          10       0.69      0.74      0.72        46
          11       0.67      0.38      0.48        16
          12       0.75      0.69      0.72        35
          13       0.71      0.50      0.59        20
          14       0.71      0.62      0.66       137
          15       0.91      0.85      0.88       113
          16       0.60      0.43      0.50        28
          17       0.97    

# Checking Test Data

In [19]:
test_df = pd.read_csv('data_test.csv',names = Labels)

# For Cat 1

In [20]:
tfidf_vectorizer_test = TfidfVectorizer()

X_test_tfidf, Y1, Y2, Y3 = cleaning_data(test_df, tfidf_vectorizer_test, tfidf_vectorizer)

y_pred_one = clf_svm_cat_one.predict(X_test_tfidf)

LabelEncoder()


In [21]:
accuracy_score(Y1, y_pred_one)

0.935462071295827

In [22]:
# ### DEBUG
query = 'Leather/Plastic Case for 7"-8" Tablet (Pink) with Free 0118 Sports Bottle (Color May Vary) Ammar'

def pred_query(query):
    
    query = remove_puncs(replace_all(query))

    corpus_vocabulary = defaultdict(None, copy.deepcopy(tfidf_vectorizer.vocabulary_))
    corpus_vocabulary.default_factory = corpus_vocabulary.__len__

    tfidf_transformer_query = TfidfVectorizer()
    tfidf_transformer_query.fit_transform([query])

    for word in tfidf_transformer_query.vocabulary_.keys():
        if word in tfidf_vectorizer.vocabulary_:
            corpus_vocabulary[word]

    tfidf_transformer_query_sec = TfidfVectorizer(vocabulary=corpus_vocabulary)
    query_tfidf_matrix = tfidf_transformer_query_sec.fit_transform([query])

    return clf_svm.predict(query_tfidf_matrix)