In [1]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
import numpy as np
import pandas as pd
import re
get_ipython().magic('matplotlib inline')



In [2]:
from pygoose import *

In [3]:
project = kg.Project.discover()

In [29]:
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('0')
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('0')

In [30]:
df_train.price = df_train.price.fillna(0)
df_test.price = df_test.price.fillna(0)

In [31]:
df_train.head()

Unnamed: 0,lvl1,lvl2,titles,descriptions,price
0,110,267,Seeking Vaccancy In A Mnufacturing Company,"when working with people one need to be kind, sincere, and loyal in all that he is doing.",0.0
1,5,55,Slippers For Men,its made of good qualities and good for all men's casuals,3000.0
2,27,257,Afro By Nature Hydrating Leave In Conditioner,"Dry, brittle hair? Damaged hair or split ends? No problem. \r\nThis product is formulated to restore moisture into the hair. it also repairs the hair and leaves it full, shiny and bouncy. Suitable for all hair texture. Recommended for both natural and relaxed hair.",2500.0
3,5,168,Porshe Design Wristwatches,Porshe design new wristwatch is now available at my store,175000.0
4,3,17,"Brand New Samsung 20"" 20J4003 TV LED - Black","KEY FEATURES\r\nBrand: Samsung\r\nModel: 20J4003\r\nDesign: LED\r\nVideo: 23.6"" Measured Diagonally\r\nWireless Connectivity: YES\r\nInputs & Outputs: HDMI, USB\r\nDimensions (W X H X D): 22.1"" x 13.7"" x 1.9""\r\nPower: AC110-120V 60Hz\r\nProduct warranty: 2years warranty.\r\nTo place your order,chat me up.\r\nDelivery available nationwide with discount. Also available in bulk.",38000.0


### Preprocess data and generate features

In [32]:
def clean(s):
    return " ".join(re.findall(r'\w+', s,flags = re.UNICODE | re.LOCALE)).lower()

def to_vw_format(description, title, price, label=None):
    title = clean(str(title).lower())
    description = clean(str(description).lower())
    return str(label or '') + ' |title ' + ' '.join(re.findall('\w{3,}', title)) + ' description '  + ' '.join(re.findall('\w{3,}', description)) + ' word_count:' + str(description.count(' ')+1)+ ' price:' + str(price) + '\n'

In [33]:
from sklearn import preprocessing
topic_encoder = preprocessing.LabelEncoder()
y_train_encoded = topic_encoder.fit_transform(df_train.lvl1) + 1
y_test_encoded = topic_encoder.fit_transform(df_test.lvl1) + 1

In [51]:
topic_encoder.classes_

array([  1,   3,   4,   5,   6,   7,   8,   9,  27,  40,  47,  59, 110, 140])

In [34]:
%time

with open('./vw_lvl1/train.vw', 'w') as vw_train_data:
    for description, title, price, target in zip(df_train.descriptions, df_train.titles, df_train.price, y_train_encoded):
        vw_train_data.write(to_vw_format(description, title, price, target))
        
with open('./vw_lvl1/test.vw', 'w') as vw_test_data:
    for description, title, price in zip(df_test.descriptions, df_test.titles, df_test.price):
        vw_test_data.write(to_vw_format(description, title, price))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.01 µs


### Train model

In [35]:
from vowpalwabbit.sklearn_vw import VW

In [36]:
model = VW(
    convert_to_vw=False, loss_function='logistic', oaa=14, probabilities=True
)

In [37]:
%time
train = pd.read_csv('vw_lvl1' + '/train.vw', header=None, names=['data'])

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.1 µs


In [41]:
%time
test = pd.read_csv('vw_lvl1' + '/test.vw', header=None, names=['data'])

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 12.2 µs


In [38]:
train.head()

Unnamed: 0,data
0,13 |title seeking vaccancy mnufacturing company description when working with people one need kind sincere and loyal all that doing word_count:18 price:0
1,4 |title slippers for men description its made good qualities and good for all men casuals word_count:12 price:3000.0
2,9 |title afro nature hydrating leave conditioner description dry brittle hair damaged hair split ends problem this product formulated restore moisture into the hair also repairs the hair and leaves full shiny and bouncy suitable for all hair texture recommended for both natural and relaxed hair word_count:44 price:2500.0
3,4 |title porshe design wristwatches description porshe design new wristwatch now available store word_count:10 price:175000.0
4,2 |title brand new samsung 20j4003 led black description key features brand samsung model 20j4003 design led video measured diagonally wireless connectivity yes inputs outputs hdmi usb dimensions power ac110 120v 60hz product warranty 2years warranty place your order chat delivery available nationwide with discount also available bulk word_count:58 price:38000.0


In [39]:
%time
model.fit(train.data)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


{'probabilities': True, 'oaa': 14, 'quiet': True, 'loss_function': 'logistic'}

In [42]:
predict_proba = model.predict(test.data)

In [101]:
np.save('../data/features/VW_lvl1_proba_test',predict_proba)

In [89]:
def get_prediction_with_precision(classes, predict_proba, cat_num=3, with_proba=False):
    predictions_proba = predict_proba
    return [
        [
            (
                classes[pos], proba[pos]
            )
            for pos, proba in sorted(
                enumerate(predictions),
                key=lambda arg: arg[1], reverse=True
            )[:cat_num]
        ]
        if with_proba is False
        else
        [
            (
                classes[pos]
            )
            for pos, proba in sorted(
                enumerate(predictions),
                key=lambda arg: arg[1], reverse=True
            )[:cat_num]
        ]
        for i, predictions in enumerate(predictions_proba)
    ]

In [90]:
classes = topic_encoder.inverse_transform(np.array(range(0, 14, 1)))

In [92]:
predicted = get_prediction_with_precision(classes, predict_proba, 1, True)

In [93]:
predicted = [item for sublist in predicted for item in sublist]

In [96]:
print classification_report(list(df_test.lvl1), predicted)

             precision    recall  f1-score   support

          1       0.97      0.96      0.96     15503
          3       0.93      0.94      0.93     15483
          4       0.90      0.93      0.91     21531
          5       0.98      0.99      0.98     56622
          6       0.99      0.98      0.99     24020
          7       0.93      0.87      0.90       882
          8       0.85      0.75      0.80      2660
          9       0.99      0.99      0.99     16454
         27       0.94      0.95      0.95     12137
         40       0.79      0.67      0.72       239
         47       0.99      0.96      0.97     28935
         59       0.89      0.78      0.83      3649
        110       0.92      0.98      0.95     11497
        140       0.80      0.74      0.77      7063

avg / total       0.95      0.95      0.95    216675



In [103]:
M = confusion_matrix(list(df_test.lvl1), predicted)

In [104]:
print(M)

[[14858   403    68    95     8     2    11     0    10     0     0    13
      6    29]
 [  260 14542   372    85    26     4    45     6    19     0     4     3
     24    93]
 [   50   273 20040   167    73    24    52    32   202     1    20    52
     50   495]
 [   19    49   162 55873     4     1    49     6   107    11     8   206
     37    90]
 [   35    59   186    39 23572     0    11    13    20     1     4     7
     13    60]
 [    1     3    60    13     1   765     2     9     9     0     0     1
      8    10]
 [    7   111   169   132    16     2  1988     0    44     9    13    25
     31   113]
 [    2     4    42     5     8     1     1 16352     1     0     7     2
      7    22]
 [   11    12   267   152     4     1    32     2 11500    17     7    19
     15    98]
 [    0     0    13     4     1     0    33     0    27   159     0     0
      0     2]
 [    0    19    43    22     8     2    13    19    26     0 27855     3
    675   250]
 [   23    15   140  

In [126]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [127]:
def f(j):
    for i in np.where(M[j,:] > 0)[0]:
        print(classes[i], M[j,i])

In [129]:
interact(f, j=(0, 13));