In [1]:
from util import print_log, validate_model, sparse_validate_model

In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (15,30)

## Read raw data as lines

In [3]:
# Load every line of the training file into a one-column frame ('line').
# The context manager closes the file handle as soon as the lines are read;
# a bare open() inside the comprehension would leave it open.
with open('../data/classification_train.tsv', encoding='utf8') as train_file:
    raw_train = pd.DataFrame(list(train_file), columns=['line'])

In [4]:
# Load every line of the blind-set file into a one-column frame ('line').
# The context manager closes the file handle as soon as the lines are read;
# a bare open() inside the comprehension would leave it open.
with open('../data/classification_blind_set.tsv', encoding='utf8') as blind_file:
    raw_test = pd.DataFrame(list(blind_file), columns=['line'])

## Extract category and brand from raw data

In [5]:
# Split each raw line from the right: the last two tab-separated numeric
# fields are the brand id and category id, everything before them is the title.
train = raw_train.line.str.extract(r'(.*)\t(\d+)\t(\d+)$', expand=True)
train.columns = ['product_title', 'brand_id', 'category_id']
# Lines that do not match the pattern yield all-NaN rows; discard them.
train = train.dropna()
# Convert the id columns to integers. assign() installs the converted columns
# as new columns; writing through `.loc[:, cols] = ...` sets values into the
# existing object columns and, depending on the pandas version, leaves their
# dtype as object.
train = train.assign(
    brand_id=train['brand_id'].astype(int),
    category_id=train['category_id'].astype(int),
)

In [6]:
# Split each blind-set line from the right: the last tab-separated field is an
# integer category id (a leading minus sign is allowed), the rest is the title.
test = raw_test.line.str.extract(r'(.*)\t(-?\d+)$', expand=True)
test.columns = ['product_title', 'category_id']
# Lines that do not match the pattern yield all-NaN rows; discard them.
test = test.dropna()
# Convert the id column to integers. assign() installs the converted column
# as a new column; writing through `.loc[:, cols] = ...` sets values into the
# existing object column and, depending on the pandas version, leaves its
# dtype as object.
test = test.assign(category_id=test['category_id'].astype(int))

In [7]:
# Count the lines of the raw training file (shell escape) so the number of
# parsed rows can be compared against it.
! wc -l ../data/classification_train.tsv

1000000 ../data/classification_train.tsv


In [8]:
# Count the lines of the raw blind-set file (shell escape) so the number of
# parsed rows can be compared against it.
! wc -l ../data/classification_blind_set.tsv

619243 ../data/classification_blind_set.tsv


In [9]:
# (rows, columns) of the parsed training and blind-set frames.
train.shape, test.shape

((999996, 3), (619243, 2))

# Missed rows: raw lines the extraction pattern did not match

In [10]:
# Number of raw lines that were dropped while parsing, per data set.
# Derived from the raw frames instead of hand-copied line counts, so the
# figures stay correct if the input files change.
raw_train.shape[0] - train.shape[0], raw_test.shape[0] - test.shape[0]

(4, 0)

In [11]:
# Let pandas display up to 900 characters per cell so the long raw lines
# inspected below are not truncated.
pd.options.display.max_colwidth = 900

In [12]:
# Show the raw training lines that the extraction pattern does not match.
# The pattern is the extraction regex without capture groups: str.contains
# only tests for a match, and pandas emits a UserWarning when the pattern
# contains match groups.
raw_train[~raw_train.line.str.contains(r'.*\t\d+\t\d+$')]

  if __name__ == '__main__':


Unnamed: 0,line
218278,title\tbid\tcid\n
246806,""" 2 Pack Panasonic Compatible KX-FA83 KXFA83 Laser Toner Cartridge, 2,500 Pa\tUnknown\tcomputers & accessories > cables & accessories > printer ink & toner > laser printer toner\n"
458263,"""Brocade VDX 6720 - switch - 16 ports - rack-mountable\tBrocade Communication Systems\tcomputers & accessories > networking products > switches\n"
575503,"""This hub is built strong and ideal for industrial environments. With the StarTech.com ST4200USBM 4-port hub you can wall-mount or install onto a DIN rail for convenient access to the ports. This four port industrial hub can be bus powered or self powered with a three wire terminal block connector 7-24V . Plug in your most demanding next-generation peripherals and still enjoy data transfer speed\tStarTech\tcomputers & accessories > networking products > hubs\n"


## Exploring category and brand cardinalities

In [13]:
# Stack the training and blind-set rows so category statistics cover both.
# The blind set has no brand_id column, so concat fills it with NaN there.
all_data = pd.concat([train, test])

In [14]:
# Number of distinct category ids across train and blind set
# (value_counts has one entry per distinct value).
all_data.category_id.value_counts().shape

(707,)

In [15]:
# Training rows per brand, then how many brands have fewer than 20 examples.
vc = train.brand_id.value_counts()
vc[vc<20].shape

(32231,)

# Model Learning

In [16]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk import word_tokenize

In [17]:
class Tokenizer(object):
    """Callable that turns a product title into a short list of tokens.

    Instances are meant to be passed as the ``tokenizer`` of a scikit-learn
    text vectorizer. Calling an instance lower-cases the text, strips
    special characters, replaces numbers and serial-like words with
    placeholder markers, tokenizes, drops stop words and one-character
    tokens, and keeps at most the first two and last two tokens.
    """

    def __init__(self):
        # Underlying word splitter; any callable mapping str -> list of str.
        self.tokenizer = word_tokenize
        # Tokens to discard. The two placeholder markers are listed here, so
        # numbers and serial-like words end up removed from the output.
        self.stop_words = {
            'is', 'of', 'it', 'at', 'on', 'and', 'as', 'the', 'to', 'are',
            'this', 'that', 'be', 'in', 'an', 'or', 'any', 'all', 'am',
            'you', 'we', '__NUMBER__', '__SERIAL__',
        }

    def __call__(self, text):
        """Return the filtered tokens of ``text`` (at most four of them)."""
        cleaned = text.lower()

        # First delete every character outside the allowed set, then turn
        # the remaining separator characters into spaces.
        for pattern, replacement in (
            (r'[^a-z0-9\s/\\_\t,\-]', ''),
            (r'[/\\_\t,-]', ' '),
        ):
            cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)

        # Collapse standalone numbers into one marker to limit the feature count.
        cleaned = re.sub(r'\b[0-9]+\b', ' __NUMBER__ ', cleaned)
        # Collapse words containing digits (likely product/serial numbers).
        cleaned = re.sub(r'\b\w*\d+\w*\d?\b', ' __SERIAL__ ', cleaned)

        kept = []
        for token in self.tokenizer(cleaned):
            if len(token) > 1 and token not in self.stop_words:
                kept.append(token)

        # Short titles are returned whole; longer ones keep only the first
        # two and the last two tokens.
        if len(kept) < 5:
            return kept
        return kept[:2] + kept[-2:]

In [18]:
# Fit a TF-IDF model on the training titles using the custom Tokenizer and
# keep the resulting sparse document-term matrix.
# print_log comes from util; judging by the recorded output it prints a
# timestamped message, so the two calls bracket the fit duration.
vectorizer = TfidfVectorizer(tokenizer=Tokenizer())
print_log("starting vectorizer fit_transform")
sparse_title = vectorizer.fit_transform(train['product_title'])
print_log("completed vectorizer fit_transform")

2016-04-02 20:11:38,672511	starting vectorizer fit_transform
2016-04-02 20:17:10,986182	completed vectorizer fit_transform


In [19]:
# Size of the vocabulary learned by the TF-IDF vectorizer.
print("distinct words found", len(vectorizer.vocabulary_))

distinct words found 62778


Build a sparse feature matrix from the vectorized title tokens and the one-hot encoded category_id,  
then learn a model on top of it.

In [20]:
from sklearn.feature_extraction import DictVectorizer

In [21]:
# One-hot encode category_id: each row becomes a single-entry dict
# {category_id_as_string: 1}, which DictVectorizer turns into a sparse
# indicator matrix with one column per distinct key.
temp_vectorizer = DictVectorizer()
print_log("starting sparse category")
sparse_category = temp_vectorizer.fit_transform(train.category_id.astype(str).apply(lambda x: {x: 1}))
print_log("completed sparse category")

2016-04-02 20:17:11,209707	starting sparse category
2016-04-02 20:17:15,249044	completed sparse category


In [22]:
# Sanity check: compare the indicator matrix shape with the number of
# training rows and the number of distinct categories.
sparse_category.shape, (train.category_id.shape, train.category_id.nunique())

((999996, 609), ((999996,), 609))

In [23]:
from scipy.sparse import hstack

In [24]:
# Place the category indicator columns and the TF-IDF title columns side by
# side in one CSR feature matrix (category columns first).
joined_data = hstack([sparse_category, sparse_title], format='csr')

In [25]:
# Shape of the combined feature matrix: same rows as the inputs, columns of
# both input matrices added together.
joined_data.shape

(999996, 63387)

Model validation
--

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import ShuffleSplit

In [None]:
# Validate logistic regression for brand prediction on three random splits
# that each hold out 40% of the rows for testing.
# random_state pins the shuffling so repeated runs evaluate the same splits
# and produce comparable scores.
# NOTE(review): sparse_validate_model comes from util; it is presumably
# given the label vector and iterates over the returned splitter - confirm.
# NOTE(review): sklearn.cross_validation was removed in later scikit-learn
# releases; sklearn.model_selection.ShuffleSplit has a different signature
# (n_splits, .split(X)), so migrating also requires checking util.
print_log("starting validation")
sparse_validate_model(X=joined_data, Y=train.brand_id.astype(str), classifier=LogisticRegression(), 
               split_generator=lambda Y: ShuffleSplit(n=Y.shape[0], n_iter=3, test_size=0.4, random_state=42))
print_log("completed validation")

2016-04-02 20:17:16,079569	starting validation
2016-04-02 20:17:17,595724	starting iteration 1
