In [24]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Default size (width, height) applied to every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = (15,30)

In [25]:
# Peek at the first 10 raw lines of the training file (shell command run via IPython `!`).
! head -10 ../data/classification_train.tsv

120GB Hard Disk Drive with 3 Years Warranty for Lenovo Essential B570 Laptop Notebook HDD Computer - Certified 3 Years Warranty from Seifelden	3950	8
"TOSHIBA SATELLITE L305-S5919 LAPTOP LCD SCREEN 15.4"" WXGA CCFL SINGLE SUBSTITUTE REPLACEMENT LCD SCREEN ONLY. NOT A LAPTOP"	35099	324
Hobby-Ace Pixhawk PX4 RGB External LED Indicator USB Module for Pixhawk Flight Controller	21822	510
Pelicans mousepad	44629	260
P4648-60029 Hewlett-Packard Tc2100 System Board	42835	68
Ectaco EI900 SD Card English - Italian	249	6
Zippered Pocket Black School Laptop Tablet Dual Straps Deluxe Backpack	4342	172
GP Mtr, 3 Ph, ODP, 3 HP, 1800 rpm, 182TC	42618	616
DIGI CM 16 PORT RJ-45 CONSOLE SERVER - Model#: 70001910	37012	248
Lenovo Ibm Thinkpad R52-1859 Laptop AC adapter, power adapter Replacement	2024	313


In [26]:
# Peek at the first 10 raw lines of the blind (test) set file.
! head -10 ../data/classification_blind_set.tsv

HDE Slim Lightweight Auto Sleep Wake Smart Cover Case Stand for iPad Air 5ht Gen Sky Blue 	367
"V7 Black Toner Cartridge For Canon Image Class D1120, D1150, D1170, D1180 120 2617B001aa 5K Yld - Laser - 5000 Page ""Product Category: Print Supplies/Ink/Toner Cartridges"""	625
"Decalrus - Matte Protective Decal Skin skins Sticker for Samsung Galaxy Tab 3 with 10.1"" screen IMPORTANT: Must view ""IDENTIFY"" image for correct model case cover MATGalaxyTAB3_10-304"	250
Vantec Thermal Technologies 89661 Vantec Accessory Nst-d300s3 2.5inch/3.5inch Sata I/ii To Usb3.0 Nexstar Hard Drive Dock Retail	309
Fintie Samsung Galaxy Tab Pro 10.1 Folio Case - Slim Fit Leather Cover for TabPro 10.1-inch Tablet SM-T520/T525 with Auto Sleep/Wake Feature, Giraffe Magenta	71
Canon Powershot / IXUS / ELPH 330 HS USB Cable - Mini USB	390
RML USB Standard Type A 2.0 Male to Micro USB Female Adapter	390
Axiom 10GBASE-ER Xfp Transceiver for Cisco # XFP-10GER-192IR+	530
Supermicro X8STE Server Motherboard I

## Read raw data as lines

In [27]:
# Load the training file as one unparsed text line per row (column `line`).
# The context manager closes the file handle as soon as the lines are read.
with open('../data/classification_train.tsv', encoding='utf8') as train_file:
    raw_train = pd.DataFrame(list(train_file), columns=['line'])

In [28]:
# Load the blind set as one unparsed text line per row (column `line`).
# The context manager closes the file handle as soon as the lines are read.
with open('../data/classification_blind_set.tsv', encoding='utf8') as test_file:
    raw_test = pd.DataFrame(list(test_file), columns=['line'])

## Extract category and brand from raw data

In [29]:
# Split every raw line into title / brand id / category id. A line must end in
# two tab-separated integers; anything else (e.g. a `title\tbid\tcid` header
# row) produces NaN in all three columns and is dropped.
train = raw_train.line.str.extract(r'(.*)\t(\d+)\t(\d+)$',expand=True)
train.columns = ['product_title', 'brand_id', 'category_id']
# `astype` returns new integer columns. Assigning converted values through
# `.loc[:, cols] = ...` writes into the existing object columns instead, which
# can leave the ids stored with dtype `object`.
train = train.dropna().astype({'brand_id': int, 'category_id': int})

In [30]:
# Split every raw blind-set line into title / category id. The id may carry a
# leading minus sign; lines that do not end in a tab followed by an integer
# produce NaN and are dropped.
test = raw_test.line.str.extract(r'(.*)\t(-?\d+)$',expand=True)
test.columns = ['product_title', 'category_id']
# `astype` returns a new integer column. Assigning converted values through
# `.loc[:, cols] = ...` writes into the existing object column instead, which
# can leave the ids stored with dtype `object`.
test = test.dropna().astype({'category_id': int})

In [31]:
# Count the lines of the raw training file, for comparison with the number of parsed rows.
! wc -l ../data/classification_train.tsv

1000000 ../data/classification_train.tsv


In [32]:
# Count the lines of the raw blind-set file, for comparison with the number of parsed rows.
! wc -l ../data/classification_blind_set.tsv

619243 ../data/classification_blind_set.tsv


In [33]:
# (rows, columns) of the parsed training frame and the parsed blind-set frame.
train.shape, test.shape

((999996, 3), (619243, 2))

# Missed rows: lines that did not match the expected format

In [34]:
# Lines that failed to parse, per data set: raw line count minus parsed row
# count. The totals come from the loaded frames rather than numbers copied by
# hand from `wc -l`, so they stay correct if the input files change.
len(raw_train) - len(train), len(raw_test) - len(test)

(4, 0)

In [35]:
# Let text columns show up to 900 characters so long raw lines are not truncated in output.
pd.set_option('display.max_colwidth', 900)

In [36]:
# Show the raw training lines rejected by the extraction step: those that do
# not end in two tab-separated integers. The pattern has no capture groups
# because `str.contains` only tests for a match and warns when groups are
# present; the set of matching lines is the same as with the grouped pattern.
raw_train[~raw_train.line.str.contains(r'.*\t\d+\t\d+$')]

  if __name__ == '__main__':


Unnamed: 0,line
218278,title\tbid\tcid\n
246806,""" 2 Pack Panasonic Compatible KX-FA83 KXFA83 Laser Toner Cartridge, 2,500 Pa\tUnknown\tcomputers & accessories > cables & accessories > printer ink & toner > laser printer toner\n"
458263,"""Brocade VDX 6720 - switch - 16 ports - rack-mountable\tBrocade Communication Systems\tcomputers & accessories > networking products > switches\n"
575503,"""This hub is built strong and ideal for industrial environments. With the StarTech.com ST4200USBM 4-port hub you can wall-mount or install onto a DIN rail for convenient access to the ports. This four port industrial hub can be bus powered or self powered with a three wire terminal block connector 7-24V . Plug in your most demanding next-generation peripherals and still enjoy data transfer speed\tStarTech\tcomputers & accessories > networking products > hubs\n"


## Exploring category and brand cardinalities

In [40]:
# Stack the training and blind-set rows into one frame.
# `test` has no `brand_id` column, so that column is NaN for its rows.
# Both frames keep their original row labels, so index labels repeat in `all_data`.
all_data = pd.concat([train, test])

In [45]:
# Number of distinct category ids across train and blind set, shown as a 1-tuple.
(all_data['category_id'].nunique(),)

(707,)

In [61]:
# Occurrences of each brand id in the training set ...
vc = train['brand_id'].value_counts()
# ... and how many brands appear fewer than 20 times.
vc.loc[vc.lt(20)].shape

(32231,)

# Model Learning

In [203]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk import word_tokenize

In [204]:
class Tokenizer(object):
    """Callable that converts a product title into a list of normalised tokens.

    Processing order for each title:
      1. lower-case the text;
      2. delete every character that is not a letter, digit, whitespace or one
         of the separators ``/ \\ _ , -`` (nothing is inserted in its place);
      3. turn the separators into spaces;
      4. replace standalone digit runs with ``__NUMBER__``;
      5. replace any remaining word containing a digit with ``__SERIAL__``;
      6. split with ``self.tokenizer`` and drop stop words and one-character tokens.
    """

    # Patterns are compiled once at class-definition time and shared by all instances.
    _DROP_CHARS = re.compile(r'[^a-z0-9\s/\\_\t,\-]', flags=re.IGNORECASE)
    _SEPARATORS = re.compile(r'[/\\_\t,-]', flags=re.IGNORECASE)
    # Whole-word digit runs; replaced to reduce the number of features.
    _PURE_NUMBER = re.compile(r'\b[0-9]+\b')
    # Words that still contain a digit: possible product/serial numbers.
    _HAS_DIGITS = re.compile(r'\b\w*\d+\w*\d?\b')

    def __init__(self):
        # Word splitter applied after the text has been cleaned.
        self.tokenizer = word_tokenize
        # Tokens that are always discarded.
        self.stop_words = {
            'is', 'of', 'it', 'at', 'on', 'and', 'as', 'the', 'to', 'are',
            'this', 'that', 'be', 'in', 'an', 'or', 'any', 'all', 'am',
            'you', 'we',
        }

    def __call__(self, text):
        """Return the list of kept tokens for ``text`` (a single title string)."""
        cleaned = self._DROP_CHARS.sub('', text.lower())
        cleaned = self._SEPARATORS.sub(' ', cleaned)
        cleaned = self._PURE_NUMBER.sub(' __NUMBER__ ', cleaned)
        cleaned = self._HAS_DIGITS.sub(' __SERIAL__ ', cleaned)

        kept = []
        for word in self.tokenizer(cleaned):
            if len(word) > 1 and word not in self.stop_words:
                kept.append(word)
        return kept

In [205]:
# Module-level tokenizer instance, tried on one sample title to eyeball how
# numbers, serial-like words and punctuation are normalised.
tokenizer = Tokenizer()
tokenizer("SUN 0066152-0409N08VG5 PULLS FROM SUN FIRE V120, 36GB SCSI 80 PIN ULTRA3 10000 RPM U160 (00661520409N08VG5)")

['sun',
 '__NUMBER__',
 '__SERIAL__',
 'pulls',
 'from',
 'sun',
 'fire',
 '__SERIAL__',
 '__SERIAL__',
 'scsi',
 '__NUMBER__',
 'pin',
 '__SERIAL__',
 '__NUMBER__',
 'rpm',
 '__SERIAL__',
 '__SERIAL__']

In [206]:
# Fit a TF-IDF vectorizer on the training titles using the custom tokenizer; `%time` reports the cost.
# NOTE(review): the following cell rebinds `transformed_data` to the CountVectorizer
# output, so this matrix is lost unless it is used before then — confirm which one is wanted.
vectorizer = TfidfVectorizer(tokenizer=Tokenizer())
%time transformed_data = vectorizer.fit_transform(train['product_title'])

CPU times: user 3min 51s, sys: 947 ms, total: 3min 52s
Wall time: 3min 52s


In [207]:
# Fit a plain term-count vectorizer with the same tokenizer; `%time` reports the cost.
# NOTE(review): this rebinds `transformed_data`, replacing the TF-IDF matrix from the previous cell.
cv = CountVectorizer(tokenizer=Tokenizer())
%time transformed_data = cv.fit_transform(train['product_title'])

CPU times: user 3min 48s, sys: 428 ms, total: 3min 49s
Wall time: 3min 49s


In [208]:
# Number of distinct tokens in the fitted CountVectorizer's vocabulary.
len(cv.vocabulary_)

89278

In [209]:
# Turn the fitted vocabulary mapping into a Series indexed by token.
wordVec = pd.Series(cv.vocabulary_)
# The tokens themselves as a Series, so the `.str` accessor can be used on them.
words = wordVec.index.to_series()

In [210]:
# Character length of every vocabulary token, then only the tokens longer than
# 15 characters: unusually long entries hint at words glued together during cleaning.
words = wordVec.index.to_series()
ln = words.str.len()
ln.loc[ln.gt(15)]

aasdkdkdieoopeuiooque              21
accessoriesaradise                 18
accessoriesbrown                   16
accessoriesnextdiatm               20
acetylgalactosaminyltransferase    31
acetylglucosamine                  17
acetylhexosamine                   16
acetylneuraminate                  17
acetyltransferase                  17
adapterpowersupplyfor              21
adapterreplacement                 18
addoncomputercom                   16
adenosylhomocysteinase             22
adenosylmethionine                 18
adenylosuccinate                   16
adenylyltransferase                19
aggravatingthree                   16
ahurikahinahukurou                 18
allinstallazione                   16
allthingsbasketball                19
aluminumbrzenbook                  17
amidotransferase                   16
aminotransferase                   16
analogpersonality                  17
anesthesiologist                   16
anglebamboometal                   16
antennaimpro

In [223]:
# Find the training titles that produce the odd token 'accessoriesaradise'.
# Step 1 is a cheap, case-insensitive regex pre-filter. `.*` allows any
# characters between the two halves (the tokenizer deletes special characters,
# so the halves need not be adjacent in the raw title); the earlier `s*` only
# allowed repeated "s" and could miss such titles.
temp = train[train.product_title.str.contains("accessories.*aradise", case=False)]
# Step 2 runs the slow tokenizer only on the candidates and keeps exact hits.
temp[temp.product_title.apply(lambda x: 'accessoriesaradise' in tokenizer(x))]

Unnamed: 0,product_title,brand_id,category_id
374680,Accessoriesaradise Ultra Thin Magnetic Blue Smart Cover & Back Clear Blue Case for Apple Ipad Mini New Ipad Mini 2 - Blue - AP2501,16876,71
