In [1]:
from util import print_log, validate_model, sparse_validate_model

In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (15,30)

## Read raw data as lines

In [3]:
raw_train = pd.DataFrame([line for line in open('../data/classification_train.tsv',encoding='utf8')],columns=['line'])

In [4]:
raw_test = pd.DataFrame([line for line in open('../data/classification_blind_set_corrected.tsv',encoding='utf8')],columns=['line'])

## Extract category and brand from raw data

In [5]:
train = raw_train.line.str.extract(r'(.*)\t(\d+)\t(\d+)$',expand=True)
train.columns = ['product_title', 'brand_id', 'category_id']
train = train.dropna()
train.loc[:, ['brand_id', 'category_id']] = train.loc[:, ['brand_id', 'category_id']].astype(int)

In [6]:
test = raw_test.line.str.extract(r'(.*)\t(-?\d+)$',expand=True)
test.columns = ['product_title', 'category_id']
test = test.dropna()
test.loc[:, ['category_id']] = test.loc[:, ['category_id']].astype(int)

In [7]:
! wc -l ../data/classification_train.tsv

1000000 ../data/classification_train.tsv


In [8]:
! wc -l ../data/classification_blind_set.tsv

wc: ../data/classification_blind_set.tsv: No such file or directory


In [9]:
train.shape, test.shape

((999996, 3), (619240, 2))

# missed rows

In [10]:
1000000 - train.shape[0], 619243 -  test.shape[0]

(4, 3)

In [11]:
pd.options.display.max_colwidth = 900

# Model Learning

In [12]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk import word_tokenize

In [13]:
class Tokenizer(object):
    def __init__(self):
        self.tokenizer = word_tokenize
        self.stop_words = set(
        ['is', 'of', 'it', 'at', 'on', 'and', 'as', 'the', 'to', 'are', 'this', 'that', 'be', 'in',
          'an', 'or','any', 'all', 'am','you','we', '__NUMBER__', '__SERIAL__'])

    def __call__(self, text):
        text = text.lower()
        # replace special characters
        text = re.sub(r'[^a-z0-9\s/\\_\t,\-]', '', text,flags=re.IGNORECASE)
        text = re.sub(r'[/\\_\t,-]', ' ', text,flags=re.IGNORECASE)
        # replace numbers to reduce number of features
        text = re.sub(r'\b[0-9]+\b', ' __NUMBER__ ', text) 
        # replace possible product/serial numbers
        text = re.sub(r'\b\w*\d+\w*\d?\b', ' __SERIAL__ ', text)
                
        tokens = [w for w in self.tokenizer(text) if (w not in self.stop_words and len(w)>1)]
        # only return first and last two tokens
        return tokens if len(tokens) <5 else tokens[:2] + tokens[-2:]

In [77]:
class TokenizerV2(object):
    def __init__(self):
        self.tokenizer = word_tokenize
        self.stop_words = set(
        ['is', 'of', 'it', 'at', 'on', 'and', 'as', 'the', 'to', 'are', 'this', 'that', 'be', 'in',
          'an', 'or','any', 'all', 'am','you','we', '__NUMBER__', '__SERIAL__'])

    def __call__(self, text):
        text = text.lower()
        # replace special characters
        text = re.sub(r'[^a-z0-9\s/\\_\t,\-]', '', text,flags=re.IGNORECASE)
        text = re.sub(r'[/\\_\t,-]', ' ', text,flags=re.IGNORECASE)
        # replace numbers to reduce number of features
        text = re.sub(r'\b[0-9]+\b', ' __NUMBER__ ', text) 
        # replace possible product/serial numbers
        text = re.sub(r'\b\w*\d+\w*\d?\b', ' __SERIAL__ ', text)
                
        tokens = [w for w in self.tokenizer(text) if (w not in self.stop_words and len(w)>1)]
        # only return first and last two tokens
        return tokens

In [78]:
vectorizer = TfidfVectorizer(tokenizer=TokenizerV2())
print_log("starting vectorizer fit_transform")
sparse_title = vectorizer.fit_transform(train['product_title'])
print_log("completed vectorizer fit_transform")

2016-04-03 09:13:14,836160	starting vectorizer fit_transform
2016-04-03 09:17:30,062001	completed vectorizer fit_transform


In [79]:
print("distinct words found", len(vectorizer.vocabulary_))

distinct words found 89276


To build sparse matrix from vectorized tokens and category_id  
learn model on top of this

In [80]:
from sklearn.feature_extraction import DictVectorizer

In [81]:
category_dict_vectorizer = DictVectorizer()
print_log("starting sparse category")
sparse_category = category_dict_vectorizer.fit_transform(train.category_id.astype(str).apply(lambda x: {x: 1}))
print_log("completed sparse category")

2016-04-03 09:17:30,282636	starting sparse category
2016-04-03 09:17:33,031100	completed sparse category


In [82]:
sparse_category.shape, (train.category_id.shape, train.category_id.nunique())

((999996, 609), ((999996,), 609))

In [83]:
from scipy.sparse import hstack

In [84]:
joined_data = hstack([sparse_category, sparse_title], format='csr')

In [85]:
joined_data.shape

(999996, 89885)

Saving file

In [86]:
from scipy.io import mmwrite, mmread
mmwrite('joined_data.mtx', joined_data)
mmwrite('sparse_category.mtx', sparse_category)
mmwrite('sparse_title.mtx', sparse_title)

In [87]:
# joined_data = mmread('joined_data.mtx')
# sparse_category = mmread('sparse_category')
# sparse_title = mmread('sparse_title')

## Similar Products

In [88]:
from scipy.sparse import csr_matrix,vstack

In [89]:
%%time
brands = train['brand_id'].values
unique_brands = np.unique(brands)
joined_data_grouped = csr_matrix(np.zeros((1,joined_data.shape[1])))
for brand in unique_brands:
    grp_sum = joined_data[brands == brand].sum(axis=0)
    joined_data_grouped = vstack([joined_data_grouped,grp_sum])

CPU times: user 21min 18s, sys: 1.33 s, total: 21min 20s
Wall time: 21min 21s


In [90]:
joined_data_grouped = csr_matrix(joined_data_grouped)
joined_data_grouped = joined_data_grouped[1:]

In [91]:
from sklearn.neighbors import LSHForest
lshf = LSHForest(random_state=42)
lshf.fit(joined_data_grouped)  

LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10, n_neighbors=5,
     radius=1.0, radius_cutoff_ratio=0.9, random_state=42)

In [92]:
mmwrite('joined_data_grouped.mtx', joined_data_grouped)

In [93]:
%time distances, indices = lshf.kneighbors(joined_data_grouped, n_neighbors=5)

CPU times: user 8min 49s, sys: 611 ms, total: 8min 49s
Wall time: 8min 50s


In [94]:
dist_neighbors = np.hstack([distances,unique_brands[indices]])

In [95]:
dist_neighbors_df = pd.DataFrame(dist_neighbors,columns=['d1','d2','d3','d4','d5','brand_1',
                                                         'brand_2','brand_3','brand_4','brand_5'])

In [96]:
dist_neighbors_df = dist_neighbors_df.set_index('brand_1')

In [97]:
del dist_neighbors_df['d1']

In [98]:
dist_neighbors_df.to_csv('similar_brands_2.csv')

In [99]:
idx = dist_neighbors_df.iloc[0].select(lambda x:'brand' in x)

In [103]:
train[train['brand_id'].isin(idx)]['product_title'].apply(tokenize)

14804                                                                                                                                                 [high, power, hpc, df, watt, dual, fan, atx, power, supply]
28555                                                                                                                                              [highpower, astro, apt, digital, platinum, power, supply, hpa]
29475                                                                                                                                                [power, one, power, supply, ac, dc, openframe, medical, mbc]
52403                                                                                                                                               [power, supply, sdc, switching, dc, dc, converter, vdc, amps]
58856                                                                                                                                           [dynapower, tc, 

In [101]:
tokenize = TokenizerV2()

In [121]:
dist_neighbors_df = dist_neighbors_df.reset_index()

In [124]:
dist_neighbors_df_meleted = pd.melt(dist_neighbors_df,id_vars=['brand_1'],value_vars=['brand_2','brand_3','brand_4','brand_5'])

In [125]:
dist_neighbors_dist_meleted = pd.melt(dist_neighbors_df,id_vars=['brand_1'],value_vars=['d2','d3','d4','d5'])

In [130]:
knn_df = pd.merge(dist_neighbors_df_meleted,dist_neighbors_dist_meleted,how='inner',on='brand_1')

In [133]:
knn_df = knn_df.sort_values(by=['brand_1','value_y'],ascending=True)

In [146]:
zero_dist_neighbors = knn_df[knn_df['value_y'].astype(float)==0]
zero_dist_neighbors = zero_dist_neighbors.drop_duplicates(subset=['brand_1','value_x'])
zero_dist_neighbors.brand_1.unique()

array([  621,  1093,  2394,  4479,  4593,  5114,  5205,  7461,  7839,
        7851,  8538,  8675,  8703,  8735, 10920, 11551, 11622, 12149,
       13274, 13662, 14449, 15488, 16145, 16979, 18767, 18784, 18889,
       19132, 19186, 19542, 20096, 20738, 23188, 24053, 24489, 25108,
       25423, 25500, 25974, 27327, 27603, 27752, 28089, 29639, 29990,
       30415, 30913, 31460, 32426, 32580, 32937, 33418, 36092, 36580,
       36641, 38277, 38488, 38723, 39639, 40115, 40884, 42004, 42392,
       43813, 44491, 44788, 44904, 44920, 44924])

In [147]:
idx = zero_dist_neighbors.query('brand_1 == 1093')['value_x'].astype(int)

In [148]:
train[train.brand_id.isin(idx)]

Unnamed: 0,product_title,brand_id,category_id
145076,HP COMPAQ 380724-001 Laptop Screen 17 LCD CCFL WXGA 1440x900,37194,415
420227,HP Compaq 6730s Laptop Screen 15.4 LCD CCFL WXGA 1280x800,13522,415
511605,Compaq Presario CQ61-216EL Laptop Screen 15.6 LCD CCFL WXGA HD 1366x768,17888,415
525535,HP Compaq 6531s Laptop Screen 14.1 LCD CCFL WXGA 1280x800,44804,415


In [149]:
zero_dist_neighbors.brand_1.nunique()

69

In [160]:
possible_duplicates = zero_dist_neighbors[['brand_1','value_x']]
possible_duplicates.columns = ['brand_id','duplicate_brand_id']
possible_duplicates.to_csv('possible_brand_duplicates.csv',index=False)

In [165]:
main_product_title = pd.merge(possible_duplicates,train,on='brand_id')[['brand_id','duplicate_brand_id','product_title']]
main_product_title.columns = ['main_brand_id','duplicate_brand_id','main_product_title']

In [176]:
dup_product_title = pd.merge(main_product_title,train,right_on='brand_id',left_on='duplicate_brand_id',how='inner')

Unnamed: 0,main_brand_id,duplicate_brand_id,main_product_title,product_title,brand_id,category_id
0,621,33893,USBLT2MW 2m Lightning to USB Cable,"""USBLT15CMW 6"""" Lightning to USB Cable""",33893,390
1,621,33893,USBLT2MW 2m Lightning to USB Cable,"""USBLT15CMW 6"""" Lightning to USB Cable""",33893,390
2,621,33893,USBLT2MW 2m Lightning to USB Cable,"""USBLT15CMW 6"""" Lightning to USB Cable""",33893,390
3,621,8920,USBLT2MW 2m Lightning to USB Cable,Lightning to USB Cable,8920,390
4,621,8920,USBLT2MW 2m Lightning to USB Cable,Lightning to USB Cable,8920,390
5,621,8920,USBLT2MW 2m Lightning to USB Cable,Lightning to USB Cable,8920,390
6,621,23252,USBLT2MW 2m Lightning to USB Cable,lightning to usb cable,23252,390
7,621,23252,USBLT2MW 2m Lightning to USB Cable,lightning to usb cable,23252,390
8,621,23252,USBLT2MW 2m Lightning to USB Cable,lightning to usb cable,23252,390
9,621,3285,USBLT2MW 2m Lightning to USB Cable,Lightning to USB Cable,3285,390


In [177]:
dup_product_title = dup_product_title[['brand_id','duplicate_brand_id',
                                                                                'main_product_title',
                                                                               'product_title']]
dup_product_title.columns = ['brand_id','duplicate_brand_id','main_product_title','dup_product_title']

In [179]:
dup_product_title.to_csv('possible_duplicates.csv',index=False)

In [180]:
dup_product_title

Unnamed: 0,brand_id,duplicate_brand_id,main_product_title,dup_product_title
0,33893,33893,USBLT2MW 2m Lightning to USB Cable,"""USBLT15CMW 6"""" Lightning to USB Cable"""
1,33893,33893,USBLT2MW 2m Lightning to USB Cable,"""USBLT15CMW 6"""" Lightning to USB Cable"""
2,33893,33893,USBLT2MW 2m Lightning to USB Cable,"""USBLT15CMW 6"""" Lightning to USB Cable"""
3,8920,8920,USBLT2MW 2m Lightning to USB Cable,Lightning to USB Cable
4,8920,8920,USBLT2MW 2m Lightning to USB Cable,Lightning to USB Cable
5,8920,8920,USBLT2MW 2m Lightning to USB Cable,Lightning to USB Cable
6,23252,23252,USBLT2MW 2m Lightning to USB Cable,lightning to usb cable
7,23252,23252,USBLT2MW 2m Lightning to USB Cable,lightning to usb cable
8,23252,23252,USBLT2MW 2m Lightning to USB Cable,lightning to usb cable
9,3285,3285,USBLT2MW 2m Lightning to USB Cable,Lightning to USB Cable
