In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt



import gc
gc.enable()

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score as bas
from sklearn.ensemble import BaggingClassifier

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import NearestCentroid
from sklearn.preprocessing import StandardScaler, FunctionTransformer, MinMaxScaler, MaxAbsScaler

from sklearn.ensemble import RandomForestClassifier
from nltk.stem import SnowballStemmer


<hr>

In [2]:
# data = pd.read_csv('../data-simplified-1-reduced-wordbal-800.csv')
data = pd.read_csv('../data-reduced-800-v2-shuffled.csv', index_col = 0)

  mask |= (ar1 == a)


In [3]:
test = pd.read_csv('../test.csv')

In [4]:
catcode = pd.read_csv('../data-simplified-1-catcode.csv', header = None, names = ['category'])['category']

In [5]:
catcode.to_dict()

{0: '3D_GLASSES',
 1: '3D_PENS',
 2: '3D_PRINTERS',
 3: '3D_PRINTER_FILAMENTS',
 4: 'ABDOMINAL_TONING_BELTS',
 5: 'ABS_SENSORS',
 6: 'AB_ROLLER_WHEELS',
 7: 'ACCORDIONS',
 8: 'ACOUSTIC_GUITARS',
 9: 'ACOUSTIC_PANELS',
 10: 'ACTION_CAMERA_MOUNTS',
 11: 'ACTION_FIGURES',
 12: 'ADHESIVE_TAPES',
 13: 'AEROBIC_CRUNCH_MACHINES',
 14: 'AFTERSHAVES',
 15: 'AGOGOS',
 16: 'AIRBAGS',
 17: 'AIRBAG_MODULES',
 18: 'AIRBRUSHES',
 19: 'AIRGUN_PELLETS',
 20: 'AIRSOFT_GUNS',
 21: 'AIR_COMPRESSORS',
 22: 'AIR_CONDITIONERS',
 23: 'AIR_CONDITIONER_REMOTE_CONTROLS',
 24: 'AIR_FRESHENERS',
 25: 'AIR_MATTRESSES',
 26: 'ALARMS_AND_SENSORS',
 27: 'ALARM_CLOCKS',
 28: 'ALL_IN_ONE',
 29: 'ALTERNATORS',
 30: 'ALTERNATOR_PULLEYS',
 31: 'AM_FM_RADIOS',
 32: 'ANALOG_CAMERAS',
 33: 'ANGLE_CLAMPS',
 34: 'ANIMAL_AND_BALL_HOPPERS',
 35: 'ANIMAL_CLIPPERS',
 36: 'ANTIQUE_CHAIRS',
 37: 'ANTIQUE_TAPE_RECORDERS',
 38: 'ANTIQUE_TOY_CARS',
 39: 'ANTIVIRUS_AND_INTERNET_SECURITY',
 40: 'ANTI_STATIC_PLIERS',
 41: 'ANTI_THEFT_STUDS

In [6]:
data.head()

Unnamed: 0,title,label_quality,language,category,priorities
14106282,Engranaje Bomba Inyectora Dbr Peugeot 205 306...,1,0,807,51
6991524,Luva Térmica Mão De Gato 53cm Therm Soft,1,1,1239,0
19616569,Chave Biela Starfer C/furo 17,0,1,1581,61
12046658,Bap N&ordm;3 1200 Mm C Parafuso Nz Com 10 Unid...,1,1,1064,4
15112239,Soporte Arkon Cargador Auto P/ Celular Galaxy ...,1,0,65,1


<hr>

In [12]:
def normalize(curr):
    # remove accent
    curr = curr.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    # to lower case
    curr = curr.str.lower()
    # remove not alphanumerics or . ,
    curr = curr.str.replace('[^a-zA-Z0-9.,]', ' ')
    
    # let , and . be the same char
    curr = curr.str.replace('[.]', ',')
    
    # remove . , not between numbers
    curr = curr.str.replace('(?<=[0-9])[,]+(?=[0-9])', '.')
    curr = curr.str.replace('[,]', ' ')
    
    # set all digits to 0
    curr = curr.str.replace('[0-9]', '0')
    
    # separate ' <digits><letters ' like in 22g or 12ms
    # curr = curr.str.replace('(^| )([0-9]+)([a-zA-Z]+)($| )', r'\1\2 \3\4')
    
    # remove some Pt plurals
    curr = curr.str.replace('([a-zA-Z]+[aeiou])(s)', r'\1')
    
    # Other ideas: 
    
    return curr

In [13]:
sp = int(len(data) * 0.8) # Split Point

In [14]:
full = pd.concat([data[['title']], test[['title']]])

In [15]:
X_full = full.title

In [16]:
%%time 
X_full = normalize(X_full)

CPU times: user 18.1 s, sys: 223 ms, total: 18.3 s
Wall time: 18.3 s


In [17]:
%%time 
wordfreq = X_full.str.split(expand=True).stack().value_counts().to_dict()

CPU times: user 16 s, sys: 672 ms, total: 16.7 s
Wall time: 14.1 s


In [18]:
%%time 
uniquewords = {w for w, f in wordfreq.items() if f == 1}

CPU times: user 35.8 ms, sys: 0 ns, total: 35.8 ms
Wall time: 35.4 ms


In [19]:
xjoin = lambda s : ' '.join([w if w not in uniquewords else 'XXUXX' for w in s ])

In [20]:
%%time
X_full = X_full.str.split().apply(xjoin)

CPU times: user 4.29 s, sys: 7.8 ms, total: 4.29 s
Wall time: 4.29 s


In [21]:
replaces = {
    # da, do , de
    'da' : 'de',
    'do' : 'de',
    
    # en, em
    'en' : 'em',
    
    # con, com
    'con' : 'com',
    
    # sin, sem
    'sin' : 'sem',
    
    # kit, set, conjunto, combo ?, pack ?
    'set' : 'kit',
    'conjunto' : 'kit',

    # unidade, peca
    'unidade' : 'peca',
    
    # envio, frete
    'envio' : 'frete',
}

In [22]:
%%time
xrepl = lambda s : ' '.join([w if w not in replaces else replaces[w] for w in s ])
X_full = X_full.str.split().apply(xrepl)

CPU times: user 4.17 s, sys: 12 ms, total: 4.18 s
Wall time: 4.18 s


In [24]:
preps = {
    'da', 'de', 'do', 'en', 'em', 'con', 'com', 'sin', 'sem'
}

In [25]:
%%time
xremo = lambda s : ' '.join([w for w in s if w not in preps])
X_full_1gram = X_full.str.split().apply(xremo)

CPU times: user 4.44 s, sys: 20 ms, total: 4.46 s
Wall time: 4.46 s


In [26]:
%%time

covec_1gram = CountVectorizer(strip_accents='ascii', binary = True, min_df= 2,
                             ngram_range=(1,1),)
X_covec_1gram = covec_1gram.fit_transform(X_full_1gram)
print(X_covec_1gram.shape, X_covec_1gram.count_nonzero())

(1600475, 127024) 10888883
CPU times: user 11.7 s, sys: 185 ms, total: 11.9 s
Wall time: 11.9 s


In [27]:
docfreq1 = np.array(X_covec_1gram.sum(axis = 0)).flatten() / X_covec_1gram.shape[0]
inv_vocab1 = {v : k for k,v in covec_1gram.vocabulary_.items()}

In [28]:
np.vectorize(inv_vocab1.get)(np.argsort(docfreq1)[-200:])

array(['banco', 'bicicleta', 'al', 'bandeja', 'chevrolet', 'chave',
       'control', 'silla', 'nueva', '00g', 'carro', 'bluetooth',
       'electrico', 'vermelho', 'branca', 'unid', 'cor', 'couro', 'tubo',
       'anti', 'eletrico', 'painel', 'camera', 'radio', 'box', 'bivolt',
       'freio', 'natural', 'adeivo', 'vidrio', 'soporte', '000000', 'cae',
       'estado', 'entrega', 'controle', 'tapa', 'cable', 'lote',
       'cadeira', 'picina', 'ate', 'un', 'dico', '0v', '0gb', 'universal',
       'pc', 'profeional', 'lente', 'auto', '00w', 'funda', 'samsung',
       'papel', 'parede', 'manual', 'renault', 'hp', 'cinta', 'gol',
       'cama', 'camara', 'peugeot', 'mts', 'brinde', 'sony', 'uso', 'ga',
       'alta', 'hd', 'premium', 'doble', 'aire', 'notebook', 'macara',
       '000mm', 'blanco', 'marca', '00kg', 'ano', 'gel', 'plu', 'color',
       'protetor', 'metal', '00m', 'eletrica', '0kg', 'core', 'x0', 'dvd',
       'black', 'tela', '0m', 'a0', 'tampa', 'combo', 'el', 'ml', 'honda

In [29]:
np.sort(docfreq1)[-10:]

array([0.01711429, 0.02002655, 0.02460207, 0.0267833 , 0.06439713,
       0.07024665, 0.08150205, 0.09317421, 0.10752495, 0.19256596])

In [30]:
X_full_2gram = 'XXSXX ' + X_full + ' XXEXX'

In [31]:
%%time

covec_2gram = CountVectorizer(strip_accents='ascii', binary = True, min_df= 2,
                             ngram_range=(2,2),)
X_covec_2gram = covec_2gram.fit_transform(X_full_2gram)
print(X_covec_2gram.shape, X_covec_2gram.count_nonzero())

(1600475, 1134587) 11297423
CPU times: user 34.2 s, sys: 404 ms, total: 34.6 s
Wall time: 34 s


In [32]:
docfreq2 = np.array(X_covec_2gram.sum(axis = 0)).flatten() / X_covec_2gram.shape[0]
inv_vocab2 = {v : k for k,v in covec_2gram.vocabulary_.items()}

In [33]:
np.vectorize(inv_vocab2.get)(np.argsort(docfreq2)[-200:])

array(['verde xxexx', 'audi a0', '00 lts', 'xxsxx torneira', 'xxuxx de',
       'controle remoto', 'xxsxx carregador', 'xxsxx aceite', 'mts xxexx',
       'bola de', 'de vidrio', '000 litro', 'core xxexx', 'protetor de',
       'par de', 'xxsxx chaleco', 'xxsxx tanque', 'xxsxx torno',
       'xxsxx bicicleta', 'xxsxx saco', 'moto xxexx', 'control remoto',
       'xxsxx detector', 'ar condicionado', 'xxsxx radio', '00 unid',
       '00m xxexx', 'xxsxx fita', 'xxsxx controle', 'xxsxx reloj',
       'de luz', 'xxsxx lampara', 'xxsxx tapa', 'nueva xxexx',
       'xxsxx camara', 'tampa de', 'xxsxx bae', 'xxsxx camia',
       '00x00 xxexx', 'xxsxx antena', 'xxsxx sillon', 'xxsxx aparelho',
       'de xxuxx', 'xxsxx balanca', 'xxsxx alicate', 'roa xxexx',
       'garantia xxexx', 'xxsxx cable', 'xxsxx mangueira', 'xxsxx pedal',
       '000 mm', 'xxsxx banco', 'xxsxx bota', 'mm xxexx', '0kg xxexx',
       '0000 original', 'sem fio', 'negro xxexx', 'usado xxexx',
       'xxsxx oculo', '00000000

In [34]:
np.sort(docfreq2)[-10:]

array([0.01139787, 0.01217389, 0.01220325, 0.01602899, 0.01832831,
       0.02006967, 0.02754807, 0.03058405, 0.03732329, 0.05408957])

In [35]:
X_train_counts = sparse.hstack([X_covec_1gram, X_covec_2gram], format = 'csr')
print(X_train_counts.shape, X_train_counts.count_nonzero())

(1600475, 1261611) 22186306


In [36]:
def sbc(x):
    # sparse binary correlation; x : sparse
    # can't correlate zero columns
    cx = sparse.triu(x.T*x, k = 1, format='coo')
    # print(cx.todense())
    card = np.array(x.sum(axis = 0)).flatten()
    # print(card)
    cx.data = cx.data / (card[cx.row] + card[cx.col] - cx.data)
    # print(cx.todense())
    return np.array((cx == 1).sum(axis = 0) > 0).flatten()

In [38]:
%%time
rem = sbc(X_train_counts)
print(rem.mean())

0.08673117149422445
CPU times: user 19.3 s, sys: 2.16 s, total: 21.5 s
Wall time: 18.5 s


In [39]:
X_train_counts = X_train_counts[:, ~rem]
print(X_train_counts.shape, X_train_counts.count_nonzero())

(1600475, 1152190) 21862989


In [40]:
%%time

tfidf_transformer = TfidfTransformer(norm='l2', use_idf=False, smooth_idf=True, sublinear_tf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_go = X_train_tfidf
print(X_train_tfidf.shape)

(1600475, 1152190)
CPU times: user 258 ms, sys: 63.8 ms, total: 322 ms
Wall time: 125 ms


In [41]:
sp2 = len(data)
X_train, y_train = X_go[:sp], data.category.values[:sp]
X_test, y_test = X_go[sp:sp2], data.category.values[sp:sp2]
X_train.shape, X_test.shape

((1082816, 1152190), (270704, 1152190))

In [42]:
class_weights = (1 / pd.Series(y_train).value_counts()).to_dict() # switching 1 to len(y) seems to make diff

In [43]:
sample_weight = np.vectorize(class_weights.get)(y_train) # * rel_train

In [44]:
rel = data.label_quality.values[sp:]

In [45]:
%%time
n = sp // 1
clf_sgd = SGDClassifier(loss = 'modified_huber', #n_iter = 12,
                        max_iter=20, tol=1e-5, # try 1e-6 !!
                        alpha = 0.065e-8,
#                     early_stopping=True, validation_fraction = .2, n_iter_no_change = 5,
                    shuffle = False, n_jobs=4).fit(X_train[:n], y_train[:n],
                                                   sample_weight=sample_weight[:n],
                                                  )
clf = clf_sgd

CPU times: user 1h 17min 32s, sys: 6min 6s, total: 1h 23min 38s
Wall time: 17min 42s




In [46]:
%%time

prediction_val = clf.predict(X_test)
print('Val:', bas(y_test, prediction_val))

rel = data.label_quality.values[sp:]
print('Rel:', bas(y_test[rel == 0], prediction_val[rel == 0]))

Val: 0.8623934440796728
Rel: 0.9048376603641123
CPU times: user 15.6 s, sys: 4.72 s, total: 20.3 s
Wall time: 18.9 s




In [47]:
%%time
val_proba = clf.predict_proba(X_test)

CPU times: user 16.9 s, sys: 4.36 s, total: 21.2 s
Wall time: 20.9 s


In [48]:
val_proba = pd.DataFrame(val_proba)

In [49]:
%time val_proba.to_csv('../ensemb2/val_sgd_word-v4.csv', index = False, header = False)

CPU times: user 2min 58s, sys: 990 ms, total: 2min 59s
Wall time: 3min


In [50]:
%%time

y_data = data.category
X_data = X_go[:sp2]
class_weights_data = (1 / pd.Series(y_data).value_counts()).to_dict()
sample_weight_data = np.vectorize(class_weights_data.get)(y_data)
# rel_data =  1 + (1 - data.label_quality.values) * (relfactor - 1)
clf.fit(X_data, y_data, sample_weight=sample_weight_data ) # warm start ?

CPU times: user 1h 31min 26s, sys: 6min 9s, total: 1h 37min 35s
Wall time: 21min 8s


In [51]:
test_proba = clf.predict_proba(X_go[sp2:])

In [52]:
test_proba = pd.DataFrame(test_proba)

In [53]:
%time test_proba.to_csv('../ensemb2/test_sgd_word-v4.csv', index = False, header = False)

CPU times: user 2min 44s, sys: 713 ms, total: 2min 45s
Wall time: 2min 46s
