In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt



import gc
# Explicitly switch on automatic garbage collection for this kernel.
gc.enable()

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score as bas
from sklearn.ensemble import BaggingClassifier

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import NearestCentroid
from sklearn.preprocessing import StandardScaler, FunctionTransformer, MinMaxScaler, MaxAbsScaler

from sklearn.ensemble import RandomForestClassifier
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

import unicodedata
import re

<hr>

In [2]:
# Previously used input file, kept for reference:
# data = pd.read_csv('../data-simplified-1-reduced-wordbal-800.csv')
# Load the labelled rows; the first CSV column becomes the DataFrame index.
data = pd.read_csv('../data-reduced-800-v3-shuffled.csv', index_col = 0)

  mask |= (ar1 == a)


In [3]:
# Load the rows to be predicted; only their `title` column is used further down.
test = pd.read_csv('../test.csv')

In [7]:
# Header-less CSV read into a single `category` column, then turned into a
# dict keyed by the frame's index with the category entries as values.
catcode = (
    pd.read_csv('../data-simplified-1-catcode.csv', header=None, names=['category'])
    .category
    .to_dict()
)

In [8]:
# Truncate displayed cell text at 60 characters.
pd.set_option('display.max_colwidth', 60)

In [9]:
# Quick look at the first rows of the labelled data.
data.head()

Unnamed: 0,title,label_quality,language,category,priorities
473424,Kit 04 Jogo De Lençol De Berço Em Malha 3 Pcs 100% Algodão,0,1,114,6245
7519083,Bomba Submersa 450 W Agua Suja Turva Limpa Bsv 450 Vonder,1,1,1360,4
19488607,Nadador Tiburon Ys1378-5,1,0,1155,54
16895633,Máscara Angry Birds 6un Imbatível,0,1,1102,486
10369454,Aparador Fruteira Madeira De Demolição 1 Gaveta Peroba Rosa,0,1,1288,1075


<hr>

In [12]:
def normalize(curr):
    """Normalise a Series of raw titles into lower-case, accent-free, digit-masked text.

    Steps, in order: strip accents, lower-case, blank out everything except
    ASCII letters, digits, '.' and ',', keep a single '.' only where commas or
    dots sit between two digits, mask every digit as '0', drop a final 's'
    that follows a vowel/w/y, collapse runs of four or more identical letters
    to one, and pad every run of four or more letters with spaces.

    Every pattern is passed with ``regex=True``. ``Series.str.replace`` treats
    the pattern as a literal string by default since pandas 2.0, which would
    silently turn all of the substitutions below into no-ops.

    Parameters
    ----------
    curr : pandas.Series of str
        Raw titles.

    Returns
    -------
    pandas.Series of str
        Same index as the input. Whitespace is not collapsed, so the result
        can contain leading, trailing and repeated spaces.
    """
    # remove accents: decompose, then drop whatever is not ASCII
    curr = curr.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    # to lower case
    curr = curr.str.lower()
    # blank out everything that is not alphanumeric, '.' or ','
    curr = curr.str.replace('[^a-zA-Z0-9.,]', ' ', regex=True)

    # let , and . be the same char
    curr = curr.str.replace('[.]', ',', regex=True)

    # keep a separator only between digits (as a single '.'), drop all others
    curr = curr.str.replace('(?<=[0-9])[,]+(?=[0-9])', '.', regex=True)
    curr = curr.str.replace('[,]', ' ', regex=True)

    # set all digits to 0
    curr = curr.str.replace('[0-9]', '0', regex=True)

    # separate ' <digits><letters ' like in 22g or 12ms
    # curr = curr.str.replace('(^| )([0-9]+)([a-zA-Z]+)($| )', r'\1\2 \3\4')

    # remove some Pt plurals: a final 's' right after a vowel, w or y
    curr = curr.str.replace('\\b([a-zA-Z]+[aeiouwy])(s)\\b', r'\1', regex=True)

    # collapse 4+ consecutive identical letters to one
    # ({3,} counts the repeats *after* the first occurrence, hence 4 or more in total)
    curr = curr.str.replace(r'([a-zA-Z])\1{3,}', r'\1', regex=True)

    # surround every run of 4 or more letters (identical or not) with spaces
    curr = curr.str.replace(r'([a-zA-Z]{4,})', r' \1 ', regex=True)

    # Other ideas:

    return curr

In [13]:
import nltk
# Fetch the NLTK stop-word corpus so that `stopwords.words(...)` can be used.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /store/tveiga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
def norm(w):
    """Strip accents from `w`: NFKD-decompose it, then drop every non-ASCII code point."""
    return unicodedata.normalize('NFKD', w).encode('ASCII', 'ignore').decode('ASCII')


# Accent-free stop words pooled from the three NLTK lists.
all_stopw = set()
for corpus in ('english', 'portuguese', 'spanish'):
    all_stopw |= {norm(word) for word in stopwords.words(corpus)}

In [15]:
# Split Point: the first 80% of the labelled rows (rounded down) are used for fitting.
sp = int(0.8 * len(data))

In [16]:
# Titles of the labelled rows followed by the titles of the test rows.
full = pd.concat([frame[['title']] for frame in (data, test)])

In [17]:
%%time
# Clean every title (labelled + test) in a single pass.
X_full = normalize(full['title'])

CPU times: user 25.2 s, sys: 236 ms, total: 25.4 s
Wall time: 25.1 s


In [18]:
%%time 
# Token -> number of occurrences over all titles (labelled + test).
wordfreq = (
    X_full.str.split(expand=True)
    .stack()
    .value_counts()
    .to_dict()
)

CPU times: user 15.3 s, sys: 649 ms, total: 15.9 s
Wall time: 13.3 s


In [19]:
%%time 
# Tokens that occur exactly once in the whole corpus.
uniquewords = {token for token, count in wordfreq.items() if count == 1}
print(len(uniquewords))

102942
CPU times: user 29.7 ms, sys: 0 ns, total: 29.7 ms
Wall time: 29.5 ms


In [20]:
# Token counts restricted to the test rows (everything after the labelled rows).
testwordfreq = (
    X_full[len(data):]
    .str.split(expand=True)
    .stack()
    .value_counts()
    .to_dict()
)
# Test-row tokens whose corpus-wide count is exactly one.
testuniquewords = {token for token in testwordfreq if wordfreq[token] == 1}
print(len(testuniquewords))

3906


In [22]:
def unkn(x):
    """Placeholder for a token: 'U0' when it contains the character '0', otherwise 'UA'."""
    # ('UFT' if x in ftwords else 'UA')
    if '0' in x:
        return 'U0'
    return 'UA'


def xjoin(s):
    """Join the tokens of `s` with spaces, swapping each token found in `uniquewords` for its `unkn` placeholder."""
    return ' '.join([unkn(w) if w in uniquewords else w for w in s])

In [23]:
%%time
# Tokenise on whitespace, then rebuild each title with `xjoin`.
# NOTE(review): X_full is rebound in place, so this cell reads its own previous output when re-run.
X_full = X_full.str.split().apply(xjoin)

CPU times: user 4.07 s, sys: 44 ms, total: 4.11 s
Wall time: 4.11 s


In [24]:
%%time
def xremo(s):
    """Join the tokens of `s` with spaces, leaving out every token present in `all_stopw`."""
    return ' '.join(w for w in s if w not in all_stopw)


X_full = X_full.str.split().apply(xremo)

CPU times: user 3.63 s, sys: 20 ms, total: 3.65 s
Wall time: 3.65 s


In [26]:
# Plain alias (no copy): the unigram vectorizer reads the cleaned titles unchanged.
X_full_1gram = X_full

In [27]:
%%time

# Binary presence features for single tokens seen in at least two titles;
# case is left untouched because the text was lower-cased beforehand.
covec_1gram = CountVectorizer(
    binary=True,
    min_df=2,
    lowercase=False,
    ngram_range=(1, 1),
)
X_covec_1gram = covec_1gram.fit_transform(X_full_1gram)
print(X_covec_1gram.shape, X_covec_1gram.count_nonzero())

(1622778, 128577) 10776601
CPU times: user 8.76 s, sys: 164 ms, total: 8.92 s
Wall time: 8.92 s


In [28]:
# Share of titles containing each unigram (column sum of the binary matrix / row count).
docfreq1 = np.asarray(X_covec_1gram.sum(axis=0)).ravel() / X_covec_1gram.shape[0]
# Column index -> term, i.e. the vectorizer's vocabulary inverted.
inv_vocab1 = {column: term for term, column in covec_1gram.vocabulary_.items()}

In [29]:
# Terms of the 200 unigram columns with the highest document frequency, in ascending order.
np.vectorize(inv_vocab1.get)(np.argsort(docfreq1)[-200:])

array(['alto', 'bicicleta', 'bandeja', 'cuero', 'serie', 'silla', 'banco',
       'guitarra', 'nueva', 'carro', 'yamaha', 'anti', 'vermelho', 'tubo',
       'chave', '00g', 'modulo', 'couro', 'cor', 'philips', 'box',
       'branca', 'painel', 'bluetooth', 'eletrico', 'vidrio', 'electrico',
       'piso', 'camera', 'freio', 'natural', 'lote', 'radio', 'bivolt',
       '000000', 'tapa', 'soporte', 'unid', 'cable', 'cadeira',
       'universal', 'entrega', 'controle', 'pc', 'auto', 'renault',
       'manual', 'profesional', 'funda', 'lente', 'parede', 'papel', '0v',
       'cama', 'mts', 'uso', 'cinta', 'hp', 'peugeot', 'camara', 'alta',
       '00w', 'adesivo', 'brinde', '0gb', 'aire', 'gol', 'marca', 'ga',
       'premium', 'doble', 'hd', '000mm', 'gel', '00kg', 'ano', 'color',
       'protetor', 'blanco', 'plu', 'metal', 'conjunto', 'piscina',
       'mascara', 'notebook', '0kg', 'core', 'sony', 'tampa', 'tela',
       'samsung', 'eletrica', 'x0', 'ml', 'dvd', '00m', 'combo', 'a0',
  

In [30]:
# The ten largest unigram document frequencies, in ascending order.
np.sort(docfreq1)[-10:]

array([0.01394337, 0.01429093, 0.01489791, 0.01826991, 0.02725018,
       0.0476091 , 0.05859951, 0.09254685, 0.10779478, 0.19641812])

In [31]:
# Wrap every title in the marker tokens 'SS' (front) and 'EE' (back) before bigram extraction.
X_full_2gram = 'SS ' + X_full + ' EE'

In [32]:
%%time

# Binary presence features for token pairs seen in at least two titles.
covec_2gram = CountVectorizer(
    binary=True,
    min_df=2,
    lowercase=False,
    ngram_range=(2, 2),
)
X_covec_2gram = covec_2gram.fit_transform(X_full_2gram)
print(X_covec_2gram.shape, X_covec_2gram.count_nonzero())

(1622778, 1137791) 10475416
CPU times: user 29.2 s, sys: 384 ms, total: 29.6 s
Wall time: 28.9 s


In [33]:
# Share of titles containing each bigram (column sum of the binary matrix / row count).
docfreq2 = np.asarray(X_covec_2gram.sum(axis=0)).ravel() / X_covec_2gram.shape[0]
# Column index -> bigram, i.e. the vectorizer's vocabulary inverted.
inv_vocab2 = {column: term for term, column in covec_2gram.vocabulary_.items()}

In [34]:
# Bigrams of the 200 columns with the highest document frequency, in ascending order.
np.vectorize(inv_vocab2.get)(np.argsort(docfreq2)[-200:])

array(['SS chave', 'SS escova', '00v EE', 'inox EE', '00 pcs',
       'bivolt EE', 'impecable EE', 'SS cortina', 'SS camera',
       'pedido EE', '00kg EE', 'excelente EE', 'SS radiador', 'SS 0000',
       'SS faca', 'mts EE', 'usb EE', 'regalo EE', 'control remoto',
       'preta EE', 'SS guitarra', 'verde EE', 'SS paleta', 'SS mochila',
       'SS aceite', '0m EE', 'SS torneira', '00 lts', 'moto EE',
       'core EE', 'SS tanque', 'SS chaleco', 'nova EE', 'SS cortador',
       'SS torno', 'SS horno', 'pack 00', 'SS saco', 'audi a0',
       'SS bicicleta', 'controle remoto', '000 litro', 'bomba agua',
       'SS detector', 'nueva EE', 'UA UA', 'SS fita', 'SS reloj',
       'SS tapa', 'black decker', 'SS aparelho', '00x00 EE', 'SS radio',
       'SS camisa', 'ar condicionado', 'SS carregador', '00m EE',
       '000 mm', 'SS sillon', 'SS alicate', 'SS balanca', 'SS lampara',
       'SS camara', 'SS antena', 'SS cable', 'SS base', '00 unid',
       'mm EE', '00000000 EE', 'SS bota', 'SS 

In [35]:
# The ten largest bigram document frequencies, in ascending order.
np.sort(docfreq2)[-10:]

array([0.00972222, 0.01049743, 0.01198439, 0.01218281, 0.01652228,
       0.02074221, 0.0283711 , 0.03207709, 0.03661068, 0.04596069])

In [36]:
# Unigram columns first, bigram columns after them, in a single CSR matrix.
X_train_counts = sparse.hstack((X_covec_1gram, X_covec_2gram), format='csr')
print(X_train_counts.shape, X_train_counts.count_nonzero())

(1622778, 1266368) 21252017


In [37]:
def sbc(x):
    """Flag columns of a sparse binary matrix that duplicate an earlier column.

    For every column pair (i, j) with i < j that shares at least one row, the
    Jaccard similarity is |i & j| / (|i| + |j| - |i & j|). It equals 1 exactly
    when the shared count matches both column sums, so that test is done on
    the integer counts directly instead of dividing and comparing floats.

    Parameters
    ----------
    x : scipy.sparse matrix or array, shape (n_rows, n_cols)
        Expected to hold 0/1 entries; with other values the counts below are
        no longer set cardinalities.

    Returns
    -------
    numpy.ndarray of bool, shape (n_cols,)
        True for column j when some column i < j has the same non-zero rows.
        The first column of each group of duplicates stays False. All-zero
        columns share no row with any column and are never flagged.
    """
    # `@` is a matrix product for both sparse matrices and sparse arrays
    # (`*` is element-wise on the latter). Keep only pairs with row < col.
    cx = sparse.triu(x.T @ x, k=1, format='coo')
    card = np.asarray(x.sum(axis=0)).ravel()
    # Jaccard == 1  <=>  shared count == both column sums (and is non-zero).
    identical = (cx.data > 0) & (cx.data == card[cx.row]) & (cx.data == card[cx.col])
    rem = np.zeros(x.shape[1], dtype=bool)
    rem[cx.col[identical]] = True
    return rem

In [39]:
%%time
# Boolean mask over the feature columns; its mean is the fraction of columns flagged.
rem = sbc(X_train_counts)
print(rem.mean())

0.08779596452216101
CPU times: user 16.7 s, sys: 1.66 s, total: 18.4 s
Wall time: 15.6 s


In [40]:
# Keep only the columns that were not flagged in `rem`.
# NOTE(review): X_train_counts is rebound, so re-running this cell on its own would apply
# the old mask to the already-filtered matrix; re-run from the hstack cell instead.
X_train_counts = X_train_counts[:, ~rem]
print(X_train_counts.shape, X_train_counts.count_nonzero())

(1622778, 1155186) 20921274


In [41]:
# Feature matrix handed to the model; rebound to the normalised matrix in the TfidfTransformer cell.
X_go = X_train_counts

In [42]:
%%time

# IDF weighting is switched off (use_idf=False); rows are normalised with norm='l2'.
tfidf_transformer = TfidfTransformer(
    norm='l2',
    use_idf=False,
    smooth_idf=True,
    sublinear_tf=False,
)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_go = X_train_tfidf
print(X_train_tfidf.shape)

(1622778, 1155186)
CPU times: user 303 ms, sys: 43.9 ms, total: 347 ms
Wall time: 124 ms


In [43]:
# Rows [0, sp) are used for fitting, rows [sp, sp2) for validation;
# rows from sp2 onwards of X_go belong to the unlabelled test titles.
sp2 = len(data)
X_train, X_test = X_go[:sp], X_go[sp:sp2]
y_train, y_test = data.category.values[:sp], data.category.values[sp:sp2]
X_train.shape, X_test.shape

((1100658, 1155186), (275165, 1155186))

In [44]:
# Inverse class frequency: category -> 1 / (number of training rows with that category).
# switching 1 to len(y) seems to make diff
class_weights = pd.Series(y_train).value_counts().rtruediv(1).to_dict()

In [45]:
# Per-row weight: look up each training label's entry in `class_weights`.
sample_weight = np.vectorize(class_weights.get)(y_train) # * rel_train

In [46]:
# label_quality of the validation rows (positions sp onwards of `data`); recomputed in the evaluation cell.
rel = data.label_quality.values[sp:]

In [47]:
%%time
# Number of training rows actually used; raise the divisor to fit on a prefix only.
n = sp // 1
clf_sgd = SGDClassifier(
    loss='modified_huber',  # n_iter = 12,
    max_iter=20,
    tol=1e-5,  # try 1e-6 !!
    alpha=0.065e-8,
    # early_stopping=True, validation_fraction = .2, n_iter_no_change = 5,
    shuffle=False,
    n_jobs=4,
)
clf_sgd.fit(X_train[:n], y_train[:n], sample_weight=sample_weight[:n])
clf = clf_sgd

CPU times: user 1h 12min 38s, sys: 5min 54s, total: 1h 18min 33s
Wall time: 16min 55s




In [48]:
%%time

# Balanced accuracy on every validation row.
prediction_val = clf.predict(X_test)
print('Val:', bas(y_test, prediction_val))

# Same metric restricted to the rows with label_quality == 0.
# NOTE(review): presumably 0 marks the reliable labels (the score is printed as 'Rel') — confirm.
rel = data.label_quality.values[sp:]
print('Rel:', bas(y_test[rel == 0], prediction_val[rel == 0]))

Val: 0.8648353168144917
Rel: 0.9047296235948962
CPU times: user 15.7 s, sys: 4.66 s, total: 20.3 s
Wall time: 19.2 s




In [49]:
%%time
# Per-class scores for the validation rows, saved below as input for a later ensemble step.
val_proba = clf.predict_proba(X_test)

CPU times: user 16.1 s, sys: 4.64 s, total: 20.8 s
Wall time: 20.8 s


In [50]:
# Wrap the array in a DataFrame so it can be written with to_csv.
val_proba = pd.DataFrame(val_proba)

In [51]:
# Write the validation probabilities as a bare matrix: no index column, no header row.
%time val_proba.to_csv('../ensemb3/val_sgd_word-v7.csv', index = False, header = False)

CPU times: user 2min 43s, sys: 794 ms, total: 2min 44s
Wall time: 2min 46s


In [52]:
%%time

# Refit on every labelled row (training + validation) before scoring the test titles.
y_data = data.category
X_data = X_go[:sp2]
# Inverse class frequency over all labelled rows, then one weight per row.
class_weights_data = (1 / pd.Series(y_data).value_counts()).to_dict()
sample_weight_data = np.vectorize(class_weights_data.get)(y_data)
# rel_data =  1 + (1 - data.label_quality.values) * (relfactor - 1)
# `clf` is the same object as `clf_sgd`, so this call replaces the model that was validated above.
# NOTE(review): predict_proba columns follow the classes seen by each fit; if the 80% split
# misses a category, the validation and test CSVs would not share a column layout — confirm.
clf.fit(X_data, y_data, sample_weight=sample_weight_data ) # warm start ?

CPU times: user 1h 29min 48s, sys: 6min 2s, total: 1h 35min 50s
Wall time: 21min 11s


In [53]:
# Per-class scores for the rows after the labelled block, i.e. the titles taken from `test`.
test_proba = clf.predict_proba(X_go[sp2:])

In [54]:
# Wrap the array in a DataFrame so it can be written with to_csv.
test_proba = pd.DataFrame(test_proba)

In [55]:
# Write the test probabilities as a bare matrix: no index column, no header row.
%time test_proba.to_csv('../ensemb3/test_sgd_word-v7.csv', index = False, header = False)

CPU times: user 2min 30s, sys: 786 ms, total: 2min 30s
Wall time: 2min 32s
