In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt



import gc
gc.enable()

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score as bas
from sklearn.ensemble import BaggingClassifier

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import NearestCentroid
from sklearn.preprocessing import StandardScaler, FunctionTransformer, MinMaxScaler, MaxAbsScaler

from sklearn.ensemble import RandomForestClassifier
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

import unicodedata
import re

<hr>

In [2]:
# Earlier input file, kept for reference:
# data = pd.read_csv('../data-simplified-1-reduced-wordbal-800.csv')
# Load the labelled titles; the CSV's first column becomes the DataFrame index.
data = pd.read_csv('../data-reduced-800-v3-shuffled.csv', index_col = 0)

  mask |= (ar1 == a)


In [3]:
# Load the test set; its `title` column is later appended to the labelled titles.
test = pd.read_csv('../test.csv')

In [4]:
# Read the header-less category file into a dict.
# Assuming the file has a single column, the keys are 0-based row positions and
# the values are the category entries -- TODO confirm the file layout.
catcode = pd.read_csv('../data-simplified-1-catcode.csv', header = None, names = ['category'])['category'].to_dict()

In [5]:
# Truncate long cell text (e.g. titles) to 60 characters when frames are displayed.
pd.options.display.max_colwidth = 60

In [6]:
# Preview the first rows of the labelled data.
data.head()

Unnamed: 0,title,label_quality,language,category,priorities
473424,Kit 04 Jogo De Lençol De Berço Em Malha 3 Pcs 100% Algodão,0,1,114,6245
7519083,Bomba Submersa 450 W Agua Suja Turva Limpa Bsv 450 Vonder,1,1,1360,4
19488607,Nadador Tiburon Ys1378-5,1,0,1155,54
16895633,Máscara Angry Birds 6un Imbatível,0,1,1102,486
10369454,Aparador Fruteira Madeira De Demolição 1 Gaveta Peroba Rosa,0,1,1288,1075


<hr>

In [9]:
def normalize(curr):
    """Canonicalise a Series of product titles before tokenisation.

    The steps run in this order:

    1. strip accents (NFKD decomposition, non-ASCII code points dropped);
    2. lowercase;
    3. blank out every character that is not an ASCII letter, digit, '.' or ',';
    4. keep '.'/',' only when they sit between two digits (rewritten as '.'),
       turn every other occurrence into a space;
    5. mask every digit as '0';
    6. drop a trailing 's' from words ending in vowel/w/y + 's';
    7. collapse a run of four or more identical letters to a single letter;
    8. pad every run of four or more letters with spaces, which detaches such
       runs from adjacent digits.

    Every ``str.replace`` call passes ``regex=True`` explicitly.  pandas >= 2.0
    treats the pattern as a literal string by default, which would make all of
    the substitutions below silently match nothing.

    Parameters
    ----------
    curr : pandas.Series
        Series of strings (missing values are propagated by the ``.str`` accessor).

    Returns
    -------
    pandas.Series
        The transformed titles, same index as the input.
    """
    # 1. remove accents
    curr = curr.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    # 2. to lower case
    curr = curr.str.lower()
    # 3. remove everything that is not alphanumeric, '.' or ','
    curr = curr.str.replace(r'[^a-zA-Z0-9.,]', ' ', regex=True)

    # 4a. let ',' and '.' be the same character
    curr = curr.str.replace(r'[.]', ',', regex=True)

    # 4b. separators between two digits become a single '.', all others a space
    curr = curr.str.replace(r'(?<=[0-9])[,]+(?=[0-9])', '.', regex=True)
    curr = curr.str.replace(r'[,]', ' ', regex=True)

    # 5. set all digits to 0
    curr = curr.str.replace(r'[0-9]', '0', regex=True)

    # Disabled idea: separate ' <digits><letters> ' like in 22g or 12ms
    # curr = curr.str.replace('(^| )([0-9]+)([a-zA-Z]+)($| )', r'\1\2 \3\4', regex=True)

    # 6. remove some Portuguese/Spanish plurals
    curr = curr.str.replace(r'\b([a-zA-Z]+[aeiouwy])(s)\b', r'\1', regex=True)

    # 7. one letter followed by three or more repeats of itself (i.e. a run of
    #    at least four) collapses to that single letter
    curr = curr.str.replace(r'([a-zA-Z])\1{3,}', r'\1', regex=True)

    # 8. surround runs of four or more letters (same or different) with spaces
    curr = curr.str.replace(r'([a-zA-Z]{4,})', r' \1 ', regex=True)

    return curr

In [10]:
import nltk
# Make sure the NLTK stop-word corpus is available locally before it is read.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /store/tveiga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def norm(w):
    """Return `w` without accents: NFKD-decompose, then drop every non-ASCII character."""
    decomposed = unicodedata.normalize('NFKD', w)
    return decomposed.encode('ASCII', 'ignore').decode('ASCII')


# Pool the accent-stripped stop words of the three corpora into a single set.
all_stopw = set()
for corpus in ('english', 'portuguese', 'spanish'):
    all_stopw |= {norm(word) for word in stopwords.words(corpus)}

In [12]:
# Row position at 80% of the labelled data; used later as the train/validation boundary.
sp = int(len(data) * 0.8) # Split Point

In [13]:
# Stack labelled titles on top of test titles so both get identical preprocessing.
# The original indexes are kept by concat, so later cells slice this by position.
full = pd.concat([data[['title']], test[['title']]])

In [14]:
%%time
# Take the titles as a Series and canonicalise them with normalize().
X_full = full.title
X_full = normalize(X_full)

CPU times: user 24.9 s, sys: 187 ms, total: 25.1 s
Wall time: 24.6 s


In [15]:
%%time 
# Occurrence count of every whitespace-separated token over all titles (labelled + test).
wordfreq = X_full.str.split(expand=True).stack().value_counts().to_dict()

CPU times: user 15 s, sys: 628 ms, total: 15.6 s
Wall time: 13.1 s


In [16]:
%%time 
# Tokens that occur exactly once in the whole corpus.
uniquewords = {token for token, count in wordfreq.items() if count == 1}
print(len(uniquewords))

102942
CPU times: user 30.1 ms, sys: 41 µs, total: 30.2 ms
Wall time: 30 ms


In [17]:
# Token counts restricted to the test titles (the rows after the labelled data).
testwordfreq = X_full[len(data):].str.split(expand=True).stack().value_counts().to_dict()
# Test tokens whose count over the whole corpus is exactly one.
testuniquewords = {token for token in testwordfreq if wordfreq[token] == 1}
print(len(testuniquewords))

3906


In [18]:
def unkn(x):
    """Placeholder for a rare token: 'U0' when it contains a '0', otherwise 'UA'."""
    # Disabled variant of the fallback: ('UFT' if x in ftwords else 'UA')
    if '0' in x:
        return 'U0'
    return 'UA'


def xjoin(s):
    """Join the tokens in `s` with spaces, replacing each token found in `uniquewords` by its placeholder."""
    return ' '.join(unkn(w) if w in uniquewords else w for w in s)

In [19]:
%%time
# Tokenise each title and swap single-occurrence tokens for the U0/UA placeholders.
X_full = X_full.str.split().apply(xjoin)

CPU times: user 3.73 s, sys: 40.1 ms, total: 3.77 s
Wall time: 3.77 s


In [20]:
%%time
def xremo(s):
    """Join with spaces the tokens of `s` that are not in `all_stopw`."""
    return ' '.join(w for w in s if w not in all_stopw)


# Tokenise each title again and drop the stop words.
X_full = X_full.str.split().apply(xremo)

CPU times: user 3.74 s, sys: 40 ms, total: 3.78 s
Wall time: 3.78 s


In [21]:
# Unigram input is the cleaned titles as they are (same object, not a copy).
X_full_1gram = X_full

In [22]:
%%time

# Binary unigram indicators: a term is kept only if it appears in at least two
# titles (min_df=2); case is left untouched (lowercase=False).
covec_1gram = CountVectorizer(binary = True, min_df= 2, lowercase=False,
                             ngram_range=(1,1),)
X_covec_1gram = covec_1gram.fit_transform(X_full_1gram)
# Matrix shape and number of stored non-zero entries.
print(X_covec_1gram.shape, X_covec_1gram.count_nonzero())

(1622778, 128577) 10776601
CPU times: user 8.05 s, sys: 172 ms, total: 8.22 s
Wall time: 8.29 s


In [23]:
# Fraction of titles containing each unigram (the matrix is binary, so a column
# sum is the number of titles with that term).
docfreq1 = np.array(X_covec_1gram.sum(axis = 0)).flatten() / X_covec_1gram.shape[0]
# Reverse lookup: column index -> term.
inv_vocab1 = {v : k for k,v in covec_1gram.vocabulary_.items()}

In [24]:
# The 200 most frequent unigrams, listed in ascending document frequency.
np.vectorize(inv_vocab1.get)(np.argsort(docfreq1)[-200:])

array(['alto', 'bicicleta', 'bandeja', 'cuero', 'serie', 'silla', 'banco',
       'guitarra', 'nueva', 'carro', 'yamaha', 'anti', 'vermelho', 'tubo',
       'chave', '00g', 'modulo', 'couro', 'cor', 'philips', 'box',
       'branca', 'painel', 'bluetooth', 'eletrico', 'vidrio', 'electrico',
       'piso', 'camera', 'freio', 'natural', 'lote', 'radio', 'bivolt',
       '000000', 'tapa', 'soporte', 'unid', 'cable', 'cadeira',
       'universal', 'entrega', 'controle', 'pc', 'auto', 'renault',
       'manual', 'profesional', 'funda', 'lente', 'parede', 'papel', '0v',
       'cama', 'mts', 'uso', 'cinta', 'hp', 'peugeot', 'camara', 'alta',
       '00w', 'adesivo', 'brinde', '0gb', 'aire', 'gol', 'marca', 'ga',
       'premium', 'doble', 'hd', '000mm', 'gel', '00kg', 'ano', 'color',
       'protetor', 'blanco', 'plu', 'metal', 'conjunto', 'piscina',
       'mascara', 'notebook', '0kg', 'core', 'sony', 'tampa', 'tela',
       'samsung', 'eletrica', 'x0', 'ml', 'dvd', '00m', 'combo', 'a0',
  

In [25]:
# Document frequencies of the ten most common unigrams.
np.sort(docfreq1)[-10:]

array([0.01394337, 0.01429093, 0.01489791, 0.01826991, 0.02725018,
       0.0476091 , 0.05859951, 0.09254685, 0.10779478, 0.19641812])

In [26]:
# Prepend a start marker so the first word of a title also forms a bigram
# ('SS <word>'); the matching end marker is currently disabled.
X_full_2gram = 'SS ' + X_full # + ' EE'

In [27]:
%%time

# Binary bigram indicators, same settings as the unigram vectoriser except for
# ngram_range=(2,2).
covec_2gram = CountVectorizer(binary = True, min_df= 2, lowercase=False,
                             ngram_range=(2,2),)
X_covec_2gram = covec_2gram.fit_transform(X_full_2gram)
# Matrix shape and number of stored non-zero entries.
print(X_covec_2gram.shape, X_covec_2gram.count_nonzero())

(1622778, 1073510) 8873701
CPU times: user 25 s, sys: 308 ms, total: 25.3 s
Wall time: 24.7 s


In [28]:
# Fraction of titles containing each bigram (binary matrix, so column sums are
# document counts).
docfreq2 = np.array(X_covec_2gram.sum(axis = 0)).flatten() / X_covec_2gram.shape[0]
# Reverse lookup: column index -> bigram.
inv_vocab2 = {v : k for k,v in covec_2gram.vocabulary_.items()}

In [29]:
# The 200 most frequent bigrams, listed in ascending document frequency.
np.vectorize(inv_vocab2.get)(np.argsort(docfreq2)[-200:])

array(['00 pare', 'SS puerta', 'mercede benz', '00 UA', 'SS globo',
       'SS llave', 'SS control', 'SS carro', 'SS impresora', 'aro 00',
       'SS entrada', 'kit 000', '000 cm', 'SS flauta', 'SS gel',
       'SS polia', 'SS fonte', '00 ano', 'SS taco', 'SS tapete',
       'SS adaptador', 'SS colchao', 'SS tinta', 'tp link', 'SS andador',
       'SS pintura', 'SS monitor', 'SS amplificador', 'SS leitor',
       'SS furadeira', 'SS aparador', 'SS caneta', 'SS roda', 'SS tubo',
       'SS oferta', 'SS tabla', 'SS pulseira', 'SS vaso', 'ano 0000',
       'ping pong', 'SS cortadora', 'SS ducha', 'SS papel',
       'SS aquecedor', 'SS piso', '00 00mm', 'SS cargador', 'SS pileta',
       'SS bolso', 'UA 000', '00x00 cm', 'SS garrafa', '00x0 00',
       'SS camiseta', '000 000v', 'SS pistola', 'SS cilindro',
       '00 pulgada', 'SS campera', 'SS pinza', 'SS chave', 'SS escova',
       '00 pcs', 'SS cortina', 'SS camera', 'SS radiador', 'SS 0000',
       'SS faca', 'control remoto', 'SS gui

In [30]:
# Document frequencies of the ten most common bigrams.
np.sort(docfreq2)[-10:]

array([0.00621157, 0.00703978, 0.00711065, 0.00778295, 0.00959281,
       0.00972222, 0.01218281, 0.01652228, 0.0283711 , 0.04596069])

In [31]:
# Place unigram and bigram indicator columns side by side in one CSR matrix
# (unigram columns first).
X_train_counts = sparse.hstack([X_covec_1gram, X_covec_2gram], format = 'csr')
print(X_train_counts.shape, X_train_counts.count_nonzero())

(1622778, 1202087) 19650302


In [32]:
def sbc(x):
    """Flag columns of a sparse binary matrix that duplicate an earlier column.

    "Sparse binary correlation": for every pair of columns ``i < j`` that share
    at least one row, the Jaccard similarity
    ``|i & j| / (|i| + |j| - |i & j|)`` is evaluated; column ``j`` is flagged
    when it equals 1 for some ``i``.  For a 0/1 matrix that means column ``j``
    is non-zero in exactly the same rows as column ``i``.  Only the
    higher-index column of such a pair is flagged, so one representative of
    each group of identical columns survives.  All-zero columns share no row
    with any column and are never flagged.

    Parameters
    ----------
    x : scipy.sparse matrix or array, shape (n_rows, n_cols)
        Expected to hold 0/1 values; column sums are used as set sizes.

    Returns
    -------
    numpy.ndarray of bool, shape (n_cols,)
        True for every column that should be dropped as redundant.
    """
    # `@` is a matrix product for both sparse matrices and sparse arrays,
    # whereas `*` is element-wise on sparse arrays.
    overlap = sparse.triu(x.T @ x, k=1, format='coo')
    card = np.asarray(x.sum(axis=0)).ravel()

    # Jaccard == 1  <=>  intersection == union; compared on the raw counts so
    # no floating-point ratio is needed.
    union = card[overlap.row] + card[overlap.col] - overlap.data
    perfect = (overlap.data == union) & (overlap.data != 0)

    redundant = np.zeros(x.shape[1], dtype=bool)
    redundant[overlap.col[perfect]] = True
    return redundant

In [33]:
%%time
# Boolean mask of redundant feature columns; its mean is the flagged fraction.
rem = sbc(X_train_counts)
print(rem.mean())

0.08644382644517411
CPU times: user 13.8 s, sys: 1.38 s, total: 15.2 s
Wall time: 12.8 s


In [34]:
# Keep only the columns that were not flagged.
# Note: this rebinds X_train_counts, so re-running the cell with the old mask
# fails or drops the wrong columns.
X_train_counts = X_train_counts[:, ~rem]
print(X_train_counts.shape, X_train_counts.count_nonzero())

(1622778, 1098174) 19359372


In [35]:
# X_go is the feature matrix that later cells split and feed to the classifier;
# start with the raw indicators.
X_go = X_train_counts

In [36]:
%%time

# With use_idf=False no IDF weighting is applied (smooth_idf then has no
# effect): each row is only rescaled to unit L2 norm.
tfidf_transformer = TfidfTransformer(norm='l2', use_idf=False, smooth_idf=True, sublinear_tf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# From here on the normalised matrix is the one used for modelling.
X_go = X_train_tfidf
print(X_train_tfidf.shape)

(1622778, 1098174)
CPU times: user 256 ms, sys: 39.9 ms, total: 296 ms
Wall time: 110 ms


In [37]:
# Positional split of the feature rows: [0, sp) is training, [sp, sp2) is
# validation; rows from sp2 onwards come from the test titles.
sp2 = len(data)
X_train, y_train = X_go[:sp], data.category.values[:sp]
X_test, y_test = X_go[sp:sp2], data.category.values[sp:sp2]
# Display both shapes as a sanity check.
X_train.shape, X_test.shape

((1100658, 1098174), (275165, 1098174))

In [38]:
# Weight of a class = 1 / (number of training rows with that class).
class_weights = (1 / pd.Series(y_train).value_counts()).to_dict() # switching 1 to len(y) seems to make diff

In [39]:
# Per-row weight, looked up from the row's class.
sample_weight = np.vectorize(class_weights.get)(y_train) # * rel_train

In [40]:
# label_quality values of the validation rows (everything in `data` from sp on).
rel = data.label_quality.values[sp:]

In [41]:
%%time
# n limits the fit to the first n training rows; sp // 1 == sp uses all of them.
n = sp // 1
# Multinomial naive Bayes with very light smoothing and a uniform class prior
# (fit_prior=False), fitted with the inverse-frequency row weights.
clf_mnb = MultinomialNB(alpha = 3e-5, fit_prior=False).fit(X_train[:n], y_train[:n],
                                                   sample_weight=sample_weight[:n],
                                                              )
clf = clf_mnb

CPU times: user 52.4 s, sys: 22.3 s, total: 1min 14s
Wall time: 2min 6s


In [42]:
%%time

# Balanced accuracy over all validation rows.
prediction_val = clf.predict(X_test)
print('Val:', bas(y_test, prediction_val))

# Same metric restricted to rows with label_quality == 0 -- presumably the
# reliably labelled rows; confirm the encoding of label_quality.
rel = data.label_quality.values[sp:]
print('Rel:', bas(y_test[rel == 0], prediction_val[rel == 0]))

Val: 0.8502886983706467
Rel: 0.8929651006339197
CPU times: user 13.9 s, sys: 4.32 s, total: 18.3 s
Wall time: 19.8 s




In [43]:
%%time
# Class-probability matrix for the validation rows (one column per class).
val_proba = clf.predict_proba(X_test)

CPU times: user 14.6 s, sys: 6.6 s, total: 21.2 s
Wall time: 21.4 s


In [44]:
# Wrap the array in a DataFrame so it can be written out with to_csv.
val_proba = pd.DataFrame(val_proba)

In [45]:
# Write the validation probabilities as bare numbers: no index column, no header row.
%time val_proba.to_csv('../ensemb3/val_mnb.csv', index = False, header = False)

CPU times: user 7min 15s, sys: 3.19 s, total: 7min 18s
Wall time: 7min 28s


In [37]:
# Labels of every labelled row (training and validation together).
y_data = data.category

In [42]:
# Number of labelled rows; used below as the boundary between labelled and test rows of X_go.
sp2 = len(y_data)

In [38]:
# del X_full
# del X_full_1gram
# del X_full_2gram
# del X_train_counts
# del data

In [39]:
# del clf
# Fresh, unfitted classifier with the same hyper-parameters as the validation run.
clf_mnb = MultinomialNB(alpha = 3e-5, fit_prior=False)
clf = clf_mnb
# Force a garbage-collection pass before the large final fit.
gc.collect()

250

In [43]:
%%time

# y_data = data.category
# Final fit on every labelled row (the first sp2 rows of the feature matrix).
X_data = X_go[:sp2]
# Inverse-frequency class weights, recomputed over all labelled rows.
class_weights_data = (1 / pd.Series(y_data).value_counts()).to_dict()
sample_weight_data = np.vectorize(class_weights_data.get)(y_data)
# Disabled idea: additionally weight rows by label quality.
# rel_data =  1 + (1 - data.label_quality.values) * (relfactor - 1)
clf.fit(X_data, y_data, sample_weight=sample_weight_data ) # warm start ?

CPU times: user 57.1 s, sys: 23.1 s, total: 1min 20s
Wall time: 2min 24s


In [44]:
# Class probabilities for the test rows (everything in X_go after the labelled rows).
test_proba = clf.predict_proba(X_go[sp2:])

In [45]:
# Wrap the array in a DataFrame so it can be written out with to_csv.
test_proba = pd.DataFrame(test_proba)

In [46]:
# Write the test probabilities as bare numbers: no index column, no header row.
%time test_proba.to_csv('../ensemb3/test_mnb.csv', index = False, header = False)

CPU times: user 6min 16s, sys: 2.89 s, total: 6min 19s
Wall time: 6min 28s
