In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt



import gc
gc.enable()

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score as bas
from sklearn.ensemble import BaggingClassifier

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import NearestCentroid
from sklearn.preprocessing import StandardScaler, FunctionTransformer, MinMaxScaler, MaxAbsScaler

from sklearn.ensemble import RandomForestClassifier
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

import unicodedata
import re

<hr>

In [2]:
# Labelled data set; the first CSV column is used as the row index.
# Earlier input file, kept for reference:
# data = pd.read_csv('../data-simplified-1-reduced-wordbal-800.csv')
data = pd.read_csv('../data-reduced-800-v3-shuffled.csv', index_col = 0)

  mask |= (ar1 == a)


In [3]:
# Test file; only its `title` column is used in this notebook.
test = pd.read_csv('../test.csv')

In [4]:
# Read the headerless one-column file and map row position -> value of that
# column (presumably category code -> category name; verify against the file).
catcode_frame = pd.read_csv(
    '../data-simplified-1-catcode.csv',
    header=None,
    names=['category'],
)
catcode = catcode_frame['category'].to_dict()

In [5]:
pd.options.display.max_colwidth = 60

In [6]:
data.head()

Unnamed: 0,title,label_quality,language,category,priorities
473424,Kit 04 Jogo De Lençol De Berço Em Malha 3 Pcs 100% Algodão,0,1,114,6245
7519083,Bomba Submersa 450 W Agua Suja Turva Limpa Bsv 450 Vonder,1,1,1360,4
19488607,Nadador Tiburon Ys1378-5,1,0,1155,54
16895633,Máscara Angry Birds 6un Imbatível,0,1,1102,486
10369454,Aparador Fruteira Madeira De Demolição 1 Gaveta Peroba Rosa,0,1,1288,1075


<hr>

In [9]:
def normalize(curr):
    """Normalize a Series of titles before tokenization / n-gram extraction.

    The steps are applied in order: strip accents, lower-case, blank out every
    character that is not an ASCII letter, digit, '.' or ',', keep '.'/',' only
    as a '.' between two digits, replace every digit with '0', strip a trailing
    's' from words ending in a vowel/w/y + 's', collapse runs of four or more
    identical letters to one, and surround every run of four or more letters
    with spaces.

    Parameters
    ----------
    curr : pandas.Series of str
        Raw titles.

    Returns
    -------
    pandas.Series of str
        Normalized titles. Whitespace is not collapsed, so the result may
        contain repeated, leading or trailing spaces.
    """
    # Every pattern below is a regular expression. pandas >= 2.0 defaults
    # ``Series.str.replace`` to ``regex=False``, so ``regex=True`` is passed
    # explicitly; without it the patterns would be matched as literal text.

    # remove accents: decompose characters, then drop everything non-ASCII
    curr = curr.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    # to lower case
    curr = curr.str.lower()
    # replace anything that is not alphanumeric, '.' or ',' with a space
    curr = curr.str.replace('[^a-zA-Z0-9.,]', ' ', regex=True)

    # let , and . be the same char
    curr = curr.str.replace('[.]', ',', regex=True)

    # separators between two digits become '.', every other one becomes a space
    curr = curr.str.replace('(?<=[0-9])[,]+(?=[0-9])', '.', regex=True)
    curr = curr.str.replace('[,]', ' ', regex=True)

    # set all digits to 0
    curr = curr.str.replace('[0-9]', '0', regex=True)

    # separate ' <digits><letters ' like in 22g or 12ms
    # curr = curr.str.replace('(^| )([0-9]+)([a-zA-Z]+)($| )', r'\1\2 \3\4')

    # remove some Pt plurals: drop a word-final 's' that follows a vowel, w or y
    curr = curr.str.replace(r'\b([a-zA-Z]+[aeiouwy])(s)\b', r'\1', regex=True)

    # collapse four or more consecutive identical letters to a single one
    # ({3,} counts the repeats after the first occurrence)
    curr = curr.str.replace(r'([a-zA-Z])\1{3,}', r'\1', regex=True)

    # put spaces around every run of four or more letters (different or not)
    curr = curr.str.replace(r'([a-zA-Z]{4,})', r' \1 ', regex=True)

    return curr

In [10]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /store/tveiga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def norm(w):
    """Return `w` without accents: NFKD-decompose it, then drop non-ASCII characters."""
    ascii_bytes = unicodedata.normalize('NFKD', w).encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')


# Accent-free stopwords of the three corpora, merged into a single set.
all_stopw = set()
for corpus in ['english', 'portuguese', 'spanish']:
    all_stopw |= {norm(word) for word in stopwords.words(corpus)}

In [12]:
# Split point: the row position at 80% of `data`. Rows before it are used for
# training and rows from it onward for validation further down.
sp = int(len(data) * 0.8) # Split Point

In [13]:
# Stack the `data` titles on top of the `test` titles (data rows first) so that
# both go through the same preprocessing and vectorizer.
full = pd.concat([frame[['title']] for frame in (data, test)])

In [14]:
%%time
# Normalized titles, in the row order of `full`.
X_full = normalize(full['title'])

CPU times: user 29.9 s, sys: 262 ms, total: 30.2 s
Wall time: 29.8 s


In [15]:
%%time 
# Token -> number of occurrences over all titles: split on whitespace,
# flatten the per-title tokens into one column, then count.
wordfreq = (
    X_full.str.split(expand=True)
    .stack()
    .value_counts()
    .to_dict()
)

CPU times: user 16.7 s, sys: 716 ms, total: 17.4 s
Wall time: 14.8 s


In [16]:
%%time 
# Tokens that occur exactly once across all titles.
uniquewords = {word for word, count in wordfreq.items() if count == 1}
print(len(uniquewords))

102942
CPU times: user 32.9 ms, sys: 22 µs, total: 32.9 ms
Wall time: 32.6 ms


In [17]:
# Token counts restricted to the rows after the first len(data) ones (the test titles).
testwordfreq = (
    X_full[len(data):]
    .str.split(expand=True)
    .stack()
    .value_counts()
    .to_dict()
)
# Test tokens whose count over the whole corpus is 1, i.e. their single
# occurrence is in the test titles.
testuniquewords = {word for word in testwordfreq if wordfreq[word] == 1}
print(len(testuniquewords))

3906


In [20]:
%%time
def xremo(s):
    """Join the tokens of `s` with single spaces, leaving out those found in `all_stopw`."""
    kept = [w for w in s if w not in all_stopw]
    return ' '.join(kept)


X_full = X_full.str.split().apply(xremo)

CPU times: user 4.26 s, sys: 32 ms, total: 4.29 s
Wall time: 4.29 s


In [21]:
%%time
# Delete all whitespace so that each title becomes one unbroken string; the
# character n-grams extracted further down then run across former word
# boundaries instead of being confined to single words.
X_full = X_full.str.split().str.join('')

CPU times: user 2.41 s, sys: 16 ms, total: 2.43 s
Wall time: 2.43 s


In [22]:
# X_full = '1' + X_full + '2' # actually don't need it when joining everything

In [23]:
X_full_1gram = X_full

In [33]:
%%time

# Binary presence features over character 4- and 5-grams.
covec_1gram = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(4, 5),
    binary=True,
    min_df=50,   # integer threshold: a document count
    max_df=.05,  # float threshold: a proportion of documents
)
X_covec_1gram = covec_1gram.fit_transform(X_full)
print(X_covec_1gram.shape, X_covec_1gram.count_nonzero())

(1622778, 218233) 107500515
CPU times: user 1min 39s, sys: 2.72 s, total: 1min 41s
Wall time: 1min 59s


In [34]:
# Column sums divided by the number of rows; the vectorizer is binary, so
# this is the fraction of titles that contain each n-gram.
n_docs = X_covec_1gram.shape[0]
docfreq1 = np.asarray(X_covec_1gram.sum(axis=0)).ravel() / n_docs
# Reverse lookup: column index -> n-gram text.
inv_vocab1 = dict(zip(covec_1gram.vocabulary_.values(), covec_1gram.vocabulary_.keys()))

In [35]:
# The 200 n-grams with the highest document frequency, in ascending order of frequency.
np.vectorize(inv_vocab1.get)(np.argsort(docfreq1)[-200:])

array(['mult', 'zador', 'digi', 'lectr', 'ogra', 'acor', 'ific', 'entr',
       'inho', 'rico', 'porte', 'ro00', 'rador', 'prof', 'nado', 'grat',
       'lumin', 't0000', 'trol', 'capa', 'olor', 'pare', 'eletr', 'etal',
       'elect', 'avel', 'onta', 'lant', 'er00', 'itro', 'erta', 'rote',
       'umin', '000s', 'nal ', 'reto', 'lumi', '0000c', 'cador', 'bate',
       'acion', 'ulti', 'dore', 'raca', 'icad', 'ectr', 'mesa', 'quin',
       'auto', 'aqui', 'teria', 'laca', '000x', 'cort', 'amen', 'egra',
       'ader', 'ctor', 'p000', 'tador', 'letr', 'bran', 'lanc', 'illo',
       'ande', 'otor', '000ml', 'r0000', 'enta', 'ado ', 'ampa', '00un',
       'acio', 'elec', '000l', '000b', 'last', 'ional', 'e0000', 'colo',
       'metro', 'elet', 'cont', 'unida', ' kit0', '0000m', 'mento',
       'amar', '.000', '0.000', 'ster', 'lect', 'pres', 'made', 'anca',
       'asti', 'nidad', 'para', 'acao', 'nida', 'pret', 'ital', 'n000',
       'rica', 'izado', 's000', 'm000', '0unid', 'd000', 'ran

In [36]:
np.sort(docfreq1)[-10:]

array([0.03864361, 0.04064696, 0.04068825, 0.04183813, 0.04289003,
       0.04344772, 0.04466415, 0.0464204 , 0.04944114, 0.0496408 ])

In [37]:
# Alias (not a copy) of the count matrix; the following cells work on `X_train_counts`.
X_train_counts = X_covec_1gram
print(X_train_counts.shape, X_train_counts.count_nonzero())

(1622778, 218233) 107500515


In [38]:
def sbc(x):
    """Sparse binary correlation: flag columns that duplicate an earlier column.

    For every pair of columns ``i < j`` with a non-zero entry in ``x.T @ x``,
    the ratio ``overlap / (card[i] + card[j] - overlap)`` is computed, where
    ``card`` holds the column sums. For a 0/1 matrix this is the Jaccard
    similarity of the two columns' non-zero row sets. Column ``j`` is flagged
    when the ratio equals 1 for some ``i``, i.e. both columns are non-zero on
    exactly the same rows. The first column of each such group is not flagged.

    Parameters
    ----------
    x : scipy.sparse matrix or array, shape (n_rows, n_cols)
        Intended for 0/1 data. All-zero columns never appear in a pair and are
        therefore never flagged.

    Returns
    -------
    numpy.ndarray of bool, shape (n_cols,)
        True for every column that should be removed as a duplicate.
    """
    # Use ``@`` for the matrix product: ``*`` only means matrix multiplication
    # for the legacy ``spmatrix`` classes and is element-wise for sparse arrays.
    # Keeping the strict upper triangle visits each unordered pair once (i < j).
    overlap = sparse.triu(x.T @ x, k=1, format='coo')
    card = np.asarray(x.sum(axis=0)).ravel()
    union = card[overlap.row] + card[overlap.col] - overlap.data
    jaccard = overlap.data / union

    # Mark the later column (j) of every pair whose similarity is exactly 1.
    rem = np.zeros(x.shape[1], dtype=bool)
    rem[overlap.col[jaccard == 1]] = True
    return rem

In [39]:
%%time
# Boolean mask over the feature columns as returned by sbc(); the printed
# mean is the fraction of columns that were flagged.
rem = sbc(X_train_counts)
print(rem.mean())

0.007546979604367809
CPU times: user 3min 23s, sys: 30.9 s, total: 3min 54s
Wall time: 4min 36s


In [40]:
# Drop the columns flagged in `rem`. The slice is taken from `X_covec_1gram`
# (the matrix `X_train_counts` aliased when `rem` was computed) rather than
# from `X_train_counts` itself, so re-running this cell does not apply the
# mask to an already-reduced matrix.
X_train_counts = X_covec_1gram[:, ~rem]
print(X_train_counts.shape, X_train_counts.count_nonzero())

(1622778, 216586) 107036608


In [41]:
X_go = X_train_counts

In [42]:
%%time

# idf weighting is switched off (use_idf=False), which leaves the L2 row
# normalization as the transformation applied to the counts.
tfidf_transformer = TfidfTransformer(
    norm='l2',
    use_idf=False,
    smooth_idf=True,
    sublinear_tf=False,
)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_go = X_train_tfidf
print(X_train_tfidf.shape)

(1622778, 216586)
CPU times: user 1.11 s, sys: 224 ms, total: 1.33 s
Wall time: 654 ms


In [43]:
# Rows [0, sp) are the training part and rows [sp, sp2) the validation part;
# rows of X_go from sp2 on come from the test file.
sp2 = len(data)
y_all = data['category'].values
X_train = X_go[:sp]
y_train = y_all[:sp]
X_test = X_go[sp:sp2]
y_test = y_all[sp:sp2]
X_train.shape, X_test.shape

((1100658, 216586), (275165, 216586))

In [44]:
# Inverse-frequency class weights: category -> 1 / (number of training rows with that category).
class_weights = (1 / pd.Series(y_train).value_counts()).to_dict() # switching 1 to len(y) seems to make diff

In [45]:
# Per-row weight looked up from the row's category, so the weights of each
# category's rows sum to 1.
sample_weight = np.vectorize(class_weights.get)(y_train) # * rel_train

In [46]:
# label_quality values of the rows from the split point to the end of `data`
# (the validation rows).
rel = data.label_quality.values[sp:]

In [49]:
%%time
n = sp // 1  # number of leading training rows to fit on; `// 1` keeps all of them
clf_sgd = SGDClassifier(
    loss='modified_huber',
    alpha=0.065e-8,
    max_iter=20,
    tol=1e-5,  # try 1e-6 !!
    shuffle=False,
    n_jobs=4,
)
# Options tried earlier and left disabled:
#   n_iter = 12
#   early_stopping=True, validation_fraction = .2, n_iter_no_change = 5
clf_sgd.fit(X_train[:n], y_train[:n], sample_weight=sample_weight[:n])
clf = clf_sgd

CPU times: user 2h 2min 16s, sys: 6min 2s, total: 2h 8min 19s
Wall time: 28min 51s




In [50]:
%%time

# Balanced accuracy on the whole validation slice.
prediction_val = clf.predict(X_test)
val_score = bas(y_test, prediction_val)
print('Val:', val_score)

# Same metric restricted to the validation rows whose label_quality is 0.
rel = data.label_quality.values[sp:]
quality0 = rel == 0
rel_score = bas(y_test[quality0], prediction_val[quality0])
print('Rel:', rel_score)

Val: 0.8595869215203136
Rel: 0.902481976745019
CPU times: user 20.2 s, sys: 2.54 s, total: 22.7 s
Wall time: 21.4 s




In [None]:
print('Rel:', bas(y_test[rel == 0], prediction_val[rel == 0]))

In [51]:
%%time
val_proba = clf.predict_proba(X_test)

CPU times: user 21.1 s, sys: 2.33 s, total: 23.5 s
Wall time: 23.3 s


In [52]:
val_proba = pd.DataFrame(val_proba)

In [53]:
%time val_proba.to_csv('../ensemb3/val_sgd_char-v7_45.csv', index = False, header = False)

CPU times: user 2min 47s, sys: 953 ms, total: 2min 48s
Wall time: 2min 50s


In [54]:
%%time

# Refit the same estimator object on every row of `data` (training and
# validation parts together), again with inverse-frequency sample weights.
# NOTE(review): `clf` and `clf_sgd` are the same object, so the model scored on
# the validation slice is replaced here. Whether fit() restarts from scratch
# depends on the estimator's warm_start setting — confirm.
X_data = X_go[:sp2]
y_data = data.category
class_weights_data = (1 / y_data.value_counts()).to_dict()
sample_weight_data = np.vectorize(class_weights_data.get)(y_data)
# Unused idea: rel_data = 1 + (1 - data.label_quality.values) * (relfactor - 1)
clf.fit(X_data, y_data, sample_weight=sample_weight_data)

CPU times: user 2h 6min 32s, sys: 6min 58s, total: 2h 13min 31s
Wall time: 29min 32s


In [55]:
# Predicted class probabilities for the rows of X_go after the first
# sp2 = len(data) ones, i.e. the titles that came from the test file.
test_proba = clf.predict_proba(X_go[sp2:])

In [56]:
test_proba = pd.DataFrame(test_proba)

In [57]:
%time test_proba.to_csv('../ensemb3/test_sgd_char-v7_45.csv', index = False, header = False)

CPU times: user 2min 16s, sys: 728 ms, total: 2min 17s
Wall time: 2min 19s
