In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt



import gc
gc.enable()

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score as bas
from sklearn.ensemble import BaggingClassifier

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import NearestCentroid
from sklearn.preprocessing import StandardScaler, FunctionTransformer, MinMaxScaler, MaxAbsScaler

from sklearn.ensemble import RandomForestClassifier
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

import unicodedata
import re

<hr>

In [2]:
# Earlier input file, kept for reference:
# data = pd.read_csv('../data-simplified-1-reduced-wordbal-800.csv')
# Load the labelled titles; the first CSV column becomes the DataFrame index.
data = pd.read_csv('../data-reduced-800-v3-shuffled.csv', index_col = 0)

  mask |= (ar1 == a)


In [3]:
# Load the test set; only its `title` column is used in the cells shown here.
test = pd.read_csv('../test.csv')

In [4]:
# Read the header-less catcode file into a single 'category' column and turn
# that column into a dict keyed by the resulting row index.
catcode = (
    pd.read_csv('../data-simplified-1-catcode.csv', header=None, names=['category'])
    ['category']
    .to_dict()
)

In [5]:
# Truncate long text columns (the titles) to 60 characters when frames are displayed.
pd.set_option('display.max_colwidth', 60)

In [6]:
# Peek at the first rows to check which columns were loaded.
data.head()

Unnamed: 0,title,label_quality,language,category,priorities
473424,Kit 04 Jogo De Lençol De Berço Em Malha 3 Pcs 100% Algodão,0,1,114,6245
7519083,Bomba Submersa 450 W Agua Suja Turva Limpa Bsv 450 Vonder,1,1,1360,4
19488607,Nadador Tiburon Ys1378-5,1,0,1155,54
16895633,Máscara Angry Birds 6un Imbatível,0,1,1102,486
10369454,Aparador Fruteira Madeira De Demolição 1 Gaveta Peroba Rosa,0,1,1288,1075


<hr>

In [9]:
def normalize(curr):
    """Normalise a Series of titles into lower-case ASCII text.

    Steps, in order: strip accents, lower-case, blank out every character that
    is not a letter, digit, '.' or ',', keep a single '.' only between two
    digits, replace every digit with '0', drop a trailing 's' that follows a
    vowel/'w'/'y', collapse a letter repeated four or more times, and pad runs
    of four or more letters with spaces.

    Parameters
    ----------
    curr : pandas.Series
        String-valued Series.

    Returns
    -------
    pandas.Series
        The transformed strings, aligned with ``curr``.
    """
    def sub(series, pattern, repl):
        # Every pattern below is a regular expression. pandas >= 2.0 treats the
        # pattern as a literal string unless regex=True is given, which would
        # silently turn all of these steps into no-ops, so it is spelled out.
        return series.str.replace(pattern, repl, regex=True)

    # remove accents: decompose, then drop everything that is not ASCII
    curr = curr.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    # to lower case
    curr = curr.str.lower()
    # replace anything that is not alphanumeric, '.' or ',' with a space
    curr = sub(curr, r'[^a-zA-Z0-9.,]', ' ')

    # let ',' and '.' be the same char
    curr = sub(curr, r'[.]', ',')

    # a run of separators between two digits becomes one '.'; all others are dropped
    curr = sub(curr, r'(?<=[0-9])[,]+(?=[0-9])', '.')
    curr = sub(curr, r'[,]', ' ')

    # set all digits to 0
    curr = sub(curr, r'[0-9]', '0')

    # separate ' <digits><letters> ' like in 22g or 12ms (disabled)
    # curr = sub(curr, r'(^| )([0-9]+)([a-zA-Z]+)($| )', r'\1\2 \3\4')

    # remove some Pt plurals: a word-final 's' preceded by a vowel, 'w' or 'y'
    curr = sub(curr, r'\b([a-zA-Z]+[aeiouwy])(s)\b', r'\1')

    # collapse 4+ consecutive identical letters into one
    # ({3,} counts the repeats after the first captured letter)
    curr = sub(curr, r'([a-zA-Z])\1{3,}', r'\1')

    # surround every run of 4 or more letters (identical or not) with spaces
    curr = sub(curr, r'([a-zA-Z]{4,})', r' \1 ')

    return curr

In [10]:
# Fetch the NLTK stop-word lists that the next cell reads through
# `stopwords.words(...)`; the captured log shows they were already cached.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /store/tveiga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def norm(w):
    """Strip accents from `w`: NFKD-decompose it and drop every non-ASCII character."""
    return unicodedata.normalize('NFKD', w).encode('ASCII', 'ignore').decode('ASCII')


# Union of the accent-stripped English, Portuguese and Spanish stop-word lists.
all_stopw = set()
for corpus in ('english', 'portuguese', 'spanish'):
    all_stopw |= {norm(word) for word in stopwords.words(corpus)}

In [12]:
# Split point: the first 80% of the rows of `data` (truncated to an int).
sp = int(0.8 * len(data))

In [13]:
# Stack the `title` columns of the labelled set and the test set (labelled
# rows first) so that both go through the same text pipeline.
full = pd.concat([frame[['title']] for frame in (data, test)])

In [14]:
%%time
# Normalised titles for every labelled and test row.
X_full = normalize(full['title'])

CPU times: user 29.6 s, sys: 324 ms, total: 29.9 s
Wall time: 29.8 s


In [15]:
%%time 
# Token -> number of occurrences over all titles (labelled + test).
# NOTE(review): expand=True materialises one column per token position before
# stacking; worth checking whether this is a memory concern at this size.
wordfreq = (
    X_full.str.split(expand=True)
    .stack()
    .value_counts()
    .to_dict()
)

CPU times: user 16.6 s, sys: 704 ms, total: 17.3 s
Wall time: 14.7 s


In [16]:
%%time 
# Tokens that occur exactly once in the whole corpus.
uniquewords = {word for word, count in wordfreq.items() if count == 1}
print(len(uniquewords))

102942
CPU times: user 34.7 ms, sys: 0 ns, total: 34.7 ms
Wall time: 34.5 ms


In [17]:
# Token counts restricted to the test rows (everything after the labelled rows).
testwordfreq = (
    X_full.iloc[len(data):]
    .str.split(expand=True)
    .stack()
    .value_counts()
    .to_dict()
)
# Test tokens whose corpus-wide count is 1, i.e. words seen only once and only in test.
testuniquewords = {word for word in testwordfreq if wordfreq[word] == 1}
print(len(testuniquewords))

3906


In [18]:
def unkn(x):
    """Placeholder for a rare token: 'U0' if it contains a '0', otherwise 'UA'."""
    # ('UFT' if x in ftwords else 'UA')
    if '0' in x:
        return 'U0'
    return 'UA'


def xjoin(s):
    """Re-join the tokens in `s` with spaces, swapping every token found in
    `uniquewords` for its `unkn` placeholder."""
    return ' '.join([unkn(w) if w in uniquewords else w for w in s])

In [19]:
%%time
# Replace once-only tokens by their 'U0'/'UA' placeholders (see xjoin).
# Note: X_full is rebound in place, so re-running this cell re-processes
# the already processed text.
X_full = X_full.str.split().apply(xjoin)

CPU times: user 4.2 s, sys: 72 ms, total: 4.27 s
Wall time: 4.27 s


In [20]:
%%time
def xremo(s):
    """Re-join the tokens in `s` with spaces, leaving out those found in `all_stopw`."""
    kept = [w for w in s if w not in all_stopw]
    return ' '.join(kept)


# Drop stop words from every title.
X_full = X_full.str.split().apply(xremo)

CPU times: user 4.28 s, sys: 52 ms, total: 4.33 s
Wall time: 4.33 s


In [21]:
%%time
# go full crazy!
# Glue all tokens of a title together with no separator, so each title becomes
# one whitespace-free string before the character n-gram vectorizer runs.
X_full = X_full.str.split().str.join('')

CPU times: user 2.48 s, sys: 31.9 ms, total: 2.51 s
Wall time: 2.51 s


In [22]:
# X_full = '1' + X_full + '2'  # actually not needed when everything is joined into one string

In [26]:
# Second name for the same Series object (no copy is made).
X_full_1gram = X_full

In [27]:
%%time

# Binary presence features over character 3- and 4-grams ('char_wb' analyzer).
# N-grams seen in fewer than 2 titles or in more than 95% of them are dropped.
# Despite the "1gram" in the names, the features are 3/4-character n-grams.
covec_1gram = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 4),
    binary=True,
    min_df=2,
    max_df=.95,
)
X_covec_1gram = covec_1gram.fit_transform(X_full)
print(X_covec_1gram.shape, X_covec_1gram.count_nonzero())

(1622778, 275175) 120623543
CPU times: user 1min 1s, sys: 1.56 s, total: 1min 2s
Wall time: 1min 2s


In [28]:
# Share of titles containing each n-gram (column sums of the binary matrix
# divided by the number of rows).
n_titles = X_covec_1gram.shape[0]
docfreq1 = np.asarray(X_covec_1gram.sum(axis=0)).ravel() / n_titles
# Column index -> n-gram, the inverse of the fitted vocabulary.
inv_vocab1 = {index: term for term, index in covec_1gram.vocabulary_.items()}

In [29]:
# The 200 n-grams with the highest document frequency, in ascending order.
np.vectorize(inv_vocab1.get)(np.argsort(docfreq1)[-200:])

array(['cab', '0mm', 'gin', 'elo', 'cap', 'ras', '00g', 'der', 'inh',
       'lado', 'ente', 'abl', 'til', 'cion', 'nda', 'ral', 'las', 'men',
       'rom', 'erm', 'uni', 'adi', 'ste', 'inal', 'sti', 'rio', 'lac',
       'qui', 'l00', 'rin', 'tad', 'ula', 'ase', 'ole', 'tac', 'ato',
       'ric', '0x00', 'cal', 'ari', 'rol', 'rig', 'can', '0cm', 'cao',
       'rte', 'rra', 'ram', 'oco', 'dora', 'ini', 'eco', '00x0', 'ino',
       'ont', 'ame', 'ata', 'ano', 'res', '0.00', '.00', 'lar', 'ver',
       't00', 'ave', 'ura', 'lla', 'ble', 'art', 'ante', 'cio', 'lin',
       'eto', 'ast', 'orta', 'nti', '00v', 'ill', 'dad', 'ria', 'o000',
       'ana', 'bra', 'cha', 'cam', 'and', 'nto', 'tec', 'oto', 'etr',
       'ale', 'r00', 'cad', ' kit', 'igi', 'que', 'ote', 'man', 'int',
       ' ki', 'ero', '00x', 'lan', 'arr', 'eri', ' co', 'ela', 'tri',
       'mar', 'oma', 'col', 'lad', '00p', '000m', 'ret', 'pla', 'eira',
       'rac', 'rat', 'ali', 'ete', 'eca', 'one', 'ola', 'apa', 'par',
      

In [30]:
# The ten largest document frequencies, in ascending order.
np.sort(docfreq1)[-10:]

array([0.09971851, 0.09978383, 0.10190303, 0.1046206 , 0.12050015,
       0.13252645, 0.14365797, 0.19747556, 0.21753622, 0.3871084 ])

In [31]:
# Second name for the binary n-gram matrix (no copy); print its shape and the
# number of non-zero entries.
X_train_counts = X_covec_1gram
print(X_train_counts.shape, X_train_counts.count_nonzero())

(1622778, 275175) 120623543


In [32]:
def sbc(x):
    """Sparse binary correlation: flag columns that duplicate an earlier column.

    For every column pair i < j that shares at least one row, the Jaccard
    similarity |i & j| / |i | j| is checked; column j is flagged when it
    equals 1, i.e. when both columns have exactly the same non-zero rows.
    The first column of each group of identical columns is never flagged.
    All-zero columns share no row with any column and are never flagged.

    Parameters
    ----------
    x : scipy.sparse matrix, shape (n_rows, n_cols)
        Expected to hold 0/1 values; the column sums are used as set sizes.

    Returns
    -------
    numpy.ndarray of bool, shape (n_cols,)
        True for every column to remove.
    """
    # Co-occurrence counts for each pair i < j (strict upper triangle).
    # '@' is a matrix product for both scipy sparse matrices and sparse arrays,
    # whereas '*' is element-wise for sparse arrays.
    cx = sparse.triu(x.T @ x, k=1, format='coo')
    # Number of non-zero rows per column.
    card = np.asarray(x.sum(axis=0)).ravel()
    # Size of the union of the two columns of each stored pair.
    union = card[cx.row] + card[cx.col] - cx.data
    # intersection / union == 1, tested without a floating-point division.
    same = (union != 0) & (cx.data == union)
    rem = np.zeros(x.shape[1], dtype=bool)
    rem[cx.col[same]] = True
    return rem

In [33]:
%%time
# Boolean mask over the columns returned by sbc; print the share of flagged columns.
rem = sbc(X_train_counts)
print(rem.mean())

0.014910511492686473
CPU times: user 2min 27s, sys: 16.4 s, total: 2min 43s
Wall time: 2min 35s


In [34]:
# Keep only the columns that sbc did not flag.
# Note: X_train_counts is rebound, so re-running this cell after the mask was
# computed on the wider matrix would no longer line up.
X_train_counts = X_train_counts[:, ~rem]
print(X_train_counts.shape, X_train_counts.count_nonzero())

(1622778, 271072) 120283593


In [35]:
# X_go is the feature matrix that gets split and fed to the model; here it is
# the de-duplicated binary matrix (the transformer cell rebinds it afterwards).
X_go = X_train_counts

In [36]:
%%time

# use_idf=False: the transformer is configured without IDF weighting, so the
# visible effect requested here is the 'l2' row normalisation.
tfidf_transformer = TfidfTransformer(
    norm='l2',
    use_idf=False,
    smooth_idf=True,
    sublinear_tf=False,
)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)
# From here on the model consumes the normalised matrix.
X_go = X_train_tfidf

(1622778, 271072)
CPU times: user 1.15 s, sys: 300 ms, total: 1.45 s
Wall time: 672 ms


In [37]:
# Rows [0, sp) of the labelled data train the model; rows [sp, sp2) are held
# out for validation (named *_test here). Rows from sp2 on are the test titles.
sp2 = len(data)
X_train, X_test = X_go[:sp], X_go[sp:sp2]
y_train, y_test = data.category.values[:sp], data.category.values[sp:sp2]
X_train.shape, X_test.shape

((1100658, 271072), (275165, 271072))

In [38]:
# Category -> 1 / (number of training rows with that category).
class_weights = (1 / pd.Series(y_train).value_counts()).to_dict() # switching 1 to len(y) seems to make diff

In [39]:
# Per-row weight: look up the class weight of each training label.
sample_weight = np.vectorize(class_weights.get)(y_train) # * rel_train

In [40]:
# label_quality of the held-out rows (data[sp:]); the evaluation cell below
# recomputes the same array before using it.
rel = data.label_quality.values[sp:]

In [43]:
%%time
# Number of training rows to use; `sp // 1` means all of them (raise the
# divisor to train on a prefix only).
n = sp // 1
# Options tried earlier and left disabled:
#   n_iter = 12
#   tol: try 1e-6 !!
#   early_stopping=True, validation_fraction = .2, n_iter_no_change = 5
clf_sgd = SGDClassifier(
    loss='modified_huber',
    max_iter=20,
    tol=1e-5,
    alpha=0.065e-8,
    shuffle=False,
    n_jobs=4,
)
# fit() returns the estimator itself, so fitting in a separate statement
# leaves clf_sgd bound to the same fitted object.
clf_sgd.fit(X_train[:n], y_train[:n], sample_weight=sample_weight[:n])
clf = clf_sgd

CPU times: user 1h 46min 3s, sys: 6min 51s, total: 1h 52min 55s
Wall time: 24min 31s




In [44]:
%%time

# label_quality of the held-out rows, used to score the subset where it is 0
# (presumably the reliably labelled rows - confirm against the data description).
rel = data.label_quality.values[sp:]

# Balanced accuracy on the whole held-out fold, then on the label_quality == 0 rows.
prediction_val = clf.predict(X_test)
print('Val:', bas(y_test, prediction_val))
print('Rel:', bas(y_test[rel == 0], prediction_val[rel == 0]))

Val: 0.8576398827488646
Rel: 0.9016474912129034
CPU times: user 20.2 s, sys: 2.44 s, total: 22.7 s
Wall time: 21.8 s




In [None]:
# Same 'Rel' score as printed by the previous cell; this cell has no execution count.
print('Rel:', bas(y_test[rel == 0], prediction_val[rel == 0]))

In [45]:
%%time
# Per-class scores of the classifier for the held-out rows.
val_proba = clf.predict_proba(X_test)

CPU times: user 21.5 s, sys: 2.45 s, total: 24 s
Wall time: 23.6 s


In [46]:
# Wrap the score array in a DataFrame so it can be written with to_csv.
val_proba = pd.DataFrame(val_proba)

In [47]:
# Save the held-out scores without index or header row.
# NOTE(review): the file carries no class labels, so the column order has to
# be recovered from the classifier (presumably clf.classes_ - confirm).
%time val_proba.to_csv('../ensemb3/val_sgd_char-v7.csv', index = False, header = False)

CPU times: user 2min 30s, sys: 812 ms, total: 2min 31s
Wall time: 2min 32s


In [48]:
%%time

# Refit on every labelled row (training + held-out) before scoring the test set.
# `clf` is the same object as `clf_sgd`, so this replaces the validated model.
X_data, y_data = X_go[:sp2], data.category
# Same weighting as for the training fold: 1 / class count, looked up per row.
class_weights_data = (1 / pd.Series(y_data).value_counts()).to_dict()
sample_weight_data = np.vectorize(class_weights_data.get)(y_data)
# rel_data =  1 + (1 - data.label_quality.values) * (relfactor - 1)
clf.fit(X_data, y_data, sample_weight=sample_weight_data)  # warm start ?

CPU times: user 1h 58min 52s, sys: 6min 54s, total: 2h 5min 46s
Wall time: 27min 35s


In [49]:
# Per-class scores for the test titles, i.e. the rows of X_go after the first
# sp2 (= len(data)) labelled rows.
test_proba = clf.predict_proba(X_go[sp2:])

In [50]:
# Wrap the score array in a DataFrame so it can be written with to_csv.
test_proba = pd.DataFrame(test_proba)

In [51]:
# Save the test scores without index or header row, in the same layout as the
# held-out scores file.
%time test_proba.to_csv('../ensemb3/test_sgd_char-v7.csv', index = False, header = False)

CPU times: user 2min 16s, sys: 708 ms, total: 2min 17s
Wall time: 2min 19s
