In [1]:
from sys import argv, exit
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import Doc2Vec

class Classificador():

    def pre_proc(self, df):

        ## Pré processamento do dataset.
        # Definindo quais vão ser as categorias enviadas para classificação.
        categorias_finais = ['tec', 'esporte', 'ilustrada', 'mercado', 'poder', 'mundo']

        categorias_diluidas = list(set(df['category']) - set(categorias_finais))

        # Agrupando coluna de tecnologia com ciência.
        df["category"]= df["category"].replace("ciencia", "tec")

        # Criando dataset com categorias finais para teste
        docs_classif = df#.copy(deep=True)#.sample(frac=1)
        for cat in categorias_diluidas:
            docs_classif = docs_classif[docs_classif['category'] != cat]

        docs_sem_classif = df#.copy(deep=True)#.sample(frac=1)
        for cat in categorias_finais:
            docs_sem_classif = docs_sem_classif[docs_sem_classif['category'] != cat]
        
        return docs_classif, docs_sem_classif
    
    def dataset_treino(self, docs_classif, docs_sem_classif, modelo):

        vetores_treino = list()
        classes_treino = list()

        vetores_teste = list()
        classes_teste = list()

        for index, row in docs_classif.iterrows():
            try:
                vetores_treino.append(modelo['DOC_'+str(index)])
                classes_treino.append(row['category'])
            except:
                print("PADRÃO: ", index)

        for index, row in docs_sem_classif.iterrows():
            try:
                vetores_teste.append(modelo['DOC_'+str(index)])
                classes_teste.append(row['category'])
            except:
                print("GENÉRICOS: ", index)

        docs_classif["vetor"] = vetores_treino
        docs_classif["classe"] = classes_treino
        docs_sem_classif["vetor"] = vetores_teste

    def cross_valid(self, clf, vetores, classes):

        scores = cross_val_score(clf, vetores, classes,\
            cv=5, scoring='f1_macro')    
        print("F1-Measure: ", scores)

    def treinar(self, treino, alg):

        # Regressão Logistica.
        if alg == 1:
            clf = LogisticRegression(random_state=0, max_iter=150, solver='sag')#C=0.8)
        # Naive Bayes.
        elif alg == 2:
            clf = GaussianNB()
        # Rede Neural.
        elif alg == 3:
            clf = MLPClassifier(solver='sgd', alpha=1e-5,\
                hidden_layer_sizes=(100, 1), random_state=1, learning_rate= 'adaptive')
        else:
            clf = RandomForestClassifier(max_depth=2, random_state=0)
            
        self.cross_valid(clf, list(treino['vetor']), list(treino['classe']))

    def classificar(self, treino, teste, alg):
        
        # Regressão Logistica.
        if alg == 1:
            clf = LogisticRegression(random_state=0, max_iter=150, solver='sag')#, C=0.8)
        # Naive Bayes.
        elif alg == 2:
            clf = GaussianNB()
        # Rede Neural.
        elif alg == 3:
            clf = MLPClassifier(solver='sgd', alpha=1e-5,\
                hidden_layer_sizes=(100, 1), random_state=1, learning_rate= 'adaptive', max_iter=300)
        else:
            clf = RandomForestClassifier(max_depth=2, random_state=0)
        
        clf.fit(list(treino['vetor']), list(treino['classe']))

        # Classificando as novas amostras.
        classificados = clf.predict(list(teste['vetor']))
        teste["classe"] = classificados
        ndf_teste = teste.drop(columns=['vetor'])
        ndf_treino = treino.drop(columns=['vetor'])
        ndf = pd.concat([ndf_treino, ndf_teste]).sort_index()
        ndf.to_csv('../dados/classficados.csv', index=False)

    def executar(self, dataset, caminho_modelo, operacao='treinar', alg=1):

        modelo = Doc2Vec.load(caminho_modelo)
        df = pd.read_csv(dataset)
        docs_classif, docs_sem_classif = self.pre_proc(df)
        self.dataset_treino(docs_classif, docs_sem_classif, modelo)
        if operacao == 'treinar':
            self.treinar(docs_classif, alg)
        elif operacao == 'classif':
            self.classificar(docs_classif, docs_sem_classif, alg)
 
    def data_set(self, dataset, caminho_modelo):
        modelo = Doc2Vec.load(caminho_modelo)
        df = pd.read_csv(dataset)
        docs_classif, docs_sem_classif = self.pre_proc(df)
        self.dataset_treino(docs_classif, docs_sem_classif, modelo)
        return docs_classif, docs_sem_classif
        
        

In [2]:
classi = Classificador()



In [3]:
data_train, data_test = classi.data_set("../dados/articles_limpo.csv", 'modelos/doc2vec.text_0_100_10')

In [4]:
len(data_train) + len(data_test)

166288

In [5]:
data_train.head()

Unnamed: 0,title,text,date,category,link,vetor,classe
0,"Lula diz que está 'lascado', mas que ainda tem...",Com a possibilidade de uma condenação impedir ...,2017-09-10,poder,http://www1.folha.uol.com.br/poder/2017/10/192...,"[-0.68763316, 0.296071, -0.29345122, 0.1390960...",poder
1,"'Decidi ser escrava das mulheres que sofrem', ...","Para Oumou Sangaré, cantora e ativista malines...",2017-09-10,ilustrada,http://www1.folha.uol.com.br/ilustrada/2017/10...,"[-0.122184984, 0.6587932, 0.062461976, -0.0535...",ilustrada
2,Três reportagens da Folha ganham Prêmio Petrob...,Três reportagens da Folha foram vencedoras do ...,2017-09-10,poder,http://www1.folha.uol.com.br/poder/2017/10/192...,"[-0.56213987, 0.31456754, -0.15940027, 0.30708...",poder
3,Filme 'Star Wars: Os Últimos Jedi' ganha trail...,A Disney divulgou na noite desta segunda-feira...,2017-09-10,ilustrada,http://www1.folha.uol.com.br/ilustrada/2017/10...,"[-0.86896574, 0.59616095, 0.25170445, 0.112854...",ilustrada
4,CBSS inicia acordos com fintechs e quer 30% do...,"O CBSS, banco da holding Elopar dos sócios Bra...",2017-09-10,mercado,http://www1.folha.uol.com.br/mercado/2017/10/1...,"[-0.65253127, 0.46439856, 0.25892583, 0.578771...",mercado


In [6]:
#ORGANIZANDO OS TESTES 
X = data_train['vetor'][:]
X_test = data_test['vetor'][:]
Y_train = data_train['category']
Y_test = data_test['category']

In [7]:
type(X)

pandas.core.series.Series

In [8]:
counter = []
for cat in Y_train:
    if cat not in counter:
        counter.append(cat)
counter

['poder', 'ilustrada', 'mercado', 'mundo', 'esporte', 'tec']

In [9]:
# ADICIONANDO CODIFICAÇÃO AD-HOC das categorias
Y = pd.get_dummies(Y_train).values
Y_test = pd.get_dummies(Y_test).values

In [10]:
Y_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [12]:
import tensorflow as tf
# para usar redes do tensor flow é necessario usar  tipo de dados deles.
def ndarray_to_tensor(X):
    for i in range(len(X)):
        X[i] = list(X[i])
    X = list(X)
    return tf.constant(X)

In [13]:
X_tensor = ndarray_to_tensor(X.values)
Y_tensor = ndarray_to_tensor(Y)

In [14]:
print(X_tensor.shape)
print(Y_tensor.shape)

(99064, 100)
(99064, 6)


In [15]:
# contruindo rede neural MLP
import tensorflow as tf

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units= 200, activation='relu', input_shape=(X_tensor.shape[1], )))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(units= 100, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(units= Y_tensor.shape[1], activation='softmax'))

model.compile(optimizer='nadam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 200)               20200     
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 606       
Total params: 40,906
Trainable params: 40,906
Non-trainable params: 0
_________________________________________________________________


In [17]:
# gerando o crossvalidation com tensor flow.

from sklearn.model_selection import KFold
 
n_split=5
acc = []
loss = []
y_predito = []
y_teste = []
count_fold = 1
for train_index,test_index in KFold(n_split).split(X.values):
    
    print('##### folder: {}/{}'.format(count_fold, n_split) )
    
    x_train,x_test=X.values[train_index],X.values[test_index]
    y_train,y_test=Y[train_index],Y[test_index]
  
    model.fit(ndarray_to_tensor(x_train), ndarray_to_tensor(y_train), epochs = 1)

    loss_, acc_ = model.evaluate(ndarray_to_tensor(x_test),
                                             ndarray_to_tensor(y_test))
    print('###### Folder loss:', loss, "Folder acc: ", acc_ )
    y_teste.append(y_test)
    y_pred = model.predict(ndarray_to_tensor(x_test))
    y_predito.append(y_pred)
    acc.append(acc_)
    loss.append(loss_)

##### folder: 1/5
###### Folder loss: [] Folder acc:  0.9195477962493896
##### folder: 1/5
###### Folder loss: [0.22330105304718018] Folder acc:  0.9163680672645569
##### folder: 1/5
###### Folder loss: [0.22330105304718018, 0.2178974449634552] Folder acc:  0.9408974051475525
##### folder: 1/5
###### Folder loss: [0.22330105304718018, 0.2178974449634552, 0.16049307584762573] Folder acc:  0.9325695037841797
##### folder: 1/5
###### Folder loss: [0.22330105304718018, 0.2178974449634552, 0.16049307584762573, 0.16984009742736816] Folder acc:  0.9290834069252014


In [18]:
sum(acc)/5

0.9276932358741761

In [19]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

precision = []
recall = []
f1 = []
accu = []

for i in range(len(y_teste)):
    y_pred = np.argmax(y_predito[i], axis=1)
    y_test = np.argmax(y_teste[i], axis=1)

    # Print f1, precision, and recall scores
    precision.append(precision_score(y_test, y_pred , average="macro"))
    recall.append(recall_score(y_test, y_pred , average="macro"))
    f1.append(f1_score(y_test, y_pred , average="macro"))
    accu.append(accuracy_score(y_test, y_pred))

In [20]:
print("Precision: ", sum(precision)/n_split)
print("Recall: ", sum(recall)/n_split)
print("F1 Score: ", sum(f1)/n_split)
print("Accuracy: ", sum(accu)/n_split)

Precision:  0.9039302438949829
Recall:  0.9084911066667443
F1 Score:  0.9038817559752387
Accuracy:  0.9276932224638752
