# Text Classification - Dataset Victor

In [8]:
# Packages imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

In [9]:
# Data load
train = pd.read_csv('./dataset/train_small.csv')
test = pd.read_csv('./dataset/test_small.csv')
validation = pd.read_csv('./dataset/validation_small.csv')

In [10]:
# Quick view at the dataset
print(train.info())
print(train['document_type'].unique())
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149217 entries, 0 to 149216
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   themes         149217 non-null  object
 1   process_id     149217 non-null  object
 2   file_name      149217 non-null  object
 3   document_type  149217 non-null  object
 4   pages          149217 non-null  int64 
 5   body           149217 non-null  object
dtypes: int64(1), object(5)
memory usage: 6.8+ MB
None
['outros' 'sentenca' 'peticao_do_RE' 'despacho_de_admissibilidade'
 'acordao_de_2_instancia' 'agravo_em_recurso_extraordinario']


Unnamed: 0,themes,process_id,file_name,document_type,pages,body
0,[232],AI_856934,AI_856934_1926210_1060_17072013.pdf,outros,1,"{""tribunal justiça estado bahia poder judiciár..."
1,[232],AI_856934,AI_856934_1926211_34_17072013.pdf,outros,1,"{""excelentíssimo senhor doutor juiz direito ju..."
2,[232],AI_856934,AI_856934_1926211_34_17072013.pdf,outros,2,"{""razões recurso inominado recorrente atlantic..."
3,[232],AI_856934,AI_856934_1926211_34_17072013.pdf,outros,3,"{""empresa recorrente tornou credora dos débito..."
4,[232],AI_856934,AI_856934_1926211_34_17072013.pdf,outros,4,"{""entretanto verdade parte apelante tornou tit..."


In [11]:
# Data partition
X_train = train['body']
Y_train = train['document_type']

X_valid = validation['body']
Y_valid = validation['document_type']

X_test = test['body']
Y_test = test['document_type']

scoring = {
    'f1_micro',
    'f1_macro',
    'f1_weighted'
}

## Machine Learning Without Preprocessing

In [5]:
# TfidfVectorizer + MultinomialNB

pipe_tfidf_nb = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

param_tfidf_nb = {
    'vect__min_df': [1, 2, 3],
    'vect__smooth_idf': [True],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]
}

grid_search_tfidf_nb = GridSearchCV(estimator=pipe_tfidf_nb, param_grid=param_tfidf_nb, n_jobs=-1, verbose=10, scoring=scoring, refit='f1_micro')
grid_search_tfidf_nb.fit(X_train, Y_train)
print("Best parameters:")
print(grid_search_tfidf_nb.best_params_)
print("Best scorers: ") # 0.913
print(grid_search_tfidf_nb.best_score_)
tfidf_naive = grid_search_tfidf_nb.best_estimator_
print('F1-Score (micro) Validation: ', f1_score(Y_valid, tfidf_naive.predict(X_valid), average='micro'))
print('F1-Score (micro) Test: ', f1_score(Y_test, tfidf_naive.predict(X_test), average='micro')) # 0.922 - antes 0.84

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters:
{'vect__min_df': 3, 'vect__ngram_range': (1, 3), 'vect__smooth_idf': True}
Best scorers: 
0.9130794220352293
F1-Score (micro) Validation:  0.9185623053781601
F1-Score (micro) Test:  0.9225132424680191


In [6]:
# TfidfVectorizer + SDGClassifier

pipe_tfidf_sdg = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', SGDClassifier())
])

param_tfidf_sdg = {
    'vect__min_df': [1, 2, 3],
    'vect__smooth_idf': [True],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]
}

scoring = {
    'f1_micro',
    'f1_macro',
    'f1_weighted'
}

grid_search_tfidf_sdg = GridSearchCV(estimator=pipe_tfidf_sdg, param_grid=param_tfidf_sdg, n_jobs=-1, verbose=10, scoring=scoring, refit='f1_micro')
grid_search_tfidf_sdg.fit(X_train, Y_train)
print("Best parameters:")
print(grid_search_tfidf_sdg.best_params_)
print("Best scorers: ")
print(grid_search_tfidf_sdg.best_score_)
tfidf_sdg = grid_search_tfidf_sdg.best_estimator_
print('F1-Score (micro) Validation: ', f1_score(Y_valid, tfidf_sdg.predict(X_valid), average='micro'))
print('F1-Score (micro) Test: ', f1_score(Y_test, tfidf_sdg.predict(X_test), average='micro')) # 0.93 - antes 0.90

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters:
{'vect__min_df': 3, 'vect__ngram_range': (1, 3), 'vect__smooth_idf': True}
Best scorers: 
0.9215904678885938
F1-Score (micro) Validation:  0.9282102707552646
F1-Score (micro) Test:  0.9310449511127861


In [7]:
# CountVectorizer + SDGClassifier

pipe_countv_sdg = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier())
])

param_countv_sdg = {
    'vect__min_df': [1, 2, 3],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vect__max_df': [0.5, 0.75, 1.0]
}

grid_search_countv_sdg = GridSearchCV(estimator=pipe_countv_sdg, param_grid=param_countv_sdg, n_jobs=-1, verbose=10, scoring=scoring, refit='f1_micro')
grid_search_countv_sdg.fit(X_train, Y_train)
print("Best parameters:")
print(grid_search_countv_sdg.best_params_)
print("Best scorers: ")
print(grid_search_countv_sdg.best_score_)
countv_sdg = grid_search_countv_sdg.best_estimator_
print('F1-Score (micro) Validation: ', f1_score(Y_valid, countv_sdg.predict(X_valid), average='micro'))
print('F1-Score (micro) Test: ', f1_score(Y_test, countv_sdg.predict(X_test), average='micro'))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters:
{'vect__max_df': 0.5, 'vect__min_df': 1, 'vect__ngram_range': (1, 3)}
Best scorers: 
0.9160213152849689
F1-Score (micro) Validation:  0.9505040375785084
F1-Score (micro) Test:  0.9531855201725185


Logo, obtemos que o modelo com maior acurácia é aquele em que se usa um pipeline de **CountVectorizer(max_df=0.5, min_df=1, ngram_range=(1,3))** e **SDGClassifier()**.

# PreProcessing

In [12]:
# Data load
train = pd.read_csv('./dataset/train_small.csv')
test = pd.read_csv('./dataset/test_small.csv')
validation = pd.read_csv('./dataset/validation_small.csv')

# Data partition
X_train = train['body']
Y_train = train['document_type']

X_valid = validation['body']
Y_valid = validation['document_type']

X_test = test['body']
Y_test = test['document_type']

scoring = {
    'f1_micro',
    'f1_macro',
    'f1_weighted'
}

In [None]:
# Applying GPAM Preprocessing to the datasets
from pre_processing import Pipeline

for text in X_train:
    Pipeline().apply(text)

for text in X_test:
    Pipeline().apply(text)
    
for text in X_valid:
    Pipeline().apply(text)

# Machine Learning + PreProcessing (GPAM)

In [15]:
# TfidfVectorizer + MultinomialNB

from sklearn.pipeline import make_pipeline, Pipeline

pipe_tfidf_nb = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

param_tfidf_nb = {
    'vect__min_df': [1, 2, 3],
    'vect__smooth_idf': [True],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]
}

grid_search_tfidf_nb = GridSearchCV(estimator=pipe_tfidf_nb, param_grid=param_tfidf_nb, n_jobs=-1, verbose=10, scoring=scoring, refit='f1_micro')
grid_search_tfidf_nb.fit(X_train, Y_train)
print("Best parameters:")
print(grid_search_tfidf_nb.best_params_)
print("Best scorers: ") # 0.913
print(grid_search_tfidf_nb.best_score_)
tfidf_naive = grid_search_tfidf_nb.best_estimator_
print('F1-Score (micro) Validation: ', f1_score(Y_valid, tfidf_naive.predict(X_valid), average='micro'))
print('F1-Score (micro) Test: ', f1_score(Y_test, tfidf_naive.predict(X_test), average='micro')) # 0.922 - antes 0.84

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters:
{'vect__min_df': 3, 'vect__ngram_range': (1, 3), 'vect__smooth_idf': True}
Best scorers: 
0.9130794220352293
F1-Score (micro) Validation:  0.9185623053781601
F1-Score (micro) Test:  0.9225132424680191


In [16]:
# TfidfVectorizer + SDGClassifier

pipe_tfidf_sdg = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', SGDClassifier())
])

param_tfidf_sdg = {
    'vect__min_df': [1, 2, 3],
    'vect__smooth_idf': [True],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]
}

scoring = {
    'f1_micro',
    'f1_macro',
    'f1_weighted'
}

grid_search_tfidf_sdg = GridSearchCV(estimator=pipe_tfidf_sdg, param_grid=param_tfidf_sdg, n_jobs=-1, verbose=10, scoring=scoring, refit='f1_micro')
grid_search_tfidf_sdg.fit(X_train, Y_train)
print("Best parameters:")
print(grid_search_tfidf_sdg.best_params_)
print("Best scorers: ")
print(grid_search_tfidf_sdg.best_score_)
tfidf_sdg = grid_search_tfidf_sdg.best_estimator_
print('F1-Score (micro) Validation: ', f1_score(Y_valid, tfidf_sdg.predict(X_valid), average='micro'))
print('F1-Score (micro) Test: ', f1_score(Y_test, tfidf_sdg.predict(X_test), average='micro')) # 0.93 - antes 0.90

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters:
{'vect__min_df': 3, 'vect__ngram_range': (1, 3), 'vect__smooth_idf': True}
Best scorers: 
0.9213827153238074
F1-Score (micro) Validation:  0.9281469361904259
F1-Score (micro) Test:  0.9310030776961247


In [17]:
# CountVectorizer + SDGClassifier

pipe_countv_sdg = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier())
])

param_countv_sdg = {
    'vect__min_df': [1, 2, 3],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vect__max_df': [0.5, 0.75, 1.0]
}

grid_search_countv_sdg = GridSearchCV(estimator=pipe_countv_sdg, param_grid=param_countv_sdg, n_jobs=-1, verbose=10, scoring=scoring, refit='f1_micro')
grid_search_countv_sdg.fit(X_train, Y_train)
print("Best parameters:")
print(grid_search_countv_sdg.best_params_)
print("Best scorers: ")
print(grid_search_countv_sdg.best_score_)
countv_sdg = grid_search_countv_sdg.best_estimator_
print('F1-Score (micro) Validation: ', f1_score(Y_valid, countv_sdg.predict(X_valid), average='micro'))
print('F1-Score (micro) Test: ', f1_score(Y_test, countv_sdg.predict(X_test), average='micro'))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters:
{'vect__max_df': 0.5, 'vect__min_df': 2, 'vect__ngram_range': (1, 2)}
Best scorers: 
0.9156125044840504
F1-Score (micro) Validation:  0.9499973610597984
F1-Score (micro) Test:  0.9527667860059041


## Deep Learning

In [5]:
# Imports and text cleaning
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
from tensorflow.keras.utils import to_categorical

import re
def clean_text(text):
    text = re.compile('[{}"]').sub('', text)
    return text

X_train.apply(clean_text)
X_valid.apply(clean_text)

tokenizer = Tokenizer(num_words=50000, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)

tokenizer.fit_on_texts(X_train.values)

tokenizer.fit_on_texts(X_valid.values)

# New X_train
X_train_tok = tokenizer.texts_to_sequences(X_train.values)
X_train_tok = pad_sequences(X_train_tok, maxlen=250)

# New X_valid
X_valid_tok = tokenizer.texts_to_sequences(X_valid.values)
X_valid_tok = pad_sequences(X_valid_tok, maxlen=250)

In [6]:
# Y separation
def to_category(tipo):
    if tipo[0] == 'outros':
        tipo[0] = 0
    elif tipo == 'sentenca':
        tipo = 1
    elif tipo == 'peticao_do_RE':
        tipo = 2
    elif tipo == 'despacho_de_admissibilidade':
        tipo = 3
    elif tipo == 'acordao_de_2_instancia':
        tipo = 4
    elif tipo == 'agravo_em_recurso_extraordinario':
        tipo = 5
    return tipo

Y_train = pd.get_dummies(Y_train).values

In [7]:
# Model creation
model = Sequential()
model.add(Embedding(50000, 100, input_length=X_train_tok.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(6, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 250, 100)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 6)                 606       
Total params: 5,081,006
Trainable params: 5,081,006
Non-trainable params: 0
_________________________________________________________________
None


In [69]:
print(X_train_tok.shape)
print(Y_train_deep.shape)

(149217, 250)
(149217, 6)


In [8]:
epochs = 3
batch_size = 64

history = model.fit(X_train_tok, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [10]:
Y_valid = pd.get_dummies(Y_valid).values
validation_values = model.evaluate(X_valid_tok,Y_valid)

# Loss and Accuracy
validation_values



[0.2241349071264267, 0.9389877319335938]