# Developing text clustering algorithms
This notebook contains a k-means clustering algorithm and a Siamese Manhattan LSTM.

In [1]:
import spacy
from spacy.lang.pt.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from fastparquet import write 
import pandas as pd

## 1. Loading data

In [2]:
%%time 
RELATIVE_FOLDER_PATH = "../assets/data/"
VECTOR_MODEL_NAME = "pt_core_news_sm"
NLP_SPACY = spacy.load(VECTOR_MODEL_NAME)
filename = "data"
stopwords_set = set(STOP_WORDS)

parquet_filename = RELATIVE_FOLDER_PATH + filename + ".parquet.gzip"
ailab_df = pd.read_parquet(parquet_filename)
print(ailab_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2036 entries, 0 to 2035
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   process_class  2036 non-null   object
 1   process_id     2036 non-null   object
 2   doc_id         2036 non-null   object
 3   path_img       2036 non-null   object
 4   text           2036 non-null   object
 5   doc_type       2036 non-null   object
 6   num_pag        2036 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 111.5+ KB
None
CPU times: user 5.21 s, sys: 422 ms, total: 5.63 s
Wall time: 5.64 s


### 1.1  Sampling data
Because some text processing steps can take a long time, it is better to process only as much data as is needed.

In [3]:
# Fixed sample size and seed so that the same rows are drawn on every run.
SAMPLE_SIZE = 200
SAMPLE_SEED = 42
# Cap the request at the number of available rows: sampling without
# replacement raises when asked for more rows than the frame holds.
sample_df = ailab_df.sample(n=min(SAMPLE_SIZE, len(ailab_df)), random_state=SAMPLE_SEED)
# info() prints directly and returns None, so no print() wrapper is needed.
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 687 to 778
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   process_class  200 non-null    object
 1   process_id     200 non-null    object
 2   doc_id         200 non-null    object
 3   path_img       200 non-null    object
 4   text           200 non-null    object
 5   doc_type       200 non-null    object
 6   num_pag        200 non-null    int64 
dtypes: int64(1), object(6)
memory usage: 12.5+ KB
None


## 2. Counting and Vectorizing

In [4]:
%%time
tokenizer = NLP_SPACY.Defaults.create_tokenizer(NLP_SPACY)
raw_text = sample_df['text'].to_list()

tokenized_text = []
for row in raw_text:
    doc = tokenizer(row)
    preprocessed_doc = [token for token in doc if not token.norm_ in stopwords_set]
    tokenized_text.append(" ".join([word.text for word in preprocessed_doc]))

count_vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()

''' Encapsuling components in pipeline '''
pipeline = Pipeline([
    ('count_vectorizer', count_vectorizer),
    ('tfidf_transformer', tfidf_transformer)
])

vectorized_docs = pipeline.fit_transform(tokenized_text)

CPU times: user 5.5 s, sys: 51.5 ms, total: 5.55 s
Wall time: 5.57 s


## 3. Clustering with k-means

In [5]:
%%time
kmeans = KMeans(6)
kmeans.fit(vectorized_docs)
kmeans_df = sample_df.copy()
kmeans_df['cluster_label'] = [label for label in kmeans.labels_]
kmeans_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 687 to 778
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   process_class  200 non-null    object
 1   process_id     200 non-null    object
 2   doc_id         200 non-null    object
 3   path_img       200 non-null    object
 4   text           200 non-null    object
 5   doc_type       200 non-null    object
 6   num_pag        200 non-null    int64 
 7   cluster_label  200 non-null    int64 
dtypes: int64(2), object(6)
memory usage: 14.1+ KB
CPU times: user 16.6 s, sys: 9.91 s, total: 26.5 s
Wall time: 10.5 s


In [7]:
# Preview the first five clustered documents together with their labels.
kmeans_df.head(n=5)

Unnamed: 0,process_class,process_id,doc_id,path_img,text,doc_type,num_pag,cluster_label
687,ARE,787966,3772555,[./processos_imgs/outros_puro/ARE_787966_37725...,PÁGINA DE SEPARAÇÃO\n(Gerada automaticamente ...,outros_puro,1,1
350,ARE,811207,4428326,[./processos_imgs/outros_puro/ARE_811207_44283...,PÁGINA DE SEPARAÇÃO\n(Gerada automaticamente ...,outros_puro,1,1
28,ARE,956225,308960860,[./processos_imgs/despacho_de_admissibilidade_...,ADMINISTRAÇÃO.” 4. Agravo regimental DESPROVI...,despacho_de_admissibilidade_puro,3,2
1080,RE,763775,3210930,[./processos_imgs/peticao_do_RE_puro/RE_763775...,que constitui direito sociat ligado à cidadan...,peticao_do_RE_puro,47,2
1409,ARE,824506,4849148,[./processos_imgs/sentenca_puro/ARE_824506_484...,fis. 95\n\nESTADO DE SANTA CATARINA Justiça G...,sentenca_puro,3,0
