In [6]:
import utils
import pandas as pd
from time import time
from collections import defaultdict
import spacy
import gensim.models.keyedvectors as word2vec
import numpy as np


In [23]:
#initial params
model_path = './word_emb/model_100.bin'
vec_dim = 100
vectors_name_to_save = 'w2v_vectors'
dataset_file = './dataset/data_preprocessed.csv'
csv_column_name = 'descricao'

In [10]:
#read the csv
df = pd.read_csv(dataset_file, encoding = 'latin')
display(df.shape)
df.head()

(201182, 3)

Unnamed: 0,ID_PDC,descricao,GMDN_TERMO
0,174274,HISTERECTOMIA TOTAL LAPAROSCOPICA COM ANEXECTO...,"uterine manipulator, single-use"
1,176721,LESÃO AGUDA DE LIGAMENTO COLATERAL DO JOELHO...,"bone matrix implant, synthetic"
2,181909,EMBOLIZAÃÃO DE ANEURISMA CEREBRAL POR OCLUSÃ...,vascular catheter introduction set
3,183381,PANCREATO-DUODENECTOMIA COM LINFADENECTOMIA [ ...,-
4,183381,PANCREATO-DUODENECTOMIA COM LINFADENECTOMIA [ ...,-


In [11]:
#set spacy model and add stop words
nlp = spacy.load('pt', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed
customize_stop_words = ['cm', 'i', 'm', 'x', 'ml', 'mm', 'mmx', 'g', 'u','c', 'cc','l']
for w in customize_stop_words:
    nlp.vocab[w].is_stop = True

In [16]:
#preprocess and clean text in column target
t = time()
cleaned = utils.get_clean_list_docs(nlp, df, csv_column_name)
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 4.77 mins


In [17]:
cleaned = [' '.join(e) for e in cleaned]
df['preprocess'] = cleaned

In [24]:
#load word2vec model trained and generate a vector for each row in csv
embed_map = word2vec.KeyedVectors.load_word2vec_format(model_path, binary=True, unicode_errors='ignore')
vectors_des = []
t = time()
for i in range(df.shape[0]):
    vector_des = utils.doc_to_sum_vector(df.loc[i]['preprocess'], embed_map, vec_dim)
    vectors_des.append(vector_des)

vectors_des = np.asarray(vectors_des)

print('Time to create vectors: {} mins'.format(round((time() - t) / 60, 2)))

#save the vectors in file
np.save(vectors_name_to_save, vectors_des)

Time to create vectors: 0.52 mins
