# TF-IDF Vectors

In this section we will obtain the vectors tf-idf from the most frequent words of our dataset.

In [52]:
#Import libraries
import pandas as pd
import spacy
import re
import scipy
import unidecode
import warnings
import numpy as np
import sys
import scipy.sparse
import nltk
from nltk import word_tokenize          
from nltk.stem import RSLPStemmer
from scipy.sparse import hstack,vstack
warnings.filterwarnings('ignore')


## Build a Vocabulary

We now need to create the vocabulary and start the counting process. We can use the CountVectorizer to create a vocabulary from all the text in our dataset followed by the counts of words in the vocabulary.

In [53]:
#load a set of portugues stopwords (eg, assim , como)
spacy_nlp = spacy.load('pt')
spacy_stopwords = spacy.lang.pt.STOP_WORDS

In [54]:
#We use a method to reduce a word to its root or to a stem.
class SteamTokenizer(object):
    def __init__(self):
        self.wnl = RSLPStemmer()
    def __call__(self, doc):
        return [self.wnl.stem(t) for t in word_tokenize(doc)]

In [55]:
#Load dataset
data = pd.read_csv("dataset/data_preprocessed.csv",encoding='latin')

In [56]:
#Dimension of dataset (rows,columns)
data.shape

(201182, 3)

In [57]:
#First 5 rows of our datset
data.head()

Unnamed: 0,ID_PDC,descricao,GMDN_TERMO
0,174274,HISTERECTOMIA TOTAL LAPAROSCOPICA COM ANEXECTO...,"uterine manipulator, single-use"
1,176721,LESÃO AGUDA DE LIGAMENTO COLATERAL DO JOELHO...,"bone matrix implant, synthetic"
2,181909,EMBOLIZAÃÃO DE ANEURISMA CEREBRAL POR OCLUSÃ...,vascular catheter introduction set
3,183381,PANCREATO-DUODENECTOMIA COM LINFADENECTOMIA [ ...,-
4,183381,PANCREATO-DUODENECTOMIA COM LINFADENECTOMIA [ ...,-


In [58]:
#Create a GMDN_TERMO ID as category id
data["CATEGORY_ID"] = data["GMDN_TERMO"].astype("category")
data["CATEGORY_ID"] = data["CATEGORY_ID"].cat.codes
data.head()

Unnamed: 0,ID_PDC,descricao,GMDN_TERMO,CATEGORY_ID
0,174274,HISTERECTOMIA TOTAL LAPAROSCOPICA COM ANEXECTO...,"uterine manipulator, single-use",314
1,176721,LESÃO AGUDA DE LIGAMENTO COLATERAL DO JOELHO...,"bone matrix implant, synthetic",35
2,181909,EMBOLIZAÃÃO DE ANEURISMA CEREBRAL POR OCLUSÃ...,vascular catheter introduction set,315
3,183381,PANCREATO-DUODENECTOMIA COM LINFADENECTOMIA [ ...,-,0
4,183381,PANCREATO-DUODENECTOMIA COM LINFADENECTOMIA [ ...,-,0


In [8]:
#Create a column DOCS. It represents our corpus
data["DOCS"] = data["descricao"]

In [9]:
#Convert to lowercase
data["DOCS"] = data["DOCS"].apply(lambda x : x.lower() )
#remove accents and numbers
data["DOCS"] = data["DOCS"].apply(lambda x : re.sub("[^a-zA-ZÀ-ú']+",' ',x) )
#remove portugues symbols and replace by a normal letter eg. â => a.
data["DOCS"] = data["DOCS"].apply(lambda x : unidecode.unidecode(x) )

In [10]:
#First 5 rows of our corpus
data["DOCS"].head()

0    histerectomia total laparoscopica com anexecto...
1    lesa o aguda de ligamento colateral do joelho ...
2    embolizaa a o de aneurisma cerebral por oclusa...
3    pancreato duodenectomia com linfadenectomia pa...
4    pancreato duodenectomia com linfadenectomia pa...
Name: DOCS, dtype: object

In [11]:
#Import CountVectorizer to build our vocabulary
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
#Convert to matrix
docs = data["DOCS"].tolist()

In [13]:
#create a vocabulary of words, 
#ignore words that appear in 85% of documents, 
#eliminate portugues stop words
#use SteamTokenizer function.
CV = CountVectorizer(max_df=0.85,stop_words=spacy_stopwords,lowercase=True,tokenizer=SteamTokenizer())
#Build vocabulary with our corpus
CV.fit(docs)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.85, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words={'tens', 'se', 'ambos', 'à', 'catorze', 'duas', 'após', 'onze', 'sexta', 'nós', 'segundo', 'vocês', 'quinze', 'quarta', 'então', 'tipo', 'antes', 'próxima', 'custa', 'dessa', 'entre', 'muitos', 'fazia', 'dois', 'quarto', 'logo', 'tiveste', 'noite', 'questão', 'tivestes', 'tu', 'nosso', 'v...mbora', 'da', 'dúvida', 'nova', 'nenhuma', 'verdadeira', 'cinco', 'ambas', 'forma', 'cedo', 'esses'},
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.SteamTokenizer object at 0x7fe725b56588>,
        vocabulary=None)

In [14]:
#Vocabluary
CV.vocabulary_

{'histerectom': 3682,
 'total': 7916,
 'laparoscop': 4325,
 'anexectom': 339,
 'uni': 8260,
 'bilater': 838,
 'uter': 8348,
 'manipul': 4673,
 'uterin': 8349,
 'clearview': 1467,
 'cm': 1483,
 'les': 4402,
 'o': 5407,
 'agud': 206,
 'lig': 4432,
 'colater': 1543,
 'joelh': 4197,
 'trat': 8038,
 'cirurg': 1418,
 'reconstru': 6613,
 'a': 1,
 'acl': 80,
 'titani': 7853,
 'interf': 4009,
 'nci': 5211,
 'x': 8643,
 'traumed': 8048,
 'enxert': 2634,
 'osse': 5593,
 'bloc': 958,
 'cc': 1269,
 'attrax': 626,
 'putty': 6421,
 'emboliza': 2473,
 'aneurism': 336,
 'cerebr': 1319,
 'oclus': 5442,
 'sacul': 6968,
 'vas': 8395,
 'metod': 4863,
 'interven': 4042,
 'terapeu': 7737,
 'imag': 3824,
 'introdu': 4101,
 'f': 2975,
 'ultimum': 8223,
 'pancreat': 5703,
 'duodenectom': 2326,
 'linfadenectom': 4469,
 'pancre': 5701,
 'pin': 6015,
 'sur': 7550,
 'mm': 5012,
 'trocat': 8118,
 'descarta': 2057,
 'simpatectom': 7186,
 'videotoracoscop': 8512,
 'nerv': 5266,
 'perifer': 5905,
 'dissec': 2213,
 'gan

In [15]:
#Size of vocabulary
len(CV.vocabulary_)

8742

In [16]:
#Remove words less than 3 words
vocabulary = CV.vocabulary_.copy()
for key,value in vocabulary.items():
    if len(key) <= 3 :
        del CV.vocabulary_[key]

In [17]:
len(CV.vocabulary_)

7146

In [18]:
# TF vectors
tf_vectors = CV.transform(docs)

In [19]:
# (number samples,vocabulary)
tf_vectors.shape

(201182, 7146)

## TF-IDF Vectors

We will create tf-idf vectors, we are essentially taking the sparse matrix from CountVectorizer (word_count_vector).
We use TfidfTransformer from sklearn library

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer

In [21]:
#Fit model
tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(tf_vectors)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [111]:
#Get TF-IDF vectors
tf_idf_vector = tfidf_transformer.transform(tf_vectors)

In [112]:
#Size of tf-idf vectors
tf_idf_vector.shape

(201182, 7146)

In [113]:
#Get category id array
category_vector = data["CATEGORY_ID"].astype(float).values
category_vector[0:5]

array([314.,  35., 315.,   0.,   0.])

In [114]:
#reshape
category_vector = np.reshape(category_vector,(data.shape[0],1))
category_vector[0:5]

array([[314.],
       [ 35.],
       [315.],
       [  0.],
       [  0.]])

In [115]:
#concat category_id to tf-idf vectors
tf_idf_vector = scipy.sparse.hstack([category_vector,tf_idf_vector],format="csr")

In [119]:
# first column : category_id
tf_idf_vector.shape

(201182, 7147)

In [116]:
#Export vectors to matrix dense format
scipy.sparse.save_npz("tfidf_vectors.npz", tf_idf_vector)