# TF-IDF Vectors

In this section we will obtain the vectors tf-idf from the most frequent words of our dataset.

In [11]:
#Import libraries
import pandas as pd
import spacy
import re
import scipy
import unidecode
import warnings
import numpy as np
import sys
import scipy.sparse
import nltk
from nltk import word_tokenize          
from nltk.stem import RSLPStemmer
warnings.filterwarnings('ignore')


## Build a Vocabulary

We now need to create the vocabulary and start the counting process. We can use the CountVectorizer to create a vocabulary from all the text in our dataset followed by the counts of words in the vocabulary.

In [17]:
#load a set of portugues stopwords (eg, assim , como)
spacy_nlp = spacy.load('pt')
spacy_stopwords = spacy.lang.pt.STOP_WORDS

In [18]:
#We use a method to reduce a word to its root or to a stem.
class SteamTokenizer(object):
    def __init__(self):
        self.wnl = RSLPStemmer()
    def __call__(self, doc):
        return [self.wnl.stem(t) for t in word_tokenize(doc)]

In [19]:
#Load dataset
data = pd.read_csv("raw_products.csv",encoding='latin')

In [21]:
#Dimension of dataset (rows,columns)
data.shape

(674719, 10)

In [22]:
#First 5 rows of our datset
data.head()

Unnamed: 0,ID_PDC,ESPECIALIDADE,PROCEDIMENTO_PRINCIPAL,DESCRICAO_DO_PRODUTO,ANVISA,MARCA,REFERENCIA,PRECO,ANVISA_DEF,GMDN_TERMO
0,174274,SISTEMA GENITAL E REPRODUTOR FEMININO,HISTERECTOMIA TOTAL LAPAROSCOPICA COM ANEXECTO...,MANIPULADOR UTERINO CLEARVIEW 7CM,80517980053,CLINICAL INNOVATIONS,UM700,9625.0,-,"uterine manipulator, single-use"
1,176721,SISTEMA MUSCULO-ESQUELETICO E ARTICULAÇÕES,LESÃO AGUDA DE LIGAMENTO COLATERAL DO JOELHO ...,ENXERTO OSSEO EM BLOCO 6 CC 2 X 3 CC ATTRAX PUTTY,80074640024,NUVASIVE,5018006,6875.0,attrax putty nuvasive nuvasive inc,"bone matrix implant, synthetic"
2,181909,METODOS DIAGNOSTICOS POR IMAGEM,EMBOLIZAÇÃO DE ANEURISMA CEREBRAL POR OCLUSÃO ...,INTRODUTOR 6F 12CM ULTIMUM,10332340107,ST JUDE,407694,1375.0,-,vascular catheter introduction set
3,183381,SISTEMA DIGESTIVO E ANEXOS,PANCREATO-DUODENECTOMIA COM LINFADENECTOMIA [ ...,PINÇA LIGA SURE 5MM,10349000188,VALLEYLAB,LS1500,33687.5,-,-
4,183381,SISTEMA DIGESTIVO E ANEXOS,PANCREATO-DUODENECTOMIA COM LINFADENECTOMIA [ ...,TROCATER DESCARTAVEL 12.5X100MM,80082910071,HANGZHOU KANGJI MEDI,101Y524,2062.5,trocarter descartavel hangzhou kangji medical ...,-


In [24]:
#Create a column DOCS. It represents our corpus
data["DOCS"] = data["PROCEDIMENTO_PRINCIPAL"] + " " + data["DESCRICAO_DO_PRODUTO"]

In [27]:
#Convert to lowercase
data["DOCS"] = data["DOCS"].apply(lambda x : x.lower() )
#remove accents and numbers
data["DOCS"] = data["DOCS"].apply(lambda x : re.sub("[^a-zA-ZÀ-ú']+",' ',x) )
#remove portugues symbols and replace by a normal letter eg. â => a.
data["DOCS"] = data["DOCS"].apply(lambda x : unidecode.unidecode(x) )

In [28]:
#First 5 rows of our corpus
data["DOCS"].head()

0    histerectomia total laparoscopica com anexecto...
1    lesao aguda de ligamento colateral do joelho t...
2    embolizacao de aneurisma cerebral por oclusao ...
3    pancreato duodenectomia com linfadenectomia pa...
4    pancreato duodenectomia com linfadenectomia pa...
Name: DOCS, dtype: object

In [29]:
#Import CountVectorizer to build our vocabulary
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
#Convert to matrix
docs = data["DOCS"].tolist()

In [44]:
#create a vocabulary of words, 
#ignore words that appear in 85% of documents, 
#eliminate portugues stop words
#use SteamTokenizer function.
CV = CountVectorizer(max_df=0.85,stop_words=spacy_stopwords,lowercase=True,tokenizer=SteamTokenizer())
#Build vocabulary with our corpus
CV.fit(docs)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.85, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words={'apoia', 'dar', 'desligada', 'todo', 'povo', 'maiorias', 'tentar', 'tens', 'onze', 'enquanto', 'partir', 'quê', 'vão', 'às', 'estar', 'as', 'ligado', 'assim', 'coisa', 'seria', 'valor', 'uma', 'muito', 'após', 'dos', 'neste', 'quer', 'sétimo', 'vos', 'números', 'tuas', 'que', 'oitavo', '...'depois', 'vários', 'tu', 'és', 'nove', 'usa', 'se', 'dá', 'pegar', 'estivemos', 'boa', 'eu', 'põe'},
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.SteamTokenizer object at 0x7f407c804438>,
        vocabulary=None)

In [46]:
#Vocabluary
CV.vocabulary_

{'histerectom': 3762,
 'total': 8081,
 'laparoscop': 4427,
 'anexectom': 357,
 'uni': 8434,
 'bilater': 866,
 'uter': 8524,
 'manipul': 4771,
 'uterin': 8525,
 'clearview': 1504,
 'cm': 1521,
 'lesa': 4508,
 'agud': 219,
 'lig': 4537,
 'colater': 1578,
 'joelh': 4294,
 'trat': 8206,
 'cirurg': 1455,
 'reconstruca': 6729,
 'acl': 86,
 'titani': 8014,
 'interferenc': 4110,
 'x': 8839,
 'traumed': 8216,
 'enxert': 2690,
 'osse': 5690,
 'bloc': 990,
 'cc': 1310,
 'attrax': 646,
 'putty': 6535,
 'embolizaca': 2525,
 'aneurism': 354,
 'cerebr': 1363,
 'oclusa': 5535,
 'sacul': 7084,
 'vas': 8571,
 'metod': 4970,
 'interven': 4137,
 'terapeu': 7892,
 'imag': 3908,
 'introdu': 4194,
 'f': 3038,
 'ultimum': 8395,
 'pancreat': 5802,
 'duodenectom': 2373,
 'linfadenectom': 4573,
 'pancre': 5800,
 'pinc': 6121,
 'sur': 7695,
 'mm': 5122,
 'trocat': 8287,
 'descarta': 2106,
 'simpatectom': 7315,
 'videotoracoscop': 8701,
 'nerv': 5364,
 'perifer': 6008,
 'dissec': 2258,
 'ganch': 3365,
 'angioplast

In [47]:
#Size of vocabulary
len(CV.vocabulary_)

8945

In [52]:
#Remove words less than 3 words
vocabulary = CV.vocabulary_.copy()
for key,value in vocabulary.items():
    if len(key) <= 3 :
        del CV.vocabulary_[key]

In [53]:
len(CV.vocabulary_)

7370

In [50]:
# TF vectors
tf_vectors = CV.transform(docs)

In [55]:
# (number samples,vocabulary)
tf_vectors.shape

(674719, 7370)

## TF-IDF Vectors

We will create tf-idf vectors, we are essentially taking the sparse matrix from CountVectorizer (word_count_vector).
We use TfidfTransformer from sklearn library

In [56]:
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
#Fit model
tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(tf_vectors)

NameError: name 'TfidfTransformer' is not defined

In [1]:
#Get TF-IDF vectors
tf_idf_vector = tfidf_transformer.transform(tf_vectors)

NameError: name 'tfidf_transformer' is not defined

In [3]:
#Size of tf-idf vectors
tf_idf_vector.shape

NameError: name 'tf_idf_vector' is not defined

In [4]:
#Export vectors to matrix dense format
scipy.sparse.save_npz("tfidf_vectors.npz", tf_idf_vector)

NameError: name 'scipy' is not defined