# TFIDF

In [61]:
import re
import pandas as pd
import numpy as np

corpus = {'D1': 'in the new york times in',
          'D2': 'the new york post',
          'D3': 'the los angeles times'}

corpus = pd.DataFrame.from_dict(corpus, orient="index", columns=["texto"])
corpus["d"] = corpus.texto.apply(lambda x: len(x.split()))
corpus

Unnamed: 0,texto,d
D1,in the new york times in,6
D2,the new york post,4
D3,the los angeles times,4


In [62]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
bow = cv.fit_transform(corpus.texto.values)

tf = pd.DataFrame(bow.toarray())
tf.columns = cv.get_feature_names()
tf.index = corpus.index
tf = tf.T
tf

Unnamed: 0,D1,D2,D3
angeles,0,0,1
in,2,0,0
los,0,0,1
new,1,1,0
post,0,1,0
the,1,1,1
times,1,0,1
york,1,1,0


In [63]:
tf = tf.div(corpus['d'], axis=1).round(3)
tf

Unnamed: 0,D1,D2,D3
angeles,0.0,0.0,0.25
in,0.333,0.0,0.0
los,0.0,0.0,0.25
new,0.167,0.25,0.0
post,0.0,0.25,0.0
the,0.167,0.25,0.25
times,0.167,0.0,0.25
york,0.167,0.25,0.0


In [64]:
# En cuántos documentos aparece cada una de las palabras, dividido por la cantidad de documentos
df = {}

for palabra in cv.get_feature_names():
    suma = corpus['texto'].apply(lambda val: palabra in val).sum()
    df[palabra] = suma

df = pd.DataFrame.from_dict(df, orient="index", columns=['doc_count'])

N = corpus.shape[0]

df['df'] = df['doc_count']/N
df['idf'] = 1/df['df']
df['log_idf'] = np.log10(df['idf'])


tfidf = df.join(tf)
tfidf


Unnamed: 0,doc_count,df,idf,log_idf,D1,D2,D3
angeles,1,0.333333,3.0,0.477121,0.0,0.0,0.25
in,1,0.333333,3.0,0.477121,0.333,0.0,0.0
los,1,0.333333,3.0,0.477121,0.0,0.0,0.25
new,2,0.666667,1.5,0.176091,0.167,0.25,0.0
post,1,0.333333,3.0,0.477121,0.0,0.25,0.0
the,3,1.0,1.0,0.0,0.167,0.25,0.25
times,2,0.666667,1.5,0.176091,0.167,0.0,0.25
york,2,0.666667,1.5,0.176091,0.167,0.25,0.0


In [75]:
tfidf["tfidf_d1"] = tfidf['D1'] * tfidf['log_idf']
tfidf["tfidf_d2"] = tfidf['D2'] * tfidf['log_idf']
tfidf["tfidf_d3"] = tfidf['D3'] * tfidf['log_idf']

tfidf[['tfidf_d1', 'tfidf_d2', 'tfidf_d3']]

Unnamed: 0,tfidf_d1,tfidf_d2,tfidf_d3
angeles,0.0,0.0,0.11928
in,0.158881,0.0,0.0
los,0.0,0.0,0.11928
new,0.029407,0.044023,0.0
post,0.0,0.11928,0.0
the,0.0,0.0,0.0
times,0.029407,0.0,0.044023
york,0.029407,0.044023,0.0


In [77]:
# La forma de hacerlo
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(corpus['texto'].values)

tfidf_matrix = pd.DataFrame(tfidf.toarray(), columns=tfidf_vect.get_feature_names())
tfidf_matrix.index = corpus.index

tfidf_matrix.T.round(3)

Unnamed: 0,D1,D2,D3
angeles,0.0,0.0,0.584
in,0.811,0.0,0.0
los,0.0,0.0,0.584
new,0.308,0.48,0.0
post,0.0,0.632,0.0
the,0.239,0.373,0.345
times,0.308,0.0,0.445
york,0.308,0.48,0.0


# Medidas de Similitud

In [78]:
####### Segunda parte de la clase

n1 = "La compañía Boring de Elon Musk construirá una conexión de alta velocidad en el aeropuerto de Chicago"
n2 = "La compañía Boring de Elon Musk construirá un enlace de alta velocidad al aeropuerto de Chicago"
n3 = "La empresa Boring de Elon Musk aprobó la construcción del tránsito de alta velocidad entre el centro de Chicago y el aeropuerto O'Hare."
n4 = "Tanto la manzana como la naranja son frutas"

corpus = {'n1': n1,
          'n2': n2,
          'n3': n3,
          'n4': n4}

corpus = pd.DataFrame.from_dict(corpus, orient='index', columns=['texto'])

corpus

Unnamed: 0,texto
n1,La compañía Boring de Elon Musk construirá una...
n2,La compañía Boring de Elon Musk construirá un ...
n3,La empresa Boring de Elon Musk aprobó la const...
n4,Tanto la manzana como la naranja son frutas


In [88]:
import re
from nltk.corpus import stopwords

stopwords_sp = stopwords.words("spanish")

def pre_procesado(texto):
    texto = [words for words in re.sub(r"[\W\d]", " ", texto.lower().strip()).split() if words not in stopwords_sp]
    texto = ' '.join(texto)
    return texto

corpus["pp"] = corpus.texto.apply(lambda x: pre_procesado(x))

corpus

Unnamed: 0,texto,pp
n1,La compañía Boring de Elon Musk construirá una...,compañía boring elon musk construirá conexión ...
n2,La compañía Boring de Elon Musk construirá un ...,compañía boring elon musk construirá enlace al...
n3,La empresa Boring de Elon Musk aprobó la const...,empresa boring elon musk aprobó construcción t...
n4,Tanto la manzana como la naranja son frutas,manzana naranja frutas


In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer

tdidf_vec = TfidfVectorizer()
tfidf = tdidf_vec.fit_transform(corpus.pp.values)

tfidf_matrix = pd.DataFrame(tfidf.toarray())
tfidf_matrix.columns = tdidf_vec.get_feature_names()
tfidf_matrix.index = corpus.index
tfidf_matrix = tfidf_matrix.T
tfidf_matrix

Unnamed: 0,n1,n2,n3,n4
aeropuerto,0.282775,0.282775,0.214535,0.0
alta,0.282775,0.282775,0.214535,0.0
aprobó,0.0,0.0,0.336111,0.0
boring,0.282775,0.282775,0.214535,0.0
centro,0.0,0.0,0.336111,0.0
chicago,0.282775,0.282775,0.214535,0.0
compañía,0.349284,0.349284,0.0,0.0
conexión,0.443022,0.0,0.0,0.0
construcción,0.0,0.0,0.336111,0.0
construirá,0.349284,0.349284,0.0,0.0


## Distancia Euclideana

In [95]:
from sklearn.metrics.pairwise import euclidean_distances

dist_euc = euclidean_distances(tfidf_matrix.T.values)
dist_euc = pd.DataFrame(dist_euc, columns = tfidf_matrix.columns, index = tfidf_matrix.columns)
dist_euc

Unnamed: 0,n1,n2,n3,n4
n1,0.0,0.626528,1.072701,1.414214
n2,0.626528,0.0,1.072701,1.414214
n3,1.072701,1.072701,0.0,1.414214
n4,1.414214,1.414214,1.414214,0.0


## Distancia Coseno

In [97]:
from sklearn.metrics.pairwise import cosine_distances

dist_cos = cosine_distances(tfidf_matrix.T.values)
dist_cos = pd.DataFrame(dist_euc, columns = tfidf_matrix.columns, index = tfidf_matrix.columns)
dist_cos

Unnamed: 0,n1,n2,n3,n4
n1,0.0,0.196269,0.575343,1.0
n2,0.196269,0.0,0.575343,1.0
n3,0.575343,0.575343,0.0,1.0
n4,1.0,1.0,1.0,0.0


## Distancia de Jaccard

In [99]:
def jaccard_distance(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    resultado = 1 - len(s1.intersection(s2)) / len(s1.union(s2))
    return resultado

jaccard_distance(corpus.iloc[0]['pp'].split(), corpus.iloc[2]['pp'].split())

0.5625

## Distancia de Levenshtein

In [104]:
import nltk

nltk.edit_distance(corpus.iloc[0].pp.split(), corpus.iloc[2].pp.split())

7