<img src='https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQ-VfNtOyJbsaxu43Kztf_cv1mgBG6ZIQZEVw&usqp=CAU'>

# Procesamiento de Lenguaje Natural

## Taller #5: TF-IDF
`Fecha de entrega: Marzo 25, 2021. (Antes del inicio de la próxima clase).`

`Modo de entrega: Subir link de GitHub al aula virtual.`

In [1]:
import re
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
stopwords_sp = stopwords.words('spanish')

from nltk.stem.snowball import SnowballStemmer
spanishStemmer=SnowballStemmer("spanish")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

# Punto 1: Pre-Procesamiento

- `[9 pts]` Leer el archivo `bob_esponja.csv` usando `pandas`
- `[9 pts]` Crear una nueva columna con el texto en minúscula, sin caracteres especiales ni números, sin palabras vacias y hacer stemming de las palabras

In [2]:
data = pd.read_csv("../../archivos/bob_esponja.csv")
data

Unnamed: 0,Personaje,Descripción
0,Bob Esponja Pantalones Cuadrados,Bob Esponja es una esponja de mar con forma re...
1,Patricio Estrella,"Es una estrella de mar de color rosa, el mejor..."
2,Calamardo Tentáculos,Calamardo Tentáculos es un personaje principal...
3,Arenita Mejillas,Arenita Mejillas (Sandy Cheeks en inglés) en u...
4,Eugenio H. Cangrejo,Don Eugenio H. Cangrejo (normalmente llamado S...
5,Sheldon J. Plankton,"Sheldon J. Plankton, o simplemente Plankton, e..."
6,Karen Plankton,Karen Plankton es uno de los dos principales a...
7,Perlita Cangrejo,"Perla ""Perlita"" Cangrejo es un personaje princ..."
8,Sra. Puff,La Señora Puff es un personaje principal de Bo...
9,Gary el Caracol,Gary el Caracol es un personaje principal de B...


In [3]:
def pre_procesado(texto):
    texto = texto.lower()
    texto = re.sub(r"[\W\d_]+", " ", texto)
    texto = [palabra for palabra in texto.split() if palabra not in stopwords_sp]
    texto = " ".join([spanishStemmer.stem(palabra) for palabra in texto])
    return texto

In [4]:
data['pre-procesado'] = data['Descripción'].apply(lambda texto: pre_procesado(texto))

In [5]:
data

Unnamed: 0,Personaje,Descripción,pre-procesado
0,Bob Esponja Pantalones Cuadrados,Bob Esponja es una esponja de mar con forma re...,bob esponj esponj mar form rectangul color ama...
1,Patricio Estrella,"Es una estrella de mar de color rosa, el mejor...",estrell mar color ros mejor amig bob esponj ju...
2,Calamardo Tentáculos,Calamardo Tentáculos es un personaje principal...,calamard tentacul personaj principal bob espon...
3,Arenita Mejillas,Arenita Mejillas (Sandy Cheeks en inglés) en u...,arenit mejill sandy cheeks ingles personaj pri...
4,Eugenio H. Cangrejo,Don Eugenio H. Cangrejo (normalmente llamado S...,don eugeni h cangrej normal llam señor cangrej...
5,Sheldon J. Plankton,"Sheldon J. Plankton, o simplemente Plankton, e...",sheldon j plankton simplement plankton dos ant...
6,Karen Plankton,Karen Plankton es uno de los dos principales a...,kar plankton dos principal antagon bob esponj ...
7,Perlita Cangrejo,"Perla ""Perlita"" Cangrejo es un personaje princ...",perl perlit cangrej personaj principal bob esp...
8,Sra. Puff,La Señora Puff es un personaje principal de Bo...,señor puff personaj principal bob esponj maest...
9,Gary el Caracol,Gary el Caracol es un personaje principal de B...,gary caracol personaj principal bob esponj que...


# Punto 2: TF-IDF

- `[16 pts]` Crear la matriz TF-IDF

In [6]:
tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(data['pre-procesado'].values)

tfidf_matrix = pd.DataFrame(data=tfidf.toarray(), columns=tfidf_vect.get_feature_names(), index=data.Personaje.values)

tfidf_matrix = tfidf_matrix.T.round(3)

tfidf_matrix

Unnamed: 0,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
abaj,0.057,0.00,0.000,0.000,0.000,0.0,0.000,0.0,0.000,0.000
aborrec,0.000,0.00,0.091,0.000,0.000,0.0,0.000,0.0,0.000,0.000
acab,0.000,0.00,0.091,0.000,0.000,0.0,0.000,0.0,0.000,0.000
accident,0.049,0.00,0.000,0.000,0.045,0.0,0.000,0.0,0.000,0.000
acept,0.000,0.00,0.000,0.000,0.000,0.0,0.064,0.0,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...
viv,0.000,0.05,0.060,0.108,0.070,0.0,0.000,0.0,0.000,0.000
vol,0.057,0.00,0.000,0.000,0.000,0.0,0.000,0.0,0.000,0.000
voz,0.195,0.00,0.000,0.139,0.000,0.0,0.000,0.0,0.000,0.000
vuelv,0.172,0.00,0.000,0.000,0.000,0.0,0.000,0.0,0.000,0.000


# Punto 3: Distancia del coseno
- `[12 pts]` Calcular la distancia del coseno entre cada una de los personajes
- `[2 pts]` ¿Cuáles son los personajes más parecidos?
- `[2 pts]` ¿Cuáles son los personajes más diferentes?

In [15]:
dist_cos = cosine_distances(tfidf_matrix.T.values)
dist_cos = pd.DataFrame(dist_cos, columns = tfidf_matrix.columns, index = tfidf_matrix.columns)
dist_cos = dist_cos.round(3)
dist_cos

Unnamed: 0,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
Bob Esponja Pantalones Cuadrados,0.0,0.826,0.919,0.838,0.868,0.872,0.939,0.886,0.858,0.924
Patricio Estrella,0.826,0.0,0.791,0.841,0.832,0.966,0.93,0.892,0.953,0.895
Calamardo Tentáculos,0.919,0.791,0.0,0.92,0.831,0.918,0.95,0.967,0.968,0.852
Arenita Mejillas,0.838,0.841,0.92,0.0,0.908,0.968,0.948,0.962,0.972,0.901
Eugenio H. Cangrejo,0.868,0.832,0.831,0.908,0.0,0.687,0.892,0.797,0.896,0.921
Sheldon J. Plankton,0.872,0.966,0.918,0.968,0.687,0.0,0.795,0.923,0.987,0.969
Karen Plankton,0.939,0.93,0.95,0.948,0.892,0.795,0.0,0.966,0.985,0.965
Perlita Cangrejo,0.886,0.892,0.967,0.962,0.797,0.923,0.966,0.0,0.874,0.97
Sra. Puff,0.858,0.953,0.968,0.972,0.896,0.987,0.985,0.874,0.0,0.964
Gary el Caracol,0.924,0.895,0.852,0.901,0.921,0.969,0.965,0.97,0.964,0.0


# ¿Cuáles son los personajes más parecidos?

In [8]:
np.fill_diagonal(dist_cos.values, 999999)
resultado = round(dist_cos.min().min(),6)
resultado

0.68738

In [9]:
dist_cos.where(dist_cos==resultado).dropna(how='all').dropna(axis=1)

Eugenio H. Cangrejo
Sheldon J. Plankton


<img src='https://i.ytimg.com/vi/ixVbFbb_R1Q/maxresdefault.jpg' style="width:300px">

# ¿Cuáles son los personajes más diferentes?

In [10]:
np.fill_diagonal(dist_cos.values, -999999)
round(dist_cos.max().max(),6)

0.986733

In [11]:
dist_cos.where(dist_cos==round(dist_cos.max().max(),6)).dropna(how='all').dropna(axis=1)

Sheldon J. Plankton
Sra. Puff


<img src='https://cdn.staticneo.com/w/spongebob/thumb/Spongebob_Squarepants_Mrs_Puff.png/300px-Spongebob_Squarepants_Mrs_Puff.png' style="width:300px">

# Otras opciones...

In [12]:
dist_cos = cosine_distances(tfidf_matrix.T.values)
dist_cos = pd.DataFrame(dist_cos, columns = tfidf_matrix.columns, index = tfidf_matrix.columns)
dist_cos = dist_cos.round(6)
dist_cos

Unnamed: 0,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
Bob Esponja Pantalones Cuadrados,0.0,0.826052,0.918644,0.838317,0.868204,0.872076,0.938609,0.88581,0.858077,0.924158
Patricio Estrella,0.826052,0.0,0.79053,0.841122,0.831634,0.966449,0.930045,0.891502,0.952583,0.89479
Calamardo Tentáculos,0.918644,0.79053,0.0,0.919857,0.831344,0.917606,0.949913,0.967265,0.968111,0.851644
Arenita Mejillas,0.838317,0.841122,0.919857,0.0,0.908202,0.967774,0.948323,0.962015,0.97212,0.900641
Eugenio H. Cangrejo,0.868204,0.831634,0.831344,0.908202,0.0,0.68738,0.891897,0.796723,0.89608,0.920664
Sheldon J. Plankton,0.872076,0.966449,0.917606,0.967774,0.68738,0.0,0.794981,0.922577,0.986733,0.969239
Karen Plankton,0.938609,0.930045,0.949913,0.948323,0.891897,0.794981,0.0,0.966088,0.984919,0.965252
Perlita Cangrejo,0.88581,0.891502,0.967265,0.962015,0.796723,0.922577,0.966088,0.0,0.873832,0.969504
Sra. Puff,0.858077,0.952583,0.968111,0.97212,0.89608,0.986733,0.984919,0.873832,0.0,0.964177
Gary el Caracol,0.924158,0.89479,0.851644,0.900641,0.920664,0.969239,0.965252,0.969504,0.964177,0.0


In [13]:
def color_red(val):
    color = 'red' if val ==True else 'black'
    return f'color: {color}'
    
t = dist_cos>0.95
t.style.applymap(color_red)

Unnamed: 0,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
Bob Esponja Pantalones Cuadrados,False,False,False,False,False,False,False,False,False,False
Patricio Estrella,False,False,False,False,False,True,False,False,True,False
Calamardo Tentáculos,False,False,False,False,False,False,False,True,True,False
Arenita Mejillas,False,False,False,False,False,True,False,True,True,False
Eugenio H. Cangrejo,False,False,False,False,False,False,False,False,False,False
Sheldon J. Plankton,False,True,False,True,False,False,False,False,True,True
Karen Plankton,False,False,False,False,False,False,False,True,True,True
Perlita Cangrejo,False,False,True,True,False,False,True,False,False,True
Sra. Puff,False,True,True,True,False,True,True,False,False,True
Gary el Caracol,False,False,False,False,False,True,True,True,True,False


In [14]:
def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]

def highlight_min(s):
    min_val = min(sorted(s)[1:])
    is_max = s == min_val
    return ['background-color: green' if v else '' for v in is_max]

dist_cos.style.apply(highlight_max).apply(highlight_min)

Unnamed: 0,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
Bob Esponja Pantalones Cuadrados,0.0,0.826052,0.918644,0.838317,0.868204,0.872076,0.938609,0.88581,0.858077,0.924158
Patricio Estrella,0.826052,0.0,0.79053,0.841122,0.831634,0.966449,0.930045,0.891502,0.952583,0.89479
Calamardo Tentáculos,0.918644,0.79053,0.0,0.919857,0.831344,0.917606,0.949913,0.967265,0.968111,0.851644
Arenita Mejillas,0.838317,0.841122,0.919857,0.0,0.908202,0.967774,0.948323,0.962015,0.97212,0.900641
Eugenio H. Cangrejo,0.868204,0.831634,0.831344,0.908202,0.0,0.68738,0.891897,0.796723,0.89608,0.920664
Sheldon J. Plankton,0.872076,0.966449,0.917606,0.967774,0.68738,0.0,0.794981,0.922577,0.986733,0.969239
Karen Plankton,0.938609,0.930045,0.949913,0.948323,0.891897,0.794981,0.0,0.966088,0.984919,0.965252
Perlita Cangrejo,0.88581,0.891502,0.967265,0.962015,0.796723,0.922577,0.966088,0.0,0.873832,0.969504
Sra. Puff,0.858077,0.952583,0.968111,0.97212,0.89608,0.986733,0.984919,0.873832,0.0,0.964177
Gary el Caracol,0.924158,0.89479,0.851644,0.900641,0.920664,0.969239,0.965252,0.969504,0.964177,0.0
