# Nubes de palabras

Este ejercicio es una adaptación del tutorial disponible en Kaggle [aquí](https://www.kaggle.com/code/lakshmi25npathi/sentiment-analysis-of-imdb-movie-reviews).

### Cargamos los módulos a utilizar

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer

### Accediendo a los datos en Google Drive

In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# List the contents of the "Colab Notebooks" folder under /content/drive/MyDrive
!ls "/content/drive/MyDrive/Colab Notebooks"

In [None]:
# Switch the working directory to the "Colab Notebooks" folder
%cd "/content/drive/MyDrive/Colab Notebooks"

In [None]:
# List the contents of the current working directory
!ls

In [None]:
!wget --no-check-certificate https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv -O imdb.csv

In [None]:
# Path of the CSV file to read
filepath = "/content/drive/MyDrive/Colab Notebooks/imdb.csv"

# Read the file into the DataFrame `data`
data = pd.read_csv(filepath)

In [None]:
# Preview the first rows of the DataFrame
data.head()

In [None]:
# Summary statistics for the DataFrame's columns
data.describe()

In [None]:
# Number of rows for each distinct value of the "sentiment" column
data["sentiment"].value_counts()

### Creamos las nubes de palabras

In [None]:
from wordcloud import WordCloud,STOPWORDS

# One panel per sentiment class: (value in data.sentiment, panel title).
# Driving both panels from this table replaces two copy-pasted sections, one
# of which stored the negative cloud in a variable named for the positives.
panels = [
    ("positive", "Comentarios positivos"),
    ("negative", "Comentarios negativos"),
]

plt.figure(figsize=(40,25))

for position, (sentiment, title) in enumerate(panels, start=1):
    # All reviews of this class, concatenated into a single document
    reviews = data.loc[data.sentiment == sentiment, "review"]
    cloud = WordCloud(
        stopwords=STOPWORDS,
        background_color='black',
        collocations=False,
        width=2500,
        height=1800,
    ).generate(" ".join(reviews))

    plt.subplot(1, len(panels), position)
    plt.axis('off')
    plt.title(title, fontsize=40)
    plt.imshow(cloud)

# Render the finished figure explicitly
plt.show()

### Preprocesamiento

#### Palabras sin valor (stopwords)

In [None]:
# Download the NLTK "stopwords" corpus
nltk.download('stopwords')

# Tokenizer used to split text into words (tokenization)
tokenizer=ToktokTokenizer()

# English stop words
stopword_list=nltk.corpus.stopwords.words('english')

### Quitamos etiquetas HTML y texto innecesario

In [None]:
# Strip HTML tags
def strip_html(text):
    """Return the visible text of *text* with all HTML markup removed.

    Text fragments are joined with a single space so that words separated
    only by a tag (for example ``great<br /><br />movie``) are not glued
    together into one token.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ")

# Remove square-bracketed spans
def remove_between_square_brackets(text):
    """Return *text* with every ``[...]`` span (brackets included) deleted.

    The pattern is a raw string so that ``\\[`` and ``\\]`` reach the regex
    engine as written instead of being invalid string escape sequences.
    """
    return re.sub(r'\[[^]]*\]', '', text)

# Remove special characters
def remove_special_characters(text, remove_digits=True):
    """Return *text* keeping only ASCII letters, digits and whitespace.

    Args:
        text: String to clean.
        remove_digits: Accepted for backward compatibility; it has no effect
            and digits are always kept.

    Returns:
        The input with every other character deleted.
    """
    # The upper-case range must be ``A-Z``: ``A-z`` also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would be kept.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Replace runs of whitespace with a single space
def remove_extra_spaces(text):
    """Return *text* with each run of whitespace collapsed to one space.

    Leading and trailing whitespace is collapsed too, not stripped.
    """
    # No flags: case-insensitivity is meaningless for ``\s``.
    return re.sub(r'\s+', ' ', text)

# Convert to lowercase
def convert_lowercase(text):
    """Return a lowercased copy of *text*."""
    lowered = text.lower()
    return lowered

# Remove noise from the text
def denoise_text(text):
    """Run the cleaning pipeline on one review and return the result.

    Steps, in order: strip HTML, drop ``[...]`` spans, drop special
    characters, collapse whitespace, lowercase.
    """
    cleaning_steps = (
        strip_html,
        remove_between_square_brackets,
        remove_special_characters,
        remove_extra_spaces,
        convert_lowercase,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

# Replace each review with its cleaned version
data['review'] = data['review'].apply(denoise_text)

### Convertir palabras a su raíz (Text Stemming)

In [None]:
# Reduce every word to its stem
def simple_stemmer(text):
    """Return *text* with each whitespace-separated word Porter-stemmed.

    The stemmed words are re-joined with single spaces.
    """
    # Use the PorterStemmer class this file imports from nltk.stem.porter
    # rather than reaching it through the ``nltk.porter`` attribute.
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())

# Replace each review with its stemmed version
data['review'] = data['review'].apply(simple_stemmer)

### Quitando las palabras que no aportan valor (stopwords)



In [None]:
# Build the set of English stop words and print it for inspection
stop=set(stopwords.words('english'))
print(stop)

# Remove the stop words
def remove_stopwords(text, is_lower_case=False):
    """Return *text* without the tokens found in ``stopword_list``.

    Args:
        text: String to filter; it is split with the module-level
            ``tokenizer``.
        is_lower_case: When true, tokens are compared as-is; otherwise each
            token is lowercased before the comparison.

    Returns:
        The surviving tokens (stripped, original casing) joined by single
        spaces.
    """
    # Hash the stop words once per call so each membership test is O(1)
    # instead of a scan over the whole list.
    stop_words = set(stopword_list)
    tokens = (token.strip() for token in tokenizer.tokenize(text))
    if is_lower_case:
        kept = [token for token in tokens if token not in stop_words]
    else:
        kept = [token for token in tokens if token.lower() not in stop_words]
    return ' '.join(kept)

# Replace each review with its stop-word-free version
data['review'] = data['review'].apply(remove_stopwords)

### Creamos las nubes de palabras de nuevo

In [None]:
from wordcloud import WordCloud,STOPWORDS

# One panel per sentiment class: (value in data.sentiment, panel title).
# Driving both panels from this table replaces two copy-pasted sections, one
# of which stored the negative cloud in a variable named for the positives.
panels = [
    ("positive", "Comentarios positivos"),
    ("negative", "Comentarios negativos"),
]

plt.figure(figsize=(40,25))

for position, (sentiment, title) in enumerate(panels, start=1):
    # All reviews of this class, concatenated into a single document
    reviews = data.loc[data.sentiment == sentiment, "review"]
    cloud = WordCloud(
        stopwords=STOPWORDS,
        background_color='black',
        collocations=False,
        width=2500,
        height=1800,
    ).generate(" ".join(reviews))

    plt.subplot(1, len(panels), position)
    plt.axis('off')
    plt.title(title, fontsize=40)
    plt.imshow(cloud)

# Render the finished figure explicitly
plt.show()