In [None]:
# Mount Google Drive at /content/gdrive; the Kaggle API token is copied from it in the next cell
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Install kaggle package quietly
!pip install -q kaggle==1.5.12

# Create a kaggle folder
!mkdir ~/.kaggle
# Copy json file into the kaggle folder
!cp /content/gdrive/MyDrive/recording/imdb-reviews/kaggle.json ~/.kaggle/
# Give full read & write permission only to the owner
!chmod 600 ~/.kaggle/kaggle.json
# Download the IMDb Reviews dataset
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
# Unzip the downloaded dataset
!unzip imdb-dataset-of-50k-movie-reviews.zip
# Delete the zip file
!rm -rf imdb-dataset-of-50k-movie-reviews.zip

# Unmount the gdrive
drive.flush_and_unmount()

In [None]:
# Import the modules
import pandas as pd
# Disable column-width truncation so full cell contents are shown when frames are displayed
pd.set_option("display.max_colwidth", None)

In [None]:
# Read the unzipped IMDb reviews CSV into a frame
df = pd.read_csv("/content/IMDB Dataset.csv")
# Preview five randomly sampled rows (fixed seed keeps the preview stable)
df.sample(5, random_state=42)

In [None]:
# Report the frame's dimensions and its deep memory footprint in megabytes
print(
    "Shape: {}, Memory: {} MB.".format(
        df.shape,
        round(df.memory_usage(deep=True).sum() * 1e-6, 3),
    )
)

In [None]:
def calc_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Returns the missing counts and percentages of the given frame.

    Args:
        df (pd.DataFrame): The Dataframe.

    Returns:
        pd.DataFrame: One row per column of ``df`` holding ``num_missing``
            and ``percent_missing`` (rounded to 2 digits), sorted from the
            highest to the lowest missing percentage.
    """
    # Calculate the number of missing values per column
    missing = df.isnull().sum()
    # Calculate the missing percentages; a frame without rows has nothing
    # missing, so report 0.0 instead of the NaN that 0 / 0 would produce
    n_rows = len(df)
    if n_rows:
        percent_missing = missing * 100 / n_rows
    else:
        percent_missing = missing.astype(float)
    # Create a frame with dict format and round the percentage values
    missing_value_df = pd.DataFrame(
        {"num_missing": missing, "percent_missing": percent_missing}
    ).round(2)
    # Sort from highest to lowest (sort_values is ascending by default)
    return missing_value_df.sort_values("percent_missing", ascending=False)


# Summarise the missing values of the loaded data, then display the summary
df_missing = calc_missing(df=df)
df_missing

In [None]:
# Import the modules
from functools import partial
from re import sub

from bs4 import BeautifulSoup
from nltk import download
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Download missing nltk packages
download('punkt')
# Newer NLTK releases load the tokenizer tables used by word_tokenize from
# 'punkt_tab' rather than 'punkt'; fetching both keeps the notebook working
# regardless of the installed NLTK version
download('punkt_tab')
download('wordnet')
download('omw-1.4')
download('stopwords')

In [None]:
# Shared lemmatizer instance and English stop-word list; both are bound into
# process_text via functools.partial when the reviews are processed
wnl = WordNetLemmatizer()
swords = stopwords.words("english")


def process_text(wnl: WordNetLemmatizer, swords: list, text: str) -> str:
    """
    Process the given text by applying;
        - HTML parsing,
        - RegEx ops for brackets and non-alphanumeric chars in the words,
        - Tokenization,
        - Lowercase,
        - Removing stop words,
        - Lemmatization,
        - Removing stop words produced by the lemmatization,
        - Removing more than one space.

    Args:
        wnl (WordNetLemmatizer): Lemmatizer.
        swords (list): Stop words list.
        text (str): A single raw text (the function is applied element-wise
            to the review column, so it receives one string per call).

    Returns:
        str: Processed text.
    """
    # Set membership is O(1) per token; the caller's list is left untouched
    stop_words = set(swords)
    # Parse the HTML and keep only the visible text
    text = BeautifulSoup(markup=text, features="html.parser").get_text()
    # Remove the brackets
    text = sub(pattern=r"[\[\]\(\)\{\}]", repl=" ", string=text)
    # Replace every char that is not a letter, a digit or whitespace.
    # The upper-case range must be A-Z: A-z also spans \ ^ _ and ` and would
    # let those characters through.
    text = sub(pattern=r"[^a-zA-Z0-9\s]", repl=" ", string=text)
    # Tokenize the text and lowercase the tokens
    tokens = [t.lower() for t in word_tokenize(text=text)]
    # Remove the stop words BEFORE lemmatizing: the lemmatizer can mangle
    # them (e.g. "was" -> "wa") into forms the stop-word list does not contain
    tokens = [t for t in tokens if t not in stop_words]
    # Lemmatize the remaining tokens
    lemmas = [wnl.lemmatize(t) for t in tokens]
    # Remove any stop words that only appear after lemmatization
    lemmas = [t for t in lemmas if t not in stop_words]
    # Join back the tokens
    text = " ".join(lemmas)
    # Remove more than one space due to processing ops
    text = sub(pattern=r"\s+", repl=" ", string=text)
    # Return the processed text
    return text

# `df.insert` raises when the column already exists, so drop a previous result
# first; this keeps the cell re-runnable without restarting the kernel
if "processed_review" in df.columns:
    df = df.drop(columns="processed_review")
# Place the processed text right after the raw `review` column
df.insert(
    loc=1,
    column="processed_review",
    value=df.review.apply(func=partial(process_text, wnl, swords)),
)
# Show only the first rows instead of dumping the whole frame
df.head()

In [None]:
# Deep memory footprint (in megabytes) of the frame without the raw `review` column
print(
    "Memory: {} MB.".format(
        round(df.drop(columns=["review"]).memory_usage(deep=True).sum() * 1e-6, 3)
    )
)

# Texthero Approach

In [None]:
!pip uninstall -yq nltk
!pip uninstall -yq spacy

!pip install -Uq texthero

In [None]:
# Download spaCy's small English model before texthero is imported
# (workaround for the texthero issue linked on the command line below)
!python -m spacy download en_core_web_sm # To solve https://github.com/jbesomi/texthero/issues/122

# Import the modules
import texthero as hero
import pandas as pd

In [None]:
# Start the Texthero pipeline from a fresh read of the raw CSV
df_hero = pd.read_csv("/content/IMDB Dataset.csv")

In [None]:
# Clean the raw reviews with Texthero: strip HTML tags, URLs and brackets,
# then run Texthero's `clean` pipeline
df_hero.loc[:, "processed_review"] = (
    df_hero.loc[:, "review"]
    .pipe(hero.remove_html_tags)
    .pipe(hero.remove_urls)
    .pipe(hero.remove_brackets)
    .pipe(hero.clean)
)
# Preview five randomly sampled rows (fixed seed keeps the preview stable)
df_hero.sample(5, random_state=42)

In [None]:
# First ten entries of Texthero's top-words listing for each sentiment label
df_hero.groupby("sentiment")["processed_review"].apply(
    lambda reviews: hero.top_words(reviews).head(10)
)

In [None]:
# Rows labelled 0 through 5 of the NLTK-processed frame (label slices include the end label)
# NOTE(review): this cell sits in the Texthero section but shows `df`, not `df_hero` — confirm intent
df.loc[:5]

In [None]:
# TF-IDF representation of the processed reviews, capped at 1000 features
df_hero.loc[:, "tfidf"] = df_hero.loc[:, "processed_review"]\
    .pipe(hero.tfidf, max_features=1000)
# PCA projection of the TF-IDF vectors; the fixed seed keeps the projection
# (and every plot built on it) reproducible across runs
df_hero.loc[:, "pca"] = df_hero.loc[:, "tfidf"].pipe(hero.pca, random_state=42)

In [None]:
# Inspect the TF-IDF value stored for the first review
df_hero.at[0, "tfidf"][:]

In [None]:
# Scatter plot of the PCA projection, coloured by sentiment label
hero.scatterplot(
    df_hero,
    "pca",
    color="sentiment",
    title="PCA IMDb Reviews",
)

In [None]:
# K-means with 3 clusters; k-means starts from random centroids, so the seed
# is fixed to make the cluster labels reproducible across runs
df_hero.loc[:, "topics"] = df_hero.loc[:, "tfidf"].pipe(
    hero.kmeans, n_clusters=3, random_state=42
)
# Scatter plot of the PCA projection, coloured by cluster label
hero.scatterplot(df=df_hero,
                 col="pca",
                 color="topics",
                 title="PCA IMDb Reviews by 3 topics")

In [None]:
# Word cloud of the processed positive reviews, limited to 20 words
hero.visualization.wordcloud(
    s=df_hero.loc[df_hero["sentiment"] == "positive", "processed_review"],
    max_words=20,
)