## Imports

In [None]:
import re
import string

import nltk
import numpy as np
import pandas as pd

from collections import Counter 

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

nltk.download("stopwords")
nltk.download('punkt')

In [None]:
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

# Load data 
dataset = tfds.load("imdb_reviews", as_supervised = True)
unsup_data = dataset["unsupervised"]

# Unsup
unsup_sens = [str(s[0].numpy()) for s in unsup_data] 

In [3]:
df = pd.DataFrame(unsup_sens, columns = ["text"])
df.head()

Unnamed: 0,text
0,"b""SPOILER - Now knowing the ending I find it s..."
1,b'I knew about this film long before I saw it....
2,"b""This movie is really really awful. It's as b..."
3,b'Wait a minute... yes I do.<br /><br />The di...
4,b'This is the type of movie that\'s just barel...


## Clean data

### Define function to clean and tokenize

In [4]:
def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens
    
    Args:
        text: Text to tokenize.

    Returns:
        Tokenized text.
    """
    text = str(text).lower()                            # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)               # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)                    # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)                  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)         # Replace dash between words
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation

    tokens = tokenizer(text)                                            # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]                  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]                 # Remove digits
    tokens = [t for t in tokens if len(t) > 1]                          # Remove short tokens
    return tokens

### Apply function and remove duplicates

In [5]:
stop_words = set(stopwords.words("english"))

df["tokens"] = df["text"].map(lambda x: clean_text(x, word_tokenize, stop_words))

# Remove duplicated after preprocessing
_, idx = np.unique(df["tokens"], return_index=True)
df = df.iloc[idx, :]

### Check vocabulary

In [6]:
docs = df["text"].values
tokenized_docs = df["tokens"].values
vocab = Counter()
for token in tokenized_docs:
    vocab.update(token)

In [7]:
vocab.most_common(10)

[('br', 114467),
 ('movie', 83804),
 ('film', 76140),
 ('one', 52238),
 ('like', 39851),
 ('good', 28801),
 ('even', 24223),
 ('time', 24029),
 ('would', 24025),
 ('really', 22801)]

## Generate vectors from document

### Define function for creating a single vectors from word embeddings

In [8]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features

### Apply function to previously pre-processed text

In [9]:
model = Word2Vec(sentences=tokenized_docs, workers=1, seed=42)

In [10]:
model.wv.most_similar("film")

[('movie', 0.8345317840576172),
 ('filmbr', 0.7882905006408691),
 ('moviebr', 0.6851252317428589),
 ('films', 0.662550151348114),
 ('picture', 0.6391305327415466),
 ('flick', 0.6316883563995361),
 ('movies', 0.5813993215560913),
 ('cinema', 0.5772110223770142),
 ('documentary', 0.5747405886650085),
 ('filmsbr', 0.5089253783226013)]

In [11]:
vectorized_docs = vectorize(tokenized_docs, model=model)
len(vectorized_docs), len(vectorized_docs[0])

(49506, 100)

### Generate and analyze clusters

In [12]:
def mbkmeans_clusters(X, k, mb=500, print_silhouette_values=False):
    """Generate clusters.

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches. Defaults to 500.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

In [13]:
clustering, cluster_labels = mbkmeans_clusters(X=vectorized_docs, k=2, print_silhouette_values=True)
df_clusters = pd.DataFrame({
    "text": docs,
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": cluster_labels
})

For n_clusters = 2
Silhouette coefficient: 0.15
Inertia:154019.53867232235
Silhouette values:
    Cluster 0: Size:27460 | Avg:0.20 | Min:0.02 | Max: 0.38
    Cluster 1: Size:22046 | Avg:0.08 | Min:-0.08 | Max: 0.27


In [14]:
print("Top terms per cluster (based on centroids):")
for i in range(2):
    tokens_per_cluster = ""
    most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=5)
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")

Top terms per cluster (based on centroids):
Cluster 0: worser irked washout wantbr itit 
Cluster 1: movie really suppose anyway guess 


In [15]:
test_cluster = 1
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
for d in most_representative_docs[:10]:
    print(docs[d])
    print("-------------")

b'This movie was a little bit better than i thought it would be. Which isn\'t saying much as the Scifi Channel has a history of showing some real sulfurous stinkers.<br /><br />But volcanoes are one of my most favorite movie genres, and wild dead horses couldn\'t drag me away from watching one i haven\'t seen yet. And i do believe i\'ve seen just about all of them. Or at least those that i have been currently aware of.<br /><br />As this was a Scifi Channel movie showing, i wasn\'t really expecting an outstanding movie, and in that, i wasn\'t disappointed. The screenplay seemed awfully, awfully similar in some places to a certain volcano movie that took place on the US west coast, on Wilshire Blvd. Apparently, some idea "borrowing" went on...<br /><br />I was pleased to see Mike Ironsides still in the movie biz. Mike has been in good movies and in bad movies, and sometimes in the bad movies, he was the only reason to bother watching it. His character here is pretty amusing in its irres