In [1]:
import gc
from tqdm import tqdm
import numpy as np
import re
import os
import glob
import csv
import joblib
import pandas as pd

In [2]:
def extract_reviews(filename,review_col,max_reviews_per_file):
    reviews = []
    try:
        with open(filename, 'r', encoding="utf8") as file:
            total_reviews = sum(1 for _ in file)
            print(f"{filename} has {total_reviews - 1} reviews")
            file.seek(0)
            my_reader = csv.reader(file, delimiter=',')
            next(my_reader, None)
            for i, row in enumerate(my_reader):
                if i >= max_reviews_per_file:
                    break
                reviews.append(row[review_col])
    except Exception as e:
        print(f"Error reading {filename}: {e}")
    return reviews

In [3]:
def lower_case_reviews(reviews):
    for i in tqdm(range(len(reviews)), desc="Lowercasing"):
        reviews[i] = reviews[i].lower()

In [4]:
def replace_punctuation(reviews):
    punctuation_chars_to_remove = {'.',',','(',')','[',']','`',"'",'"', ';', ':'}
    punctuation_chars_to_space = {'_', '-', '–', '\''}
    for i in tqdm(range(len(reviews)), desc="Cleaning reviews"):
        for char in punctuation_chars_to_remove:
            reviews[i] = reviews[i].replace(char, '')  
        for char in punctuation_chars_to_space:
            reviews[i] = reviews[i].replace(char, ' ') 

In [5]:
def split_extend(reviews,delimeter='.'):
    sentences = []
    for i in tqdm(range(len(reviews)), desc=f"Splitting reviews by '{delimeter}'"):
        sentences.extend(reviews[i].split('.'))
    reviews = sentences

In [6]:
def split_sentences_into_words(sentences,delimeter=' '):
    for i in tqdm(range(len(sentences)), desc="Splitting sentences into words"):
        sentences[i] = sentences[i].split(' ')

In [7]:
def replace_words_with_special_symbols(sentences):
    special_symbol_mapping = {'&': 'AMP', '#': 'HT', '@': 'AT'}
    for sentence in tqdm(sentences, desc="Replacing special symbol words", total=len(sentences)):
        for i in range(len(sentence)):
            for symbol, token in special_symbol_mapping.items():
                if symbol in sentence[i]:
                    sentence[i] = f"{token}_TOKEN"
                    break

In [8]:
def replace_digit_words(sentences):
    digit_symbols = {'%': 'PCT', '$': 'USD', '€': 'EUR'}
    for sentence in tqdm(sentences, desc="Replacing digit words", total=len(sentences)):
        for i in range(len(sentence)):
            word = sentence[i]
            if not re.search(r'\d', word):
                continue
                
            for symbol, token in digit_symbols.items():
                if symbol in word:
                    sentence[i] = f"{len(word)}_{token}_DIGIT_TOKEN"
                    break
                elif re.match(r'^\d+$', word):  
                    sentence[i] = f"{len(word)}_DIGIT_TOKEN"
                elif re.search(r'\d.*[a-zA-Z]|[a-zA-Z].*\d', word): 
                    sentence[i] = None  

    for sentence in tqdm(sentences, desc="Removing None values", total=len(sentences)):
        while None in sentence:
            sentence.remove(None)

In [9]:
def remove_non_alpha_numeric(sentences):
    for sentence in tqdm(sentences, desc="Removing non-alphanumeric words", total=len(sentences)):
        sentence[:] = [word for word in sentence if re.match(r'^[a-zA-Z0-9_]+$', word)]

In [10]:
def filter_empty_words_big_words_and_small_sentences(sentences,max_word_size=15,min_sentence_size=5):
    for i in tqdm(range(len(sentences)), desc="Filtering small sentences", total=len(sentences)):
        sentences[i] = [word for word in sentences[i] if 0 < len(word) < max_word_size]
        if len(sentences[i])<min_sentence_size:
            sentences[i] = None
    sentences = [s for s in sentences if s is not None]

In [11]:
def save_sentences_to_text(sentences, filename, output_dir='sentence_texts/', max_words_per_file=10_000_000):
    os.makedirs(output_dir, exist_ok=True)
    base_filename = filename.replace('.csv', '')
    skipped = 0
    word_count = 0
    file_index = 0
    current_file = open(os.path.join(output_dir, f"{base_filename}_sentences_{file_index}.txt"), 'w', encoding='utf-8')
    for sentence in tqdm(sentences, desc="Writing sentences to text files"):
        if sentence is None:
            skipped += 1
            continue

        if word_count + len(sentence) > max_words_per_file:
            current_file.close()
            file_index += 1
            current_file = open(os.path.join(output_dir, f"{base_filename}_sentences_{file_index}.txt"), 'w', encoding='utf-8')
            word_count = 0
        line = ' '.join(sentence)
        current_file.write(line + '\n')
        word_count += len(sentence)
    current_file.close()
    if skipped > 0:
        print(f"Skipped {skipped} sentences due to errors or being empty.")

In [12]:
def csv_pipeline(filename,col,max_reviews_per_file=10e7,split_to_smaller = True):
    reviews = extract_reviews(filename,col,max_reviews_per_file)
    lower_case_reviews(reviews)
    replace_punctuation(reviews)
    if split_to_smaller:
        split_extend(reviews)
    split_sentences_into_words(reviews)
    replace_words_with_special_symbols(reviews)
    replace_digit_words(reviews)
    remove_non_alpha_numeric(reviews)
    filter_empty_words_big_words_and_small_sentences(reviews)
    save_sentences_to_text(reviews,filename)

In [13]:
csv_files_reviews = {
    'wikitext_sentences.csv' : 0,
    'book_reviews.csv': 9,
    'food_reviews.csv': 5,
    'hotel_reviews.csv': 2,
    'movies_reviews.csv': 0,
    'steam_game_reviews.csv': 0,
    'amazon_reviews.csv': 2
}

In [14]:
# for filename,review_col in csv_files_reviews.items():
#     print(f"Proccesing {filename}")
#     csv_pipeline(filename,review_col)

<h2>Generating the Word Embeddings</h2>

In [16]:
import glob
import joblib
import random
from tqdm import tqdm
from scipy.sparse import lil_matrix,coo_matrix, save_npz,csr_matrix
from collections import defaultdict
import numpy as np
import os
from sklearn.preprocessing import normalize
import pickle

def load_text_batches(sentences_dir='sentence_texts/', num_files=None):
    text_files = sorted(glob.glob(f'{sentences_dir}*.txt'))
    if num_files is None:
        selected_files = text_files
    else:
        num_files = min(num_files, len(text_files))
        selected_files = random.sample(text_files, num_files)
    for file in tqdm(selected_files, desc="Processing text batches"):
        sentences = []
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                words = line.strip().split(' ') 
                if words:
                    sentences.append(words)
        yield sentences

In [17]:
def count_word_occurrences(sentences_dir='sentence_texts/', num_files=None):
    word_count = defaultdict(int)
    for sentences in load_text_batches(sentences_dir, num_files):
        for sentence in sentences:
            for word in sentence:
                word_count[word] += 1
    return word_count

In [18]:
def get_vocabulary(vocabulary_size=75_000):
    word_count_dict = count_word_occurrences()
    print("Sorting counter")
    sorted_dict = sorted(word_count_dict.items(), key=lambda x: x[1], reverse=True)
    top_words = dict(sorted_dict[:vocabulary_size])
    return top_words

In [19]:
#word_counts = get_vocabulary()

In [20]:
def save_word_counts(word_counts, filepath='word_counts.pkl'):
    with open(filepath, 'wb') as f:
        pickle.dump(word_counts, f)
def load_word_counts(filepath='word_counts.pkl'):
    with open(filepath, 'rb') as f:
        return pickle.load(f)

In [21]:
#save_word_counts(word_counts, 'word_counts.pkl')
word_counts = load_word_counts('word_counts.pkl')
print(len(word_counts)) 

75000


In [22]:
def process_batch(sentences, word_to_idx, vocab_size, window_size):
    row_indices = []
    col_indices = []
    values = []
    
    for sentence in sentences:
        for i, target_word in enumerate(sentence):
            if target_word not in word_to_idx:
                continue
            target_idx = word_to_idx[target_word]
            start_idx = max(i - window_size, 0)
            end_idx = min(i + window_size + 1, len(sentence))
    
            for j in range(start_idx, end_idx):
                if j == i:
                    continue
                context_word = sentence[j]
                if context_word not in word_to_idx:
                    continue
                context_idx = word_to_idx[context_word]
                row_indices.append(target_idx)
                col_indices.append(context_idx)
                values.append(1)
    
    return coo_matrix((values, (row_indices, col_indices)), shape=(vocab_size, vocab_size))

def build_cooccurrence_matrix(vocabulary, sentences_dir='sentence_texts/', window_size=5, save_interval=10):
    word_to_idx = {word: idx for idx, word in enumerate(vocabulary)}
    vocab_size = len(vocabulary)

    total_matrix = coo_matrix((vocab_size, vocab_size))  
    file_count = 0

    for batch_sentences in load_text_batches(sentences_dir=sentences_dir):
        file_count += 1
        matrix = process_batch(batch_sentences, word_to_idx, vocab_size, window_size)
        total_matrix = total_matrix + matrix

        if file_count % save_interval == 0:
            print(f"Saving co-occurrence matrix after processing {file_count} files...")
            save_npz(f'cooccurrence_matrix.npz', total_matrix.tocsr())

    final_matrix = total_matrix.tocsr()
    save_npz('cooccurrence_matrix_final.npz', final_matrix)
    return final_matrix, word_to_idx

Creating the cooccurrence matrix

In [25]:
#cooccurrence_matrix, word_to_idx = build_cooccurrence_matrix(top_words, window_size=5, save_interval=10)

In [26]:
import pickle
from scipy.sparse import load_npz,save_npz, csr_matrix, issparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np
from tqdm import tqdm
import random

In [27]:
def save_embeddings_and_dict(embeddings_matrix, word_to_index, embeddings_file, dict_file):
    save_npz(embeddings_file, embeddings_matrix)
    with open(dict_file, 'wb') as f:
        pickle.dump(word_to_index, f)

#save_embeddings_and_dict(cooccurrence_matrix, word_to_idx, "cooccurrence_matrix_final.npz", "word_to_index.pkl")

In [28]:
def load_embeddings_and_dicts(embeddings_file, word_to_idx_file, word_count_file):
    embeddings_matrix = load_npz(embeddings_file)
    with open(word_to_idx_file, 'rb') as f:
        word_to_index = pickle.load(f)
    with open(word_count_file, 'rb') as f:
        word_count = pickle.load(f)
    return embeddings_matrix, word_to_index, word_count

In [29]:
cooccurrence_matrix, word_to_idx, word_count = load_embeddings_and_dicts("cooccurrence_matrix_final.npz", "word_to_index.pkl","word_counts.pkl")

#Normalize the rows and columns of the matrix before reducing the dimensions to get better results

#cooccurrence_matrix_normalized = normalize(cooccurrence_matrix, norm='l2', axis=1, copy=True)
#cooccurrence_matrix_normalized = normalize(cooccurrence_matrix_normalized, norm='l2', axis=0,copy=False)

In [30]:
def return_most_similar_words(embeddings, word_to_index, target_word, k=10):
    target_index = word_to_index[target_word]
    if issparse(embeddings):
        target_embedding = embeddings[target_index].toarray().reshape(1, -1)
    else:
        target_embedding = embeddings[target_index].reshape(1, -1)
    similarities = cosine_similarity(target_embedding, embeddings).flatten()
    similar_indices = similarities.argsort()[::-1][1:k+1]
    index_to_word = {idx: word for word, idx in word_to_index.items()}
    top_k_words = [index_to_word[i] for i in similar_indices]
    return top_k_words

Reducing dimension from 75_000 to 200 in a computationally efficient way using RandomProjection and SVD

In [32]:
from sklearn.random_projection import SparseRandomProjection
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
def reduce_embeddings(cooccurrence_matrix_normalized,projection_components=5_000,pca_components=100):
    random_projection = SparseRandomProjection(n_components=projection_components)
    print("Projecting")
    embeddings_reduced = random_projection.fit_transform(cooccurrence_matrix_normalized)
    print("Scaling")
    scaler = StandardScaler(with_mean=True)  
    embeddings_reduced = scaler.fit_transform(embeddings_reduced.toarray())
    print("SVD")
    svd = TruncatedSVD(n_components=pca_components)
    embeddings_reduced = svd.fit_transform(embeddings_reduced)
    return embeddings_reduced

In [33]:
#embeddings = reduce_embeddings(cooccurrence_matrix_normalized,8_000,200)
#save_embeddings_and_dict(csr_matrix(embeddings), word_to_idx, "embeddings_pca_200_75k_window5.npz", "word_to_index.pkl")

<h2>Testing the word embeddings</h2>

In [35]:
embeddings = load_npz('embeddings_pca_200_75k_window5.npz')

Printing similar words

In [37]:
import pandas as pd

words_to_check = ['baby', 'boat', 'energy', 'gold', 'danger','phone','philosophy','drink','fight']
results = {}

for word in words_to_check:
    similar_words = return_most_similar_words(embeddings, word_to_idx, word, k=5)
    results[word] = similar_words

df = pd.DataFrame(results)
df

Unnamed: 0,baby,boat,energy,gold,danger,phone,philosophy,drink,fight
0,babies,boats,energies,silver,terror,phones,philosophical,beverage,fighting
1,newborn,sail,generate,diamond,escape,cellphone,philosophies,drinks,enemy
2,babys,ship,generating,platinum,dangers,charging,philosophers,wine,fights
3,toddler,sailing,renewable,turquoise,struggle,handset,ethics,cocktail,enemies
4,mom,ships,produce,bronze,fear,cingular,thinkers,beer,boss


This test consists of checking whether we can correctly classify each word in unknown_words into its correct category in the categories dictionary based on their representations.

In [39]:
categories = {
    'animals': ['dog','horse','fish'],
    'food': ['pizza','bread','tuna'],
    'weather': ['rain','wind','humidity'],
    'sports': ['soccer','tennis','karate'],
    'technology': ['computer','screen','information']
}

unknown_words = [
    # Animals (8)
    'elephant', 'tiger', 'rabbit', 'cat', 'monkey', 'lion', 'giraffe', 'bear',

    # Food (8)
    'cheese', 'pasta', 'salad', 'burger', 'apple', 'steak', 'sushi', 'chocolate',

    # Weather (8)
    'snow', 'sun', 'storm', 'cloud', 'fog', 'lightning', 'hail', 'temperature',

    # Sports (8)
    'basketball', 'golf', 'baseball', 'boxing', 'hockey', 'hiking', 'swimming', 'cycling',

    # Technology (8)
    'internet', 'keyboard', 'software', 'algorithm', 'cable', 'machine', 'network', 'database'
]

random.shuffle(unknown_words)

In [40]:
def get_average_vector(words, embeddings, word_to_idx):
    vectors = [embeddings[word_to_idx[w]].toarray() for w in words]
    return np.mean(vectors, axis=0)

In [41]:
for word in unknown_words:
    curr_word_emb = embeddings[word_to_idx[word]].toarray()
    max_similarity = -1
    best_bin = None
    for category, bin_words in categories.items():
        bin_vec = get_average_vector(bin_words, embeddings,word_to_idx)
        similarity = cosine_similarity(curr_word_emb, bin_vec)[0][0]
        if similarity > max_similarity:
            max_similarity = similarity
            best_bin = category
    if best_bin:
        categories[best_bin].append(word)

In [42]:
import pandas as pd
max_len = max(len(words) for words in categories.values())
df = pd.DataFrame({
    category.capitalize(): categories[category] + [""] * (max_len - len(categories[category]))
    for category in categories
})

In [43]:
df.head(15)

Unnamed: 0,Animals,Food,Weather,Sports,Technology
0,dog,pizza,rain,soccer,computer
1,horse,bread,wind,tennis,screen
2,fish,tuna,humidity,karate,information
3,tiger,burger,snow,golf,database
4,elephant,cheese,sun,basketball,keyboard
5,cat,sushi,temperature,hiking,internet
6,rabbit,steak,lightning,cycling,cable
7,monkey,chocolate,cloud,baseball,network
8,bear,apple,hail,swimming,machine
9,giraffe,salad,storm,hockey,algorithm


<h3>Amazing!</h3>

<h2>Embedding sentences and Testing it</h2>

Here, we are going to try to generate sentence embeddings from word embeddings. The task is to identify which two out of three given sentences are on the same topic, where two sentences share a topic and the third does not.

I made sure that sentences in the same triplet don't share any words, which makes approaches like CountVectorizer with TF-IDF completely useless.

By randomly guessing the answer, we would get 33% correct. In our test, the results were 93%, which is impressive.

In [77]:
#Sample triplets. The rest are in the hidden cell
triplets = [
    (['carrots', 'are', 'rich', 'in', 'vitamin', 'a'],
     ['broccoli', 'contains', 'lots', 'of', 'fiber'],
     ['mountains', 'reach', 'into', 'the', 'sky']),

    (['lions', 'roam', 'across', 'savannahs'],
     ['cheetahs', 'sprint', 'after', 'prey'],
     ['paint', 'dries', 'slowly', 'on', 'canvas']),

    (['python', 'is', 'a', 'popular', 'language'],
     ['java', 'supports', 'object', 'oriented', 'coding'],
     ['balloons', 'float', 'during', 'parties'])
]

In [78]:
triplets = [
    (['carrots', 'are', 'rich', 'in', 'vitamin', 'a'],
     ['broccoli', 'contains', 'lots', 'of', 'fiber'],
     ['mountains', 'reach', 'into', 'the', 'sky']),

    (['lions', 'roam', 'across', 'savannahs'],
     ['cheetahs', 'sprint', 'after', 'prey'],
     ['paint', 'dries', 'slowly', 'on', 'canvas']),

    (['python', 'is', 'a', 'popular', 'language'],
     ['java', 'supports', 'object', 'oriented', 'coding'],
     ['balloons', 'float', 'during', 'parties']),

    (['cucumbers', 'are', 'cool', 'and', 'crisp'],
     ['zucchini', 'grows', 'in', 'warm', 'seasons'],
     ['giraffes', 'browse', 'on', 'acacia', 'trees']),

    (['basketball', 'requires', 'speed', 'with', 'agility'],
     ['soccer', 'demands', 'endurance', 'and', 'skill'],
     ['candles', 'melt', 'into', 'pools']),

    (['trains', 'move', 'along', 'tracks'],
     ['planes', 'fly', 'across', 'continents'],
     ['oranges', 'are', 'juicy', 'and', 'tart']),

    (['eagles', 'soar', 'above', 'valleys'],
     ['owls', 'hunt', 'by', 'night'],
     ['toys', 'scatter', 'across', 'the', 'floor']),

    (['volcanoes', 'erupt', 'with', 'magma'],
     ['earthquakes', 'shake', 'the', 'ground'],
     ['pencils', 'rest', 'on', 'a', 'desk']),

    (['rabbits', 'hop', 'through', 'fields'],
     ['foxes', 'prowl', 'after', 'dusk'],
     ['cameras', 'snap', 'bright', 'photos']),

    (['roses', 'smell', 'sweet', 'in', 'bloom'],
     ['tulips', 'sway', 'in', 'the', 'wind'],
     ['monkeys', 'swing', 'through', 'branches']),

    (['keyboards', 'have', 'many', 'buttons'],
     ['monitors', 'display', 'high', 'resolution'],
     ['snowflakes', 'land', 'on', 'tongues']),

    (['dolphins', 'leap', 'from', 'water'],
     ['whales', 'sing', 'beneath', 'waves'],
     ['bookshelves', 'line', 'the', 'hallway']),

    (['notebooks', 'hold', 'handwritten', 'notes'],
     ['binders', 'organize', 'paperwork', 'easily'],
     ['zebras', 'graze', 'on', 'plains']),

    (['chairs', 'support', 'our', 'backs'],
     ['couches', 'provide', 'comfort', 'at', 'home'],
     ['glaciers', 'carve', 'through', 'valleys']),

    (['bats', 'navigate', 'using', 'echoes'],
     ['mice', 'scurry', 'into', 'holes'],
     ['umbrellas', 'open', 'during', 'storms']),

    (['trombones', 'blare', 'in', 'orchestras'],
     ['violins', 'play', 'gentle', 'melodies'],
     ['coconuts', 'fall', 'from', 'trees']),

    (['beaches', 'glow', 'at', 'sunset'],
     ['islands', 'surround', 'calm', 'lagoons'],
     ['ladders', 'lean', 'against', 'walls']),

    (['kittens', 'chase', 'fluttering', 'leaves'],
     ['puppies', 'chew', 'on', 'rubber', 'toys'],
     ['mirrors', 'reflect', 'our', 'faces']),

    (['snowmen', 'stand', 'in', 'yards'],
     ['sleds', 'slide', 'down', 'slopes'],
     ['paintings', 'hang', 'on', 'galleries']),

    (['cereal', 'crunches', 'in', 'milk'],
     ['pancakes', 'soak', 'up', 'syrup'],
     ['seagulls', 'circle', 'above', 'harbors']),

    (['ice', 'melts', 'on', 'hot', 'days'],
     ['steam', 'rises', 'from', 'cups'],
     ['bamboo', 'bends', 'with', 'breezes']),

    (['bridges', 'span', 'over', 'rivers'],
     ['tunnels', 'run', 'under', 'cities'],
     ['gloves', 'keep', 'hands', 'warm']),

    (['apples', 'grow', 'on', 'orchards'],
     ['pears', 'ripen', 'in', 'autumn'],
     ['drums', 'beat', 'rhythms', 'loudly']),

    (['clouds', 'float', 'in', 'skies'],
     ['storms', 'gather', 'during', 'spring'],
     ['tickets', 'grant', 'entry', 'to', 'shows']),

    (['elevators', 'ascend', 'at', 'towers'],
     ['escalators', 'glide', 'between', 'floors'],
     ['foxglove', 'blooms', 'in', 'gardens']),

    (['hammers', 'strike', 'nails'],
     ['saws', 'cut', 'through', 'planks'],
     ['penguins', 'waddle', 'on', 'icebergs']),

    (['trumpets', 'blow', 'brassy', 'sounds'],
     ['flutes', 'whistle', 'soft', 'notes'],
     ['meadows', 'stretch', 'for', 'miles']),

    (['pencils', 'write', 'on', 'paper'],
     ['pens', 'leave', 'permanent', 'ink'],
     ['craters', 'dot', 'the', 'moon']),

    (['leopards', 'stalk', 'quietly'],
     ['hyenas', 'laugh', 'in', 'packs'],
     ['sandals', 'protect', 'feet', 'on', 'paths']),

    (['surfboards', 'glide', 'over', 'waves'],
     ['kayaks', 'drift', 'down', 'rivers'],
     ['magnets', 'stick', 'to', 'metal']),

    (['deserts', 'bake', 'beneath', 'suns'],
     ['cacti', 'store', 'water'],
     ['suits', 'hang', 'in', 'closets']),

    (['computers', 'run', 'on', 'electricity'],
     ['routers', 'manage', 'network', 'traffic'],
     ['candies', 'come', 'in', 'wrappers']),

    (['spoons', 'hold', 'liquid', 'foods'],
     ['forks', 'pierce', 'vegetables'],
     ['alligators', 'swim', 'in', 'marshes']),

    (['guitars', 'strum', 'melodic', 'chords'],
     ['drummers', 'keep', 'time', 'with', 'beats'],
     ['kites', 'fly', 'on', 'windy', 'days']),

    (['planets', 'orbit', 'distant', 'stars'],
     ['asteroids', 'travel', 'through', 'space'],
     ['suitcases', 'carry', 'clothes', 'for', 'trips']),

    (['honeybees', 'collect', 'nectar'],
     ['wasps', 'build', 'paper', 'nests'],
     ['scissors', 'cut', 'through', 'fabric']),

    (['crayons', 'color', 'on', 'pages'],
     ['markers', 'leave', 'bold', 'lines'],
     ['fireworks', 'explode', 'during', 'holidays']),

    (['sunscreen', 'protects', 'skin'],
     ['hats', 'shade', 'your', 'face'],
     ['statues', 'stand', 'in', 'parks']),

    (['beavers', 'build', 'dams'],
     ['otters', 'float', 'on', 'backs'],
     ['backpacks', 'carry', 'books']),

    (['watches', 'track', 'time'],
     ['alarms', 'wake', 'people', 'early'],
     ['bubbles', 'rise', 'in', 'water']),

    (['jeans', 'fit', 'snugly'],
     ['jackets', 'keep', 'you', 'warm'],
     ['lanterns', 'glow', 'in', 'darkness']),

    (['raindrops', 'tap', 'on', 'windows'],
     ['breeze', 'enters', 'the','door'],
     ['mirrors', 'shatter', 'when', 'punched']),

    (['hedgehogs', 'curl', 'into', 'balls'],
     ['badgers', 'dig', 'deep', 'holes'],
     ['rulers', 'measure', 'straight', 'lines']),

    (['windmills', 'spin', 'in', 'breezes'],
     ['solar', 'panels', 'absorb', 'sunlight'],
     ['picnics', 'happen', 'on', 'lawns']),

    (['camels', 'endure', 'heat'],
     ['lizards', 'scuttle', 'on', 'rocks'],
     ['couches', 'face', 'televisions']),

    (['schoolbags', 'hold', 'supplies'],
     ['pencilcases', 'store', 'erasers'],
     ['staircases', 'lead', 'to', 'attics']),

    (['penguins', 'dive', 'under', 'ice'],
     ['seals', 'slide', 'on', 'bellies'],
     ['statements', 'end', 'with', 'periods']),

    (['lava', 'flows', 'from', 'craters'],
     ['ash', 'clouds', 'fill', 'the', 'sky'],
     ['baskets', 'carry', 'fruits']),

    (['fireflies', 'blink', 'in', 'meadows'],
     ['crickets', 'chirp', 'over', 'grass'],
     ['pillows', 'soften', 'your', 'head']),
    (
        ['we', 'toured', 'city', 'on', 'a', 'bus', 'visited', 'the'],
        ['she', 'booked', 'her', 'flights', 'to', 'new', 'york', 'for', 'vacation'],
        ['he', 'fixed', 'leak', 'in', 'roof', 'of', 'house']
    ),
        (['brightly', 'orange', 'carrots', 'are', 'very', 'rich', 'in', 'essential', 'vitamin', 'a'],
     ['green', 'broccoli', 'contains', 'a', 'significant', 'amount', 'of', 'dietary', 'fiber'],
     ['tall', 'mountains', 'majestically', 'reach', 'far', 'into', 'the', 'blue', 'sky']),

    (['ferocious', 'lions', 'often', 'roam', 'freely', 'across', 'vast', 'African', 'savannahs'],
     ['swift', 'cheetahs', 'can', 'sprint', 'rapidly', 'after', 'their', 'fleeting', 'prey'],
     ['wet', 'paint', 'typically', 'dries', 'quite', 'slowly', 'on', 'a', 'stretched', 'canvas']),

    (['popular', 'python', 'is', 'considered', 'a', 'versatile', 'programming', 'language'],
     ['robust', 'java', 'effectively', 'supports', 'complex', 'object', 'oriented', 'software', 'coding'],
     ['colorful', 'balloons', 'gently', 'float', 'upward', 'during', 'festive', 'outdoor', 'parties']),

    (['cool', 'and', 'crisp', 'cucumbers', 'are', 'refreshing', 'summer', 'vegetables'],
     ['green', 'zucchini', 'usually', 'grows', 'well', 'in', 'long', 'warm', 'summer', 'seasons'],
     ['graceful', 'giraffes', 'frequently', 'browse', 'peacefully', 'on', 'tall', 'acacia', 'trees']),

    (['fast', 'basketball', 'often', 'requires', 'both', 'great', 'speed', 'along', 'with', 'excellent', 'agility'],
     ['competitive', 'soccer', 'typically', 'demands', 'significant', 'endurance', 'plus', 'considerable', 'foot', 'skill'],
     ['lit', 'candles', 'slowly', 'melt', 'down', 'into', 'small', 'liquid', 'wax', 'pools']),

    (['long', 'trains', 'frequently', 'move', 'steadily', 'along', 'steel', 'tracks'],
     ['large', 'planes', 'can', 'fly', 'swiftly', 'across', 'distant', 'continents'],
     ['ripe', 'oranges', 'are', 'generally', 'quite', 'juicy', 'and', 'pleasantly', 'tart']),

    (['majestic', 'eagles', 'often', 'soar', 'gracefully', 'high', 'above', 'wide', 'valleys'],
     ['nocturnal', 'owls', 'primarily', 'hunt', 'actively', 'by', 'the', 'dark', 'night'],
     ['various', 'toys', 'tend', 'to', 'scatter', 'randomly', 'across', 'the', 'untidy', 'floor']),

    (['active', 'volcanoes', 'can', 'violently', 'erupt', 'suddenly', 'with', 'molten', 'hot', 'magma'],
     ['powerful', 'earthquakes', 'can', 'violently', 'shake', 'the', 'solid', 'ground', 'beneath'],
     ['wooden', 'pencils', 'usually', 'rest', 'motionless', 'on', 'a', 'cluttered', 'desk']),

    (['quick', 'rabbits', 'often', 'hop', 'merrily', 'through', 'green', 'fields'],
     ['stealthy', 'foxes', 'typically', 'prowl', 'quietly', 'after', 'the', 'late', 'dusk'],
     ['modern', 'cameras', 'can', 'easily', 'snap', 'clear', 'bright', 'digital', 'photos']),

    (['fragrant', 'roses', 'often', 'smell', 'wonderfully', 'sweet', 'when', 'in', 'full', 'bloom'],
     ['colorful', 'tulips', 'gently', 'sway', 'back', 'and', 'forth', 'in', 'the', 'breeze', 'wind'],
     ['agile', 'monkeys', 'frequently', 'swing', 'effortlessly', 'through', 'tall', 'forest', 'branches']),

    (['modern', 'keyboards', 'typically', 'have', 'numerous', 'small', 'plastic', 'buttons'],
     ['large', 'monitors', 'clearly', 'display', 'high', 'screen', 'resolution', 'images'],
     ['delicate', 'snowflakes', 'gently', 'land', 'softly', 'on', 'outstretched', 'tongues']),

    (['playful', 'dolphins', 'often', 'leap', 'joyfully', 'high', 'from', 'the', 'ocean', 'water'],
     ['large', 'whales', 'sometimes', 'sing', 'melodically', 'deep', 'beneath', 'the', 'ocean', 'waves'],
     ['tall', 'bookshelves', 'usually', 'line', 'the', 'long', 'narrow', 'hallway']),

    (['organized', 'notebooks', 'frequently', 'hold', 'many', 'handwritten', 'personal', 'notes'],
     ['sturdy', 'binders', 'help', 'organize', 'loose', 'paperwork', 'quite', 'easily', 'by', 'topic'],
     ['wild', 'zebras', 'often', 'graze', 'peacefully', 'on', 'the', 'vast', 'African', 'plains']),

    (['comfortable', 'chairs', 'are', 'designed', 'to', 'support', 'our', 'tired', 'backs'],
     ['soft', 'couches', 'usually', 'provide', 'great', 'comfort', 'to', 'people', 'at', 'home'],
     ['massive', 'glaciers', 'slowly', 'carve', 'deeply', 'through', 'mountainous', 'valleys']),

    (['nocturnal', 'bats', 'skillfully', 'navigate', 'effectively', 'using', 'high-frequency', 'echoes'],
     ['tiny', 'mice', 'often', 'scurry', 'quickly', 'into', 'small', 'dark', 'holes'],
     ['large', 'umbrellas', 'typically', 'open', 'wide', 'during', 'heavy', 'rain', 'storms']),

    (['loud', 'trombones', 'can', 'blare', 'powerful', 'sounds', 'within', 'orchestras'],
     ['melodic', 'violins', 'beautifully', 'play', 'soft', 'and', 'gentle', 'musical', 'melodies'],
     ['ripe', 'coconuts', 'can', 'suddenly', 'fall', 'down', 'from', 'tall', 'palm', 'trees']),

    (['sandy', 'beaches', 'often', 'glow', 'warmly', 'at', 'the', 'beautiful', 'sunset'],
     ['tropical', 'islands', 'typically', 'surround', 'calm', 'clear', 'blue', 'lagoons'],
     ['tall', 'ladders', 'usually', 'lean', 'precariously', 'against', 'exterior', 'walls']),

    (['playful', 'kittens', 'often', 'chase', 'quickly', 'fluttering', 'colorful', 'leaves'],
     ['small', 'puppies', 'frequently', 'chew', 'happily', 'on', 'soft', 'rubber', 'toys'],
     ['shiny', 'mirrors', 'clearly', 'reflect', 'our', 'own', 'unique', 'faces']),

    (['large', 'snowmen', 'often', 'stand', 'still', 'in', 'snow-covered', 'yards'],
     ['fast', 'sleds', 'quickly', 'slide', 'rapidly', 'down', 'icy', 'slopes'],
     ['framed', 'paintings', 'usually', 'hang', 'decoratively', 'on', 'art', 'galleries']),

    (['crispy', 'cereal', 'often', 'crunches', 'noisily', 'in', 'cold', 'white', 'milk'],
     ['fluffy', 'pancakes', 'readily', 'soak', 'completely', 'up', 'sweet', 'maple', 'syrup'],
     ['graceful', 'seagulls', 'frequently', 'circle', 'lazily', 'high', 'above', 'busy', 'harbors']),

    (['clear', 'ice', 'quickly', 'melts', 'away', 'on', 'very', 'hot', 'summer', 'days'],
     ['hot', 'steam', 'visibly', 'rises', 'upward', 'from', 'warm', 'coffee', 'cups'],
     ['tall', 'bamboo', 'often', 'bends', 'gently', 'back', 'and', 'forth', 'with', 'light', 'breezes']),

    (['sturdy', 'bridges', 'typically', 'span', 'widely', 'over', 'flowing', 'rivers'],
     ['dark', 'tunnels', 'usually', 'run', 'deeply', 'under', 'large', 'cities'],
     ['warm', 'gloves', 'effectively', 'keep', 'our', 'hands', 'protected', 'and', 'warm']),

    (['red', 'apples', 'commonly', 'grow', 'abundantly', 'on', 'fruitful', 'orchards'],
     ['ripe', 'pears', 'typically', 'ripen', 'slowly', 'in', 'the', 'autumn', 'season'],
     ['loud', 'drums', 'often', 'beat', 'strong', 'rhythms', 'very', 'loudly']),

    (['white', 'clouds', 'frequently', 'float', 'lazily', 'high', 'in', 'the', 'blue', 'skies'],
     ['severe', 'storms', 'often', 'gather', 'quickly', 'during', 'the', 'springtime', 'season'],
     ['paper', 'tickets', 'usually', 'grant', 'easy', 'entry', 'to', 'various', 'shows']),

    (['fast', 'elevators', 'quickly', 'ascend', 'smoothly', 'at', 'tall', 'city', 'towers'],
     ['modern', 'escalators', 'gently', 'glide', 'effortlessly', 'between', 'different', 'floors'],
     ['purple', 'foxglove', 'beautifully', 'blooms', 'vibrantly', 'in', 'well-tended', 'gardens']),

    (['heavy', 'hammers', 'forcefully', 'strike', 'metal', 'nails'],
     ['sharp', 'saws', 'efficiently', 'cut', 'cleanly', 'through', 'wooden', 'planks'],
     ['flightless', 'penguins', 'often', 'waddle', 'awkwardly', 'on', 'slippery', 'icebergs']),

    (['loud', 'trumpets', 'can', 'blow', 'strong', 'brassy', 'musical', 'sounds'],
     ['high', 'flutes', 'often', 'whistle', 'soft', 'and', 'gentle', 'high', 'notes'],
     ['green', 'meadows', 'typically', 'stretch', 'outward', 'for', 'many', 'long', 'miles']),

    (['wooden', 'pencils', 'are', 'used', 'to', 'write', 'clearly', 'on', 'white', 'paper'],
     ['smooth', 'pens', 'typically', 'leave', 'permanent', 'dark', 'ink', 'marks'],
     ['numerous', 'craters', 'distinctly', 'dot', 'the', 'barren', 'surface', 'of', 'the', 'moon']),

    (['stealthy', 'leopards', 'often', 'stalk', 'their', 'prey', 'very', 'quietly'],
     ['vocal', 'hyenas', 'frequently', 'laugh', 'loudly', 'in', 'large', 'social', 'packs'],
     ['simple', 'sandals', 'help', 'protect', 'bare', 'feet', 'while', 'on', 'sandy', 'paths']),

    (['fast', 'surfboards', 'skillfully', 'glide', 'smoothly', 'over', 'ocean', 'waves'],
     ['small', 'kayaks', 'often', 'drift', 'gently', 'down', 'winding', 'rivers'],
     ['powerful', 'magnets', 'strongly', 'stick', 'firmly', 'to', 'ferrous', 'metal', 'surfaces']),

    (['arid', 'deserts', 'often', 'bake', 'intensely', 'beneath', 'the', 'hot', 'suns'],
     ['tough', 'cacti', 'efficiently', 'store', 'precious', 'water', 'inside'],
     ['formal', 'suits', 'usually', 'hang', 'neatly', 'in', 'spacious', 'closets']),

    (['modern', 'computers', 'typically', 'run', 'efficiently', 'on', 'electrical', 'electricity'],
     ['network', 'routers', 'actively', 'manage', 'complex', 'internet', 'traffic'],
     ['sweet', 'candies', 'usually', 'come', 'individually', 'in', 'colorful', 'wrappers']),

    (['metal', 'spoons', 'are', 'designed', 'to', 'hold', 'liquid', 'hot', 'foods'],
     ['sharp', 'forks', 'are', 'used', 'to', 'pierce', 'various', 'cooked', 'vegetables'],
     ['large', 'alligators', 'often', 'swim', 'slowly', 'in', 'swampy', 'marshes']),

    (['acoustic', 'guitars', 'often', 'strum', 'beautiful', 'melodic', 'musical', 'chords'],
     ['talented', 'drummers', 'skillfully', 'keep', 'precise', 'time', 'with', 'strong', 'beats'],
     ['colorful', 'kites', 'frequently', 'fly', 'high', 'on', 'breezy', 'windy', 'days']),

    (['distant', 'planets', 'continuously', 'orbit', 'faraway', 'bright', 'stars'],
     ['small', 'asteroids', 'constantly', 'travel', 'rapidly', 'through', 'outer', 'space'],
     ['packed', 'suitcases', 'are', 'used', 'to', 'carry', 'personal', 'clothes', 'for', 'trips']),

    (['busy', 'honeybees', 'actively', 'collect', 'sweet', 'flower', 'nectar'],
     ['industrious', 'wasps', 'often', 'build', 'intricate', 'paper', 'nests'],
     ['sharp', 'scissors', 'are', 'used', 'to', 'cut', 'cleanly', 'through', 'fabric']),

    (['bright', 'crayons', 'are', 'used', 'to', 'color', 'vibrantly', 'on', 'paper', 'pages'],
     ['bold', 'markers', 'typically', 'leave', 'distinct', 'dark', 'lines', 'easily'],
     ['loud', 'fireworks', 'spectacularly', 'explode', 'noisily', 'during', 'special', 'holidays']),

    (['effective', 'sunscreen', 'helps', 'protects', 'sensitive', 'skin', 'from', 'sun'],
     ['wide', 'hats', 'effectively', 'shade', 'your', 'delicate', 'face', 'from', 'light'],
     ['stone', 'statues', 'usually', 'stand', 'motionless', 'in', 'public', 'parks']),

    (['busy', 'beavers', 'actively', 'build', 'sturdy', 'wooden', 'dams'],
     ['aquatic', 'otters', 'often', 'float', 'lazily', 'on', 'their', 'backs', 'in', 'water'],
     ['large', 'backpacks', 'are', 'designed', 'to', 'carry', 'heavy', 'school', 'books']),

    (['accurate', 'watches', 'are', 'designed', 'to', 'track', 'precise', 'time'],
     ['loud', 'alarms', 'are', 'set', 'to', 'wake', 'sleeping', 'people', 'quite', 'early'],
     ['clear', 'bubbles', 'often', 'rise', 'slowly', 'in', 'clear', 'water']),

    (['comfortable', 'jeans', 'usually', 'fit', 'quite', 'snugly', 'on', 'legs'],
     ['warm', 'jackets', 'are', 'worn', 'to', 'keep', 'you', 'comfortably', 'warm'],
     ['bright', 'lanterns', 'typically', 'glow', 'softly', 'in', 'total', 'darkness']),

    (['heavy', 'raindrops', 'often', 'tap', 'gently', 'on', 'glass', 'windows'],
     ['loud', 'thunder', 'frequently', 'rumbles', 'deeply', 'overhead', 'in', 'sky'],
     ['fragile', 'mirrors', 'can', 'easily', 'shatter', 'into', 'pieces', 'when', 'dropped']),

    (['shy', 'hedgehogs', 'quickly', 'curl', 'themselves', 'into', 'tight', 'balls'],
     ['strong', 'badgers', 'actively', 'dig', 'deep', 'underground', 'holes'],
     ['straight', 'rulers', 'are', 'used', 'to', 'measure', 'accurate', 'straight', 'lines']),
    
    (['efficient', 'solar', 'panels', 'effectively', 'absorb', 'bright', 'sunlight'],
     ['strong', 'windmills', 'steadily', 'spin', 'around', 'in', 'light', 'breezes'],
     ['outdoor', 'picnics', 'frequently', 'happen', 'happily', 'on', 'green', 'lawns'],
     ),

    (['sturdy', 'camels', 'can', 'endure', 'extreme', 'heat', 'well'],
     ['small', 'lizards', 'quickly', 'scuttle', 'rapidly', 'on', 'hot', 'rocks'],
     ['soft', 'couches', 'are', 'placed', 'to', 'face', 'large', 'televisions']),

    (['heavy', 'schoolbags', 'are', 'designed', 'to', 'hold', 'many', 'supplies'],
     ['small', 'pencilcases', 'are', 'used', 'to', 'store', 'small', 'erasers'],
     ['long', 'staircases', 'typically', 'lead', 'upward', 'to', 'dark', 'attics']),

    (['aquatic', 'penguins', 'often', 'dive', 'deeply', 'under', 'thick', 'ice'],
     ['slippery', 'seals', 'frequently', 'slide', 'smoothly', 'on', 'their', 'bellies'],
     ['clear', 'statements', 'always', 'end', 'correctly', 'with', 'full', 'periods']),

    (['molten', 'lava', 'violently', 'flows', 'freely', 'from', 'volcanic', 'craters'],
     ['dense', 'ash', 'clouds', 'rapidly', 'fill', 'the', 'dark', 'sky', 'above'],
     ['woven', 'baskets', 'are', 'used', 'to', 'carry', 'fresh', 'fruits']),

    (['glowing', 'fireflies', 'often', 'blink', 'brightly', 'in', 'dark', 'meadows'],
     ['noisy', 'crickets', 'frequently', 'chirp', 'loudly', 'over', 'green', 'grass'],
     ['soft', 'pillows', 'are', 'designed', 'to', 'soften', 'your', 'tired', 'head']),
     (
    ['we', 'toured', 'the', 'historic', 'city', 'on', 'a', 'bus', 'and', 'visited', 'the'],
    ['she', 'booked', 'her', 'round-trip', 'flights', 'to', 'new', 'york', 'city', 'for', 'her', 'summer', 'vacation'],
    ['he', 'fixed', 'the', 'leaking', 'roof', 'of', 'his', 'old', 'house', 'himself']
     ),

]


In [378]:
def embed_sentence(sentence, word_to_index, embeddings, word_count,a=1e-4):
    vecs = []
    total_count = sum(word_count.values())

    for word in sentence:
        idx = word_to_index.get(word)
        if idx is not None:
            vec = embeddings[idx]
            if hasattr(vec, "toarray"): 
                vec = vec.toarray().ravel()

            count = word_count.get(word, 1)
            prob = count / total_count
            weight = a / (a + prob)

            vecs.append(vec * weight)

    if not vecs:
        return np.zeros(embeddings.shape[1])

    vecs = np.stack(vecs)
    sent_vec = np.sum(vecs, axis=0)
    norm = np.linalg.norm(sent_vec)
    return sent_vec / norm if norm else sent_vec

In [380]:
def most_similar_pair_with_embeddings(s1, s2, s3, word_to_index, embeddings, word_count,a=1e-4):
    v1 = embed_sentence(s1, word_to_index, embeddings, word_count,a)
    v2 = embed_sentence(s2, word_to_index, embeddings, word_count,a)
    v3 = embed_sentence(s3, word_to_index, embeddings, word_count,a)
    
    similarity_12 = np.dot(v1, v2)
    similarity_13 = np.dot(v1, v3)
    similarity_23 = np.dot(v2, v3)

    scores = [similarity_12, similarity_13, similarity_23]
    return np.argmax(scores), scores

In [382]:
def test_embeddings(embeddings,word_to_idx,a=1e-4):
    correct = 0
    for i, (s1, s2, s3) in enumerate(triplets):
        idx, _ = most_similar_pair_with_embeddings(s1, s2, s3, word_to_idx, embeddings,word_count,a)
        is_correct = idx == 0  
        correct += is_correct
    print(f"Accuracy: {correct}/{len(triplets)} correct")

In [384]:
embeddings = load_npz('embeddings_pca_200_75k_window5.npz')
test_embeddings(embeddings,word_to_idx)

Accuracy: 93/100 correct


In [640]:
def extract_reviews(filename,review_col,max_reviews_per_file):
    reviews = []
    try:
        with open(filename, 'r', encoding="ISO-8859-1") as file:
            total_reviews = sum(1 for _ in file)
            print(f"{filename} has {total_reviews - 1} reviews")
            file.seek(0)
            my_reader = csv.reader(file, delimiter=',')
            next(my_reader, None)
            for i, row in enumerate(my_reader):
                if i >= max_reviews_per_file:
                    break
                reviews.append(row[review_col])
    except Exception as e:
        print(f"Error reading {filename}: {e}")
    return reviews

In [642]:
def csv_pipeline(filename,col,max_reviews_per_file=10e7):
    clean_reviews = extract_reviews(filename,col,max_reviews_per_file)
    original_reviews = [rev for rev in clean_reviews]
    lower_case_reviews(clean_reviews)
    replace_punctuation(clean_reviews)
    split_sentences_into_words(clean_reviews)
    replace_words_with_special_symbols(clean_reviews)
    replace_digit_words(clean_reviews)
    remove_non_alpha_numeric(clean_reviews)
    return clean_reviews,original_reviews

Loading reviews of single recipe

In [645]:
clean_reviews,original_reviews = csv_pipeline('recipe_reviews.csv',5)

recipe_reviews.csv has 2325 reviews


Lowercasing: 100%|████████████████████████████████████████████████████████████| 2182/2182 [00:00<00:00, 2182158.16it/s]
Cleaning reviews: 100%|████████████████████████████████████████████████████████| 2182/2182 [00:00<00:00, 285179.21it/s]
Splitting sentences into words: 100%|██████████████████████████████████████████| 2182/2182 [00:00<00:00, 411250.62it/s]
Replacing special symbol words: 100%|███████████████████████████████████████████| 2182/2182 [00:00<00:00, 78850.76it/s]
Replacing digit words: 100%|████████████████████████████████████████████████████| 2182/2182 [00:00<00:00, 35925.02it/s]
Removing None values: 100%|████████████████████████████████████████████████████████████████| 2182/2182 [00:00<?, ?it/s]
Removing non-alphanumeric words: 100%|██████████████████████████████████████████| 2182/2182 [00:00<00:00, 37697.34it/s]


In [647]:
reviews_embeddings = np.array([embed_sentence(rev, word_to_idx, embeddings, word_count) for rev in clean_reviews])

In [648]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering

In [651]:
if hasattr(embeddings,'toarray'):
    embeddings = embeddings.toarray()

In [653]:
agglo = AgglomerativeClustering(n_clusters=4)
agglo.fit(reviews_embeddings)
labels = agglo.labels_
values, value_counts = np.unique(labels, return_counts=True)
for label, count in zip(values, value_counts):
    print(f'Label {label}: {count * 100 / len(reviews_embeddings):.2f}%')

Label 0: 48.35%
Label 1: 12.51%
Label 2: 29.01%
Label 3: 10.13%


In [655]:
def find_closest_reviews_to_computed_centroids(reviews, review_embeddings, labels, top_k=3):
    closest_reviews_per_cluster = []
    unique_labels = np.unique(labels)
    for label in unique_labels:
        cluster_reviews_idx = np.where(labels == label)[0]
        cluster_embeddings = review_embeddings[cluster_reviews_idx]
        centroid = np.mean(cluster_embeddings, axis=0)
        distances = np.sum((cluster_embeddings - centroid.reshape(1, -1))**2, axis=1)
        top_indices = np.argsort(distances)[:top_k]
        top_reviews = [reviews[cluster_reviews_idx[idx]] for idx in top_indices]
        closest_reviews_per_cluster.append(top_reviews)
    return closest_reviews_per_cluster

In [657]:
closest_reviews = find_closest_reviews_to_computed_centroids(original_reviews, reviews_embeddings, labels, top_k=5)

In [659]:
for i, reviews in enumerate(closest_reviews):
    print(f"\nCluster {i+1}:\n")
    for j in range(len(reviews)):
        print('- ', closest_reviews[i][j] + "\n")


Cluster 1:

-  Im only 15 and this is my second time making banana bread. The first time i used a different recipe and it came out so bad, but i found this recipe and it came out delicious. The perfect recipe for banana bread, im not that great at baking but this is by far the best.

-  First time I baked banana bread and it turned out perfectly! I used only 1/2 cup of sugar and 4 bananas. Super moist, just the right amount of sweetness for me. Thank you for the recipe!

-  This was an easy banana bread. I chose to use 4 bananas. My fiance and I both agree that this made it too moist for our liking. The bread fell apart. I also added chocolate chips. Looking forward to trying this recipe with 3 bananas and maybe with some cinnamon as another reviewer said. This recipe fit perfectly into my loaf pan, it did not overflow. It took me an extra 15 minutes (probably due to extra moistness with the 4th banana) to cook versus the time listed. I cooked it on on the middle shelf of my gas oven.

We can see how:  
the first cluster talks about positive personal stories  
the second cluster is mainly about complaints  
the third cluster talks about modifications to the recipe  
the fourth cluster are short compliments of the recipe