In [1]:
# Imports
#%load_ext cudf.pandas
from pandas import read_parquet
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from transformers import AutoModel
from IPython.display import clear_output


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reads the reviews dataset from its parquet directory into a dataframe.
reviews_path = "../../data/reviews.parquet/"
df_reviews = read_parquet(reviews_path)

In [3]:
# List of user tags from the Steam website, used further down as the zero-shot topic list.
# TODO: this should not live in code; write it to a csv file (and maybe write a quick
# notebook to scrape it from the web).
# Tags are stored as plain text: HTML entities left over from the web page ("&amp;") are
# decoded to "&" and stray trailing whitespace is removed, so the labels that get embedded
# match what Steam actually displays.
user_tags = [
    "Indie",
    "Action",
    "Adventure",
    "Casual",
    "RPG",
    "Singleplayer",
    "Simulation",
    "Strategy",
    "Early Access",
    "Free to Play",
    "2D",
    "3D",
    "Atmospheric",
    "Colorful",
    "Story Rich",
    "Fantasy",
    "Exploration",
    "Multiplayer",
    "Puzzle",
    "Pixel Graphics",
    "Cute",
    "Combat",
    "First-Person",
    "Massively Multiplayer",
    "Sports",
    "Action-Adventure",
    "Violent",
    "Funny",
    "Anime",
    "Arcade",
    "Relaxing",
    "Sci-fi",
    "Horror",
    "Shooter",
    "Racing",
    "Controller",
    "Stylized",
    "Sexual Content",
    "Third Person",
    "Family Friendly",
    "Nudity",
    "Retro",
    "Female Protagonist",
    "PvE",
    "Co-op",
    "Open World",
    "Choices Matter",
    "Gore",
    "Top-Down",
    "PvP",
    "Realistic",
    "Survival",
    "Platformer",
    "Dark",
    "Linear",
    "Comedy",
    "Mystery",
    "Character Customization",
    "Cartoony",
    "Visual Novel",
    "Multiple Endings",
    "Physics",
    "2D Platformer",
    "Online Co-Op",
    "Psychological Horror",
    "FPS",
    "Magic",
    "Old School",
    "Sandbox",
    "Action RPG",
    "Tactical",
    "Medieval",
    "Roguelike",
    "Difficult",
    "Futuristic",
    "Hand-drawn",
    "Minimalist",
    "Building",
    "Roguelite",
    "Point & Click",
    "Crafting",
    "Management",
    "Space",
    "VR",
    "Drama",
    "Cartoon",
    "Immersive Sim",
    "Resource Management",
    "Dark Fantasy",
    "3D Platformer",
    "Emotional",
    "Procedural Generation",
    "Logic",
    "Survival Horror",
    "Choose Your Own Adventure",
    "Romance",
    "Local Multiplayer",
    "Great Soundtrack",
    "Action Roguelike",
    "Shoot 'Em Up",
    "Turn-Based Combat",
    "Side Scroller",
    "Nature",
    "Interactive Fiction",
    "Mature",
    "Hack and Slash",
    "Turn-Based Tactics",
    "Turn-Based Strategy",
    "Education",
    "Puzzle Platformer",
    "Hentai",
    "Base Building",
    "Design & Illustration",
    "Hidden Object",
    "Dating Sim",
    "1990's",
    "Post-apocalyptic",
    "War",
    "Surreal",
    "Zombies",
    "1980s",
    "Bullet Hell",
    "Utilities",
    "Tabletop",
    "Dungeon Crawler",
    "Walking Simulator",
    "JRPG",
    "Cinematic",
    "NSFW",
    "Stealth",
    "Score Attack",
    "Local Co-Op",
    "Narration",
    "Historical",
    "Conversation",
    "Party-Based RPG",
    "Text-Based",
    "LGBTQ+",
    "Investigation",
    "Replay Value",
    "Third-Person Shooter",
    "2.5D",
    "Military",
    "Card Game",
    "Lore-Rich",
    "Life Sim",
    "Isometric",
    "Top-Down Shooter",
    "Inventory Management",
    "Turn-Based",
    "Cookie Clicker",
    "Aliens",
    "Nonlinear",
    "Demons",
    "Dark Humor",
    "Psychological",
    "Robots",
    "Thriller",
    "Team-Based",
    "Cyberpunk",
    "Tutorial",
    "Supernatural",
    "Strategy RPG",
    "Economy",
    "Real Time Tactics",
    "Artificial Intelligence",
    "RTS",
    "Detective",
    "Abstract",
    "Time Management",
    "Perma Death",
    "Memes",
    "Arena Shooter",
    "Modern",
    "Driving",
    "Loot",
    "Web Publishing",
    "Precision Platformer",
    "Board Game",
    "Dystopian",
    "Tower Defense",
    "Psychedelic",
    "Souls-like",
    "Tactical RPG",
    "Collectathon",
    "Comic Book",
    "Deckbuilding",
    "City Builder",
    "Software",
    "4 Player Local",
    "Animation & Modeling",
    "Alternate History",
    "Idler",
    "Fast-Paced",
    "Wargame",
    "Short",
    "Mythology",
    "Game Development",
    "Beat 'em up",
    "Flight",
    "Metroidvania",
    "Destruction",
    "Soundtrack",
    "Runner",
    "Level Editor",
    "Card Battler",
    "Music",
    "Moddable",
    "Crime",
    "Grid-Based Movement",
    "Parkour",
    "CRPG",
    "RPGMaker",
    "Cats",
    "Class-Based",
    "2D Fighter",
    "Classic",
    "Philosophical",
    "Dark Comedy",
    "Automobile Sim",
    "Capitalism",
    "Gun Customization",
    "Fighting",
    "Creature Collector",
    "MMORPG",
    "Rhythm",
    "Automation",
    "Competitive",
    "Experimental",
    "Video Production",
    "Science",
    "Twin Stick Shooter",
    "Swordplay",
    "Movie",
    "Co-op Campaign",
    "Vehicular Combat",
    "Lovecraftian",
    "3D Fighter",
    "Beautiful",
    "Audio Production",
    "Battle Royale",
    "Dragons",
    "Farming Sim",
    "America",
    "World War II",
    "Trading",
    "eSports",
    "Noir",
    "Cooking",
    "Conspiracy",
    "Grand Strategy",
    "Space Sim",
    "3D Vision",
    "6DOF",
    "Quick-Time Events",
    "Split Screen",
    "Parody",
    "Colony Sim",
    "Mystery Dungeon",
    "Bullet Time",
    "Satire",
    "Looter Shooter",
    "Dynamic Narration",
    "Gothic",
    "Match 3",
    "Word Game",
    "Cozy",
    "Agriculture",
    "Hero Shooter",
    "Auto Battler",
    "Time Manipulation",
    "Open World Survival Craft",
    "Underground",
    "Spectacle fighter",
    "Blood",
    "Martial Arts",
    "Combat Racing",
    "Software Training",
    "Voxel",
    "Mechs",
    "Political",
    "Mining",
    "Time Travel",
    "Steampunk",
    "Dog",
    "Otome",
    "Immersive",
    "Pirates",
    "Action RTS",
    "FMV",
    "Vampire",
    "Solitaire",
    "God Game",
    "Hunting",
    "Narrative",
    "Fishing",
    "Asynchronous Multiplayer",
    "Character Action Game",
    "Trading Card Game",
    "Ninja",
    "Wholesome",
    "Tanks",
    "MOBA",
    "Transportation",
    "Illuminati",
    "Hex Grid",
    "Hacking",
    "Underwater",
    "Politics",
    "Faith",
    "Assassin",
    "Addictive",
    "Superhero",
    "Remake",
    "4X",
    "Photo Editing",
    "Dinosaurs",
    "Roguelike Deckbuilder",
    "Sokoban",
    "Mouse only",
    "Party Game",
    "Political Sim",
    "Cold War",
    "Trains",
    "Gambling",
    "Heist",
    "Western",
    "Party",
    "Programming",
    "Foreign",
    "Real-Time",
    "Boomer Shooter",
    "Diplomacy",
    "Episodic",
    "Archery",
    "Naval",
    "Minigames",
    "Snow",
    "Epic",
    "Cult Classic",
    "Traditional Roguelike",
    "Typing",
    "Naval Combat",
    "Escape Room",
    "Transhumanism",
    "Sailing",
    "Real-Time with Pause",
    "Werewolves",
    "Villain Protagonist",
    "Sniper",
    "Nostalgia",
    "Horses",
    "On-Rails Shooter",
    "Sequel",
    "Offroad",
    "Trivia",
    "Dungeons & Dragons",
    "Music-Based Procedural Generation",
    "Farming",
    "Time Attack",
    "Football (Soccer)",
    "Mars",
    "Touch-Friendly",
    "World War I",
    "Mod",
    "360 Video",
    "Kickstarter",
    "Boxing",
    "Jet",
    "Experience",
    "Spelling",
    "Spaceships",
    "Outbreak Sim",
    "Gaming",
    "GameMaker",
    "Roguevania",
    "Games Workshop",
    "Chess",
    "Motorbike",
    "Medical Sim",
    "Submarine",
    "Unforgiving",
    "LEGO",
    "Bikes",
    "Rome",
    "Dwarf",
    "Basketball",
    "Golf",
    "Asymmetric VR",
    "Electronic Music",
    "Pinball",
    "Jump Scare",
    "Silent Protagonist",
    "Social Deduction",
    "Ambient",
    "Documentary",
    "Baseball",
    "Job Simulator",
    "Skateboarding",
    "Instrumental Music",
    "Football (American)",
    "Wrestling",
    "Crowdfunded",
    "Mini Golf",
    "Warhammer 40K",
    "Rock Music",
    "Well-Written",
    "Pool",
    "TrackIR",
    "Cycling",
    "Skating",
    "Boss Rush",
    "Vikings",
    "Extraction Shooter",
    "Intentionally Awkward Controls",
    "Tennis",
    "Based On A Novel",
    "Motocross",
    "Tile-Matching",
    "Hockey",
    "Lemmings",
    "ATV",
    "8-bit Music",
    "Bowling",
    "Snowboarding",
    "BMX",
    "Hardware",
    "Skiing",
    "Benchmark",
    "Shop Keeper",
    "Birds",
    "Mahjong",
    "Steam Machine",
    "Electronic",
    "Dice",
    "Voice Control",
    "Musou",
    "Elf",
    "Fox",
    "Feature Film",
    "Coding",
    "Hobby Sim",
    "Volleyball",
    "Rugby",
    "Cricket",
    "Reboot",
    "Snooker",
]

In [4]:
# Removes the standalone word "Game" from the tags. This word is pretty confounding for topic
# matching; it may also be worth implementing it as a stop word.
#
# Only whole words are removed: a plain substring replace also hits tags that merely contain
# "Game" ("Games Workshop" -> "s Workshop", "GameMaker" -> "Maker"). Splitting on whitespace
# and re-joining also trims any leading/trailing spaces left behind by the removal.
tags_without_game = [
    " ".join(word for word in tag.split() if word != "Game")
    for tag in user_tags
]

# Removing the word can make two tags identical ("Party Game" -> "Party", which already exists).
# dict.fromkeys() drops such repeats while keeping the original tag order; tags that end up
# empty are skipped so no blank label reaches the zero-shot topic list.
user_tags = [tag for tag in dict.fromkeys(tags_without_game) if tag]
del tags_without_game

In [5]:
# Keeps only positive reviews with unique text, i.e. removes negative reviews (voted_up is falsy)
# and reviews whose text is duplicated. No two (useful) reviews should have *identical* text, so
# keep=False flags every copy of a duplicated text and the first occurrence is dropped along
# with the others.
# The row counts before/after are kept so the next cell can report how much was culled.
n_reviews_pre_cull = len(df_reviews)
is_duplicate_text = df_reviews.duplicated(subset='review', keep=False)
df_reviews = df_reviews[df_reviews['voted_up'] & ~is_duplicate_text].copy()
n_reviews_post_cull = len(df_reviews)

In [6]:
# Reports how many reviews the cull removed, as an absolute count and as a share of the original total.
n_culled = n_reviews_pre_cull - n_reviews_post_cull
frac_culled = 1 - n_reviews_post_cull / n_reviews_pre_cull
print(f"{n_culled:,} ({frac_culled:.02%}) reviews culled (duplicates and/or negative).")

4,406,937 (30.54%) reviews culled (duplicates and/or negative).


In [7]:
# Shuffles the reviews before batch-processing (hopefully reducing batch-to-batch variation) and
# keeps only the review text.
# The seed is fixed on purpose: batches are cut from this ordering by position, so the shuffle
# has to come out identical every time the notebook is run.
docs = df_reviews.sample(frac=1, random_state=42).loc[:, 'review']

# The full dataframe is no longer needed; drop the reference to save RAM.
del df_reviews

In [8]:
# Gets the positional indices where batches start/stop by dividing the reviews evenly.
# 12 batches seems to work okay for about 10,000,000 reviews.
n_reviews = len(docs)
n_batches = 12

# n_batches + 1 boundaries, so that batch i is docs.iloc[batch_bounds[i]:batch_bounds[i + 1]].
# The final boundary (i == n_batches) works out to n_reviews itself. It must not be -1: a slice
# that ends at -1 stops one element short and would silently drop the last review.
batch_bounds = [i * n_reviews // n_batches for i in range(n_batches + 1)]

In [10]:
# Builds the GPU-accelerated BERTopic model.

# Representation model used to describe each topic.
representation_model = KeyBERTInspired()

# cuML (GPU) implementations of UMAP and HDBSCAN, passed to BERTopic below.
# Their hyperparameters have not been tuned; they come straight from BERTopic's
# GPU-acceleration demo, as there has been no bandwidth to adjust them yet.
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, verbose=True)
umap_model = UMAP(
    n_components=5,
    n_neighbors=15,
    min_dist=0.0,
    metric="cosine",
    build_algo='nn_descent',
    verbose=True,
)

# Assembles BERTopic from the pieces above.
topic_model = BERTopic(
    # NV-Embed-v2 was picked because it was doing the best on the embedding leaderboard at the time.
    # NOTE(review): confirm BERTopic really embeds with this raw `AutoModel` object. If its backend
    # selection does not recognise the type it may fall back to its default embedding model; the
    # embedding throughput in the logged run looks high for a model of this size. TODO verify.
    embedding_model=AutoModel.from_pretrained(
        "nvidia/NV-Embed-v2",
        trust_remote_code=True,
    ),
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    # The two zeroshot_* arguments enable zero-shot modeling against the Steam tags.
    zeroshot_topic_list=user_tags,
    # Min similarity is the one hyperparameter that did get some tuning: there should be topics
    # besides the tags being forced, so the value is high enough that non-tag topics are still
    # identified but low enough that similar topics are merged into the tag topics.
    zeroshot_min_similarity=0.4,
    verbose=True,
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.58s/it]


In [None]:
# Batch-processing "loop" -- in practice exactly one batch is processed per kernel session.
# According to the author's notes: BERTopic throws a weird error when fitting a model that has
# already been trained (apparently not the intended behavior), and instantiating a fresh model
# every iteration is not viable because loading the embedding model (NV-Embed-v2) uses 32 GB RAM
# that deleting the model doesn't seem to free. So the batch index in the list below is updated
# by hand and the notebook is restarted for every batch.
for i in [0]:
    # Positional slice of the shuffled reviews belonging to batch i
    batch_start, batch_stop = batch_bounds[i], batch_bounds[i + 1]
    batch = docs.iloc[batch_start:batch_stop]

    # Fits the topic model on this batch
    topics, probs = topic_model.fit_transform(batch)

    # Per-document topic info, indexed by recommendation id so it can be matched back to the reviews
    df_topics = (
        topic_model.get_document_info(batch)
        .assign(recommendationid=batch.index.values)
        .set_index('recommendationid')
    )

    # Writes this batch's results as one part of the topics parquet directory
    df_topics.to_parquet(f"../../data/topics.parquet/part.{i}.parquet")

    # Clears the verbose fitting logs and prints progress
    clear_output()
    print(f"Batch {i+1}/{n_batches} processed!")


2024-12-08 22:42:21,204 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 26107/26107 [06:00<00:00, 72.34it/s] 
2024-12-08 22:48:49,913 - BERTopic - Embedding - Completed ✓
2024-12-08 22:48:49,918 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


[D] [22:48:50.944836] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:107 n_neighbors=15
[D] [22:48:50.948808] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:129 Calling knn graph run
[D] [22:48:58.601288] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:135 Done. Calling fuzzy simplicial set
[D] [22:48:58.605563] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:318 Smooth kNN Distances
[D] [22:48:58.608351] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:320 sigmas = [ 0.0714819, 0.192098, 0.0478412, 0.028921, 0.0814323, 0.0678349, 0.0408132, 0.0882239, 0.0202205, 0.0657434, 0.0703974, 0.0291572, 0.0118365, 0.208321, 0.0680337, 0.0266775, 0.199258, 0.0594971, 0.0485721, 0.0472224, 0.0365312, 0.0788369, 0.0575008, 0.0386131, 0.0345132 ]

[D] [22:48:58.608428] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:322 rhos = [ 0.587163, 0.683372, 0.679994, 0.672627, 0.564559, 0.782606, 0.706857, 0.764719, 0.732765, 0.517345, 0.722839, 0.8

2024-12-08 22:50:06,075 - BERTopic - Dimensionality - Completed ✓
2024-12-08 22:50:06,088 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics


[D] [22:50:05.940656] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:400 Smoothing KNN distances
[D] [22:50:05.943149] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:432 Executing fuzzy simplicial set
[D] [22:50:05.952684] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:461 Performing L1 normalization
[D] [22:50:05.959687] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:497 n_epochs=30
[D] [22:50:05.970003] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:520 Computing # of epochs for training each sample
[D] [22:50:05.972712] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:527 Performing optimization


2024-12-08 22:50:10,233 - BERTopic - Zeroshot Step 1 - Completed ✓
2024-12-08 22:50:38,553 - BERTopic - Cluster - Start clustering the reduced embeddings


[D] [22:50:10.520861] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:365 Running transform
[D] [22:50:10.520952] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:367 Building KNN Graph
[D] [22:50:38.479807] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:400 Smoothing KNN distances
[D] [22:50:38.481669] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:432 Executing fuzzy simplicial set
[D] [22:50:38.486513] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:461 Performing L1 normalization
[D] [22:50:38.489695] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:497 n_epochs=30
[D] [22:50:38.495946] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:520 Computing # of epochs for training each sample
[D] [22:50:38.497498] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:527 Performing optimization


2024-12-08 22:51:15,690 - BERTopic - Cluster - Completed ✓
2024-12-08 22:51:15,691 - BERTopic - Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...
2024-12-08 22:51:16,702 - BERTopic - Zeroshot Step 2 - Completed ✓
2024-12-08 22:51:16,711 - BERTopic - Representation - Extracting topics from clusters using representation models.


KeyboardInterrupt: 