In [2]:
!pip install openai

from openai import OpenAI  # Replace Groq with OpenAI
import networkx as nx

import numpy as np
from scipy import stats as sps
from matplotlib import pyplot as plt
import pandas as pd

Defaulting to user installation because normal site-packages is not writeable


## Initial tag extraction

Read the ml-20m dataset

In [3]:
ratings = pd.read_csv("data/ml-20m/ratings.csv")
movies = pd.read_csv("data/ml-20m/movies.csv")
genome_scores = pd.read_csv("data/ml-20m/genome-scores.csv")
genome_tags = pd.read_csv("data/ml-20m/genome-tags.csv")

Get the top-3 movies in terms of the amount of ratings

In [4]:
top_movies = (
    ratings.groupby("movieId")
           .agg(avg_rating=("rating", "mean"), num_ratings=("rating", "count"))
           .reset_index()
)

# Pick top 3 by num_ratings
top_movies = top_movies.sort_values("num_ratings", ascending=False).head(5).iloc[[0, 1, 4]]
# Merge with titles
top_movies = top_movies.merge(movies, on="movieId", how="left")
top_movie_ids = top_movies["movieId"].tolist()
top_movie_names = top_movies['title'].tolist()

print(top_movie_names)
print(top_movie_ids)

['Pulp Fiction (1994)', 'Forrest Gump (1994)', 'Jurassic Park (1993)']
[296, 356, 480]


Construct the relevance table

In [5]:
# Step 1. Get all tags for top 3 movies
rel_top3 = genome_scores[genome_scores["movieId"].isin(top_movie_ids)].copy()
rel_tags = rel_top3.merge(genome_tags, on="tagId", how="left")

# Step 2. Create unique tags subset & reset index
unique_tags = rel_tags[["tagId", "tag"]].drop_duplicates().reset_index(drop=True)
unique_tags["NewTagID"] = range(len(unique_tags))  # <-- start from 0

# Step 3. Merge back so each row in rel_tags has NewTagID
rel_tags = rel_tags.merge(unique_tags, on=["tagId", "tag"], how="left")

# Step 4. Build tag_lookup_array [(NewTagID, tag), ...]
tag_lookup_array = [
    (int(row.NewTagID), row.tag)
    for _, row in unique_tags.iterrows()
]

print(tag_lookup_array)

[(0, '007'), (1, '007 (series)'), (2, '18th century'), (3, '1920s'), (4, '1930s'), (5, '1950s'), (6, '1960s'), (7, '1970s'), (8, '1980s'), (9, '19th century'), (10, '3d'), (11, '70mm'), (12, '80s'), (13, '9/11'), (14, 'aardman'), (15, 'aardman studios'), (16, 'abortion'), (17, 'absurd'), (18, 'action'), (19, 'action packed'), (20, 'adaptation'), (21, 'adapted from:book'), (22, 'adapted from:comic'), (23, 'adapted from:game'), (24, 'addiction'), (25, 'adolescence'), (26, 'adoption'), (27, 'adultery'), (28, 'adventure'), (29, 'affectionate'), (30, 'afi 100'), (31, 'afi 100 (laughs)'), (32, 'afi 100 (movie quotes)'), (33, 'africa'), (34, 'afterlife'), (35, 'aging'), (36, 'aids'), (37, 'airplane'), (38, 'airport'), (39, 'alaska'), (40, 'alcatraz'), (41, 'alcoholism'), (42, 'alien'), (43, 'alien invasion'), (44, 'aliens'), (45, 'allegory'), (46, 'almodovar'), (47, 'alone in the world'), (48, 'alter ego'), (49, 'alternate endings'), (50, 'alternate history'), (51, 'alternate reality'), (52, 

**Important:** filter by relevance 

In [6]:
rel_tags_filtered = rel_tags[rel_tags["relevance"] >= 0.145]

In [7]:
# Step 5. Build movie_tag_tuples using NewTagID instead of original tagId
movie_tag_tuples = (
    rel_tags_filtered.groupby(["movieId"])
            .apply(lambda df: [
                (int(row.NewTagID), row.tag, float(row.relevance))
                for _, row in df.iterrows()
            ])
            .reset_index(name="TagTuples")
    .merge(movies[["movieId", "title"]], on="movieId", how="left")
)

# Restrict to the selected movies
movie_tag_tuples = movie_tag_tuples[movie_tag_tuples["movieId"].isin(top_movie_ids)]

In [8]:
for _, row in movie_tag_tuples.iterrows():
    print(row['title'], f"has {len(row['TagTuples'])} associated tags")

Pulp Fiction (1994) has 428 associated tags
Forrest Gump (1994) has 468 associated tags
Jurassic Park (1993) has 429 associated tags


In [9]:
# all the tags that actually are related to the three selected movies after relevance filtering
all_touched_tags_set = set()
for _, row in movie_tag_tuples.iterrows():
    for i, j, k in row['TagTuples']:
        all_touched_tags_set.add(j)
all_touched_tags = []
for k in all_touched_tags_set:
    all_touched_tags.append(k)

In [10]:
len(all_touched_tags)

691

## Movie - group tags construction

### How it tag_groups was generated

In [11]:
tag_groups1 = [
    # Group 0: War, Politics, Crime, Historical, Conspiracy (Expanded)
    ['unique', 'vietnam war', 'best war films', 'civil war', 'vietnam', 'vigilantism', 'murder', 'crime', 'gangster', 'gangsters', 'mob', 'organized crime', 'corruption', 'undercover cop', 'spies', 'spy', 'espionage', 'con men', 'heist', 'robbery', 'crime gone awry', 'lawyer', 'lawyers', 'justice', 'political', 'politics', 'world politics', 'president', 'bullshit history', 'history', 'historical', 'biopic', 'explosions', 'biography', 'biographical', 'based on true story', 'based on a true story', 'true story', 'adaptation', 'based on book', 'based on a book', 'adapted from:book', 'book was better', 'ethnic conflict', 'segregation', 'racism', 'military', 'prison', 'prison escape', 'police investigation', 'censorship', 'conspiracy', 'conspiracy theory', 'brainwashing', '1930s', '1950s', '1960s', '1970s', '1980s', 'betrayal', 'controversial', 'free speech', 'hit men', 'hitman', 'kidnapping', 'mafia', 'police corruption', 'propaganda', 'redemption', 'secrets', 'serial killer', 'social commentary', 'spying', 'torture', 'us history', 'war', 'war movie', 'weapons', 'investigation', 'fighting the system', 'gangs', 'hunting', 'race', 'vengeance', 'revenge', 'suicide', 'drug abuse', 'drug addiction', 'drugs', 'addiction', 'rape', 'race issues', 'prejudice', 'sacrifice', 'schizophrenia', 'disability', 'autism', 'cancer', 'terminal illness', 'aids', 'suicide attempt', 'guilt', 'atheism', 'bullying', 'cheating', 'death', 'depressing', 'disability', 'homeless', 'prejudice', 'race issues', 'redemption', 'sacrifice', 'violence', 'gratuitous violence', 'brutality', 'splatter', 'violent', 'foul language', 'harsh', 'grim', 'forceful', 'blood', 'fighting', 'visceral', 'brutal', 'grindhouse', 'bloody', 'gory', 'gore'],
    # Group 1: Sci-fi, Fantasy, Supernatural, Technology, Mind-Bending & Surreal (Expanded)
    ['compassionate', 'dragons', 'surveillance', 'monsters', 'monster', 'mental illness', 'runaway', 'scenic', 'assassin', 'assassination', 'assassins', 'sci fi', 'sci-fi', 'science fiction', 'scifi', 'rebellion', 'revolution', 'fantasy', 'supernatural', 'alien', 'alien invasion', 'aliens', 'android(s)/cyborg(s)', 'apocalypse', 'artificial intelligence', 'astronauts', 'black and white', 'clones', 'cloning', 'demons', 'disaster', 'distopia', 'dystopia', 'dystopic future', 'end of the world', 'fantasy world', 'first contact', 'future', 'futuristic', 'genetics', 'ghosts/afterlife', 'giant robots', 'graphic novel', 'high fantasy', 'immortality', 'magic', 'man versus machine', 'mars', 'modern fantasy', 'mutants', 'mythology', 'neo-noir', 'nocturnal', 'parallel universe', 'philip k. dick', 'post apocalyptic', 'post-apocalyptic', 'robot', 'robots', 'saturn award (best science fiction film)', 'saturn award (best special effects)', 'scifi cult', 'space', 'space opera', 'space program', 'space travel', 'super hero', 'superheroes', 'technology', 'time', 'transformation', 'utopia', 'video game', 'virtual reality', 'virus', 'werewolf', 'wizards', '3d', '70mm', 'slavery', 'alternate history', 'alternate reality', 'alternate universe', 'computers', 'firefly', 'hackers', 'internet', 'science', 'dinosaurs', 'global warming', 'imagination', 'archaeology', 'math', 'capitalism', 'business', 'philosophy', 'intellectual', 'cerebral', 'genius', 'brilliant', 'clever', 'nerds', 'memory loss', 'destiny', 'fairy tale', 'fairy tales', 'mythology', 'immortality', 'absurd', 'action', 'action packed', 'alternate endings', 'atmospheric', 'bizarre', 'complex', 'complicated', 'complicated plot', 'confusing', 'cool', 'crazy', 'creativity', 'cynical', 'dark', 'dreams', 'enigmatic', 'existentialism', 'hallucinatory', 'insanity', 'intelligent', 'mindfuck', 'non-linear', 'nonlinear', 'original', 'original plot', 'psychedelic', 'psychological', 'suspenseful', 'surreal', 'surrealism', 'twist', 'twist ending', 'unusual plot structure', 'visually stunning', 'weird'],
    # Group 2: Relationships, Emotions, Humanity, Coming-of-Age (Reduced)
    ['educational', 'nostalgia', 'nostalgic', 'love', 'romance', 'romantic', 'good romantic comedies', 'love story', 'coming of age', 'coming-of-age', 'relationships', 'family', 'family drama', 'kids', 'kids and family', 'children', 'parenthood', 'father-son relationship', 'brothers', 'unlikely friendships', 'friendship', 'affectionate', 'teacher', 'intimate', 'earnest', 'emotional', 'sentimental', 'poignant', 'tear jerker', 'touching', 'heartwarming', 'heartbreaking', 'bittersweet', 'sad', 'sad but good', 'tragedy', 'melancholy', 'melancholic', 'downbeat', 'wistful', 'loneliness', 'alone in the world', 'solitude', 'depression', 'passionate', 'poetry', 'humanity', 'identity', 'sexuality', 'queer', 'sex', 'adultery', 'nudity', 'nudity (topless)', 'nudity (topless - brief)', 'nudity (topless - notable)', 'nudity (full frontal)', 'nudity (full frontal - brief)', 'nudity (full frontal - notable)', 'male nudity', 'sexualized violence', 'pornography', 'bdsm', 'honest', 'idealism', 'self discovery', 'life', 'life & death', 'life philosophy', 'mission from god', 'inspirational', 'inspiring', 'cathartic', 'cute', 'dysfunctional family', 'feel good movie', 'feel-good', 'gay character', 'great ending', 'happy ending', 'heroin', 'memory', 'mentor', 'queer', 'realistic', 'sappy', 'sexy', 'sweet', 'teen', 'girlie movie', 'good', 'perfect', 'better than expected', 'good story', 'good sequel', 'special', 'great movie', 'very good', 'very interesting', 'light', 'simple', 'shallow', 'good versus evil', 'moral ambiguity', 'morality', 'culture clash', 'islam', 'jesus', 'god', 'meditative', 'gypsy accent', 'rabbits', 'chocolate', 'blindness', 'doctors', 'slackers', 'aging'],
    # Group 3: Storytelling, Writing, Structure, Drama (Reduced)
    ['underrated', 'mystery', 'plot twist', 'surprise ending', 'multiple storylines', 'flashbacks', 'storytelling', 'dialogue driven', 'entirely dialogue', 'talky', 'good dialogue', 'great dialogue', 'dialogue', 'script', 'excellent script', 'writing', 'literature', 'books', 'book', 'writers', 'golden palm', 'criterion', 'movielens top pick', 'imdb top 250', 'afi 100', 'afi 100 (movie quotes)', 'series', 'sequels', 'sequel', 'trilogy', 'franchise', 'long', 'too long', 'enormously long battle scene', 'epic', 'classic', 'masterpiece', 'moving', 'dramatic', 'drama', 'ensemble cast', 'complex characters', 'character study', 'view askew', 'interesting', 'thought-provoking', 'reflective', 'quotable', 'suprisingly clever', 'allegory', 'coen bros', 'directorial debut', 'excellent', 'factual', 'great', 'highly quotable', 'narrated', 'noir', 'oscar (best writing - screenplay written directly for the screen)', 'philosophical', 'plot', 'plot holes', 'pulp', 'spielberg', 'story', 'tarantino', 'twists & turns', 'tricky', 'predictable', 'unrealistic', 'slow', 'slow paced', 'graphic design', 'comics', 'comic book adaption', 'period piece', 'caper', 'spaghetti western', 'lone hero', 'anti-hero', 'dark hero', 'alter ego', 'unlikeable characters', 'powerful ending', 'watch the credits'],
    # Group 4: Humor, Mood, Weirdness, Horror (Reduced)
    ['comedy', 'romantic comedy', 'black comedy', 'dark comedy', 'off-beat comedy', 'slapstick', 'satire', 'dark humor', 'humorous', 'humor', 'funny', 'very funny', 'funny as hell', 'hilarious', 'hillarious', 'goofy', 'quirky', 'eccentricity', 'camp', 'campy', 'silly', 'silly fun', "so bad it's funny", 'stupid', 'idiotic', 'dumb', 'pointless', 'boring!', 'lame', 'pretentious', 'corny', 'cheesy', 'crappy sequel', 'bad', 'bad plot', 'bad ending', 'bad science', 'catastrophe', 'bad acting', 'unfunny', 'not funny', 'strange', 'creepy', 'disturbing', 'frightening', 'scary', 'tense', 'suspense', 'thriller', 'horror', 'psychology', 'paranoia', 'paranoid', 'intense', 'ominous', 'macabre', 'gritty', 'whimsical', 'ironic', 'witty', 'sarcasm', 'biting', 'cult', 'cult film', 'cult classic', 'indie', 'independent film', 'artsy', 'artistic', 'art house', 'stylized', 'stylish', 'no plot', 'over the top', '80s', 'crude humor', 'deadpan', 'entertaining', 'film noir', 'fun', 'fun movie', 'geek', 'gruesome', 'irreverent', 'mad scientist', 'noir thriller', 'overrated', 'pg', 'pg-13', 'puppets', 'satirical', 'stoner movie', 'stupidity', 'weed', 'stoner movie', 'buddy movie'],
    # Group 5: Visuals, Production, Awards, Settings, Misc (Reduced)
    ['chick flick', 'awesome soundtrack', 'notable soundtrack', 'good soundtrack', 'music', 'good music', 'great music', 'amazing cinematography', 'great cinematography', 'cinematography', 'visually appealing', 'visuals', 'visual', 'beautiful scenery', 'stunning', 'beautiful', 'amazing photography', 'photography', 'breathtaking', 'awesome', 'special effects', 'effects', 'dynamic cgi action', 'stop motion', 'stop-motion', 'computer animation', 'cgi', 'bad cgi',  'natural disaster', 'animals', 'fish', 'lions', 'spiders', 'snakes', 'shark', 'toys', 'cars', 'classic car', 'racing', 'motorcycle', 'football', 'sports', 'school', 'college', 'childhood', 'suburbia', 'new york city', 'los angeles', 'san francisco', 'new jersey', 'california', 'texas', 'poland', 'england', 'british', 'tokyo', 'desert', 'island', 'antarctica', 'jungle', 'wilderness', 'small town', 'hotel', 'maze', 'claustrophobic', 'stranded', 'travel', 'road movie', 'road trip', 'adventure', 'fight scenes', 'good action', 'gunfight', 'guns', 'sword fight', 'chase', 'exciting', 'fast paced', 'big budget', 'low budget', 'oscar', 'oscar winner', 'oscar (best picture)', 'oscar (best actress)', 'oscar (best actor)', 'oscar (best supporting actor)', 'oscar (best supporting actress)', 'oscar (best cinematography)', 'oscar (best editing)', 'oscar (best effects - visual effects)', 'oscar (best sound)', 'potential oscar nom', 'art', 'beautifully filmed', 'bleak', 'colourful', 'dance', 'dancing', 'desert', 'drinking', 'exceptional acting', 'foreign', 'good acting', 'great acting', 'great cinematography', 'great soundtrack', 'hospital', 'indiana jones', 'indians', 'irish accent', 'oscar (best directing)', 'oscar (best foreign language film)', 'product placement', 'radio', 'shopping', 'southern theme', 'spiders', 'sports', 'stereotypes',  'survival', 'swashbuckler', 'travel', 'treasure', 'treasure hunt', 'ocean', 'freedom', 'underdog', 'rags to riches', 'kick-butt women', 'manipulation', 'obsession', 'courage', 'greed']
]

In [12]:
tag_groups = [[] for i in range(len(tag_groups1) * 2)]
np.random.seed(42)

for i in range(len(tag_groups1)):
    # THERE IS SOME RANDOMNESS HERE, unfortunately,
    # since set(...) is an unordered list and the hashes could be different on differnent runs
    # it was constructed this way, and then the result was saved to Moviegroups

    tag_groups1[i] = list(set(tag_groups1[i]))
    mask = np.random.choice([0, 1], size=(len(tag_groups1[i]),), p=[0.5, 0.5])
    for j in range(len(tag_groups1[i])):
        if (mask[j] == 0):
            tag_groups[i].append(tag_groups1[i][j])
        else:
            tag_groups[i+len(tag_groups1)].append(tag_groups1[i][j])

Checking for intersections

In [13]:
# Convert all groups to sets for easier comparison
group_sets = [set(group) for group in tag_groups]

# Check for intersections between all pairs of groups
has_intersections = False
for i in range(len(group_sets)):
    if (len(group_sets[i])!= len(tag_groups[i])):
        print("???", i, len(group_sets[i]), len(tag_groups[i]))
    for j in range(i + 1, len(group_sets)):
        intersection = group_sets[i] & group_sets[j]
        if intersection:
            has_intersections = True
            print(f"Intersection found between group {i} and group {j}:")
            print(intersection)
            print()

if not has_intersections:
    print("No intersections found between groups.")

No intersections found between groups.


In [14]:
tag_groups = [list(set(group)) for group in tag_groups]

Checking for tags not included in the groups, and new tags that could have been accidentally added

In [15]:
#Check that every tag in the new groups exists in the original tag list
all_group_tags = [tag for group in tag_groups for tag in group]
missing_tags = [tag for tag in all_group_tags if tag not in all_touched_tags]
extra_tags = [tag for tag in all_touched_tags if tag not in all_group_tags]

print("\nMissing tags (in groups but not in original):", missing_tags)
print("Tags from original missing in groups:", extra_tags)


Missing tags (in groups but not in original): []
Tags from original missing in groups: []


In [16]:
n_tag_groups = 12
for i in range(n_tag_groups):
    print(f"Group {i} has {len(tag_groups[i])} tags.")

Group 0 has 65 tags.
Group 1 has 65 tags.
Group 2 has 57 tags.
Group 3 has 41 tags.
Group 4 has 36 tags.
Group 5 has 78 tags.
Group 6 has 60 tags.
Group 7 has 77 tags.
Group 8 has 59 tags.
Group 9 has 44 tags.
Group 10 has 59 tags.
Group 11 has 50 tags.


In [17]:
oldid_newid = {296: 0, 356: 1, 480: 2}

##### Constructing the moviegroups from the split tags, deterministic if provided with split tags

In [18]:
n_tag_groups = 12

In [19]:
get_group = {}
for i in range(len(tag_groups)):
    for tag in tag_groups[i]:
        get_group[tag] = i
top_n=3

In [20]:
existing_tags = [set() for i in range(len(tag_groups))]
movie_group = [
    [[] for _ in range(len(tag_groups))]  # empty list for each group
    for _ in range(top_n)       # for each movie
]
for _, row in movie_tag_tuples.iterrows():
    pos_movie = oldid_newid[row['movieId']]
    print(pos_movie, row['title'])
    for id_tag, tag, rel in row['TagTuples']:
        existing_tags[get_group[tag]].add(tag)
        movie_group[pos_movie][get_group[tag]].append((id_tag, rel))

0 Pulp Fiction (1994)
1 Forrest Gump (1994)
2 Jurassic Park (1993)


In [21]:
for m in range(top_n):
    for g in range(n_tag_groups):
        print(len(movie_group[m][g]), end=" ")
    print()

40 36 36 31 28 42 34 35 34 39 43 30 
41 39 48 25 21 58 39 48 47 35 32 35 
28 44 36 26 21 53 28 58 32 33 38 32 


THE REAL answer:  
40 29 31 33 27 49 34 42 39 37 44 23   
42 40 48 27 19 58 38 47 47 33 34 35   
35 47 32 26 24 48 21 55 36 33 35 37   

### The actual movie_group result loading:

In [22]:
import pickle
with open("data/Moviegroups.pkl", "rb") as f:
    movie_group = pickle.load(f)

In [23]:
for m in range(top_n):
    for g in range(n_tag_groups):
        print(len(movie_group[m][g]), end=" ")
    print()

40 29 31 33 27 49 34 42 39 37 44 23 
42 40 48 27 19 58 38 47 47 33 34 35 
35 47 32 26 24 48 21 55 36 33 35 37 


In [27]:
original_tags = []
for i, tag in tag_lookup_array:
    original_tags.append(tag)
print(len(original_tags))

1128


In [28]:
original_tags

['007',
 '007 (series)',
 '18th century',
 '1920s',
 '1930s',
 '1950s',
 '1960s',
 '1970s',
 '1980s',
 '19th century',
 '3d',
 '70mm',
 '80s',
 '9/11',
 'aardman',
 'aardman studios',
 'abortion',
 'absurd',
 'action',
 'action packed',
 'adaptation',
 'adapted from:book',
 'adapted from:comic',
 'adapted from:game',
 'addiction',
 'adolescence',
 'adoption',
 'adultery',
 'adventure',
 'affectionate',
 'afi 100',
 'afi 100 (laughs)',
 'afi 100 (movie quotes)',
 'africa',
 'afterlife',
 'aging',
 'aids',
 'airplane',
 'airport',
 'alaska',
 'alcatraz',
 'alcoholism',
 'alien',
 'alien invasion',
 'aliens',
 'allegory',
 'almodovar',
 'alone in the world',
 'alter ego',
 'alternate endings',
 'alternate history',
 'alternate reality',
 'alternate universe',
 'amazing cinematography',
 'amazing photography',
 'american civil war',
 'amnesia',
 'amy smart',
 'android(s)/cyborg(s)',
 'androids',
 'animal movie',
 'animals',
 'animated',
 'animation',
 'anime',
 'antarctica',
 'anti-hero',


In [29]:
import pickle
with open("data/originaltags.pkl", "wb") as f:
    pickle.dump(original_tags,f)