In [1]:
!pip install openai

from openai import OpenAI
import networkx as nx

import numpy as np
from scipy import stats as sps
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
np.random.seed(42)

Defaulting to user installation because normal site-packages is not writeable


In [2]:
ratings = pd.read_csv("data/ml-32m/ratings.csv")
movies = pd.read_csv("data/ml-32m/movies.csv")
tags = pd.read_csv("data/ml-32m/tags.csv")

In [3]:
top_movies = (
    ratings.groupby("movieId")
           .agg(avg_rating=("rating", "mean"), num_ratings=("rating", "count"))
           .reset_index()
)

# Pick top 3 by num_ratings - FIXED: use brackets [] instead of parentheses ()
top_movies = top_movies.sort_values("num_ratings", ascending=False).head(50)
top_movies = top_movies.merge(movies, on="movieId", how="left")
top_movies.head(8)

Unnamed: 0,movieId,avg_rating,num_ratings,title,genres
0,318,4.404614,102929,"Shawshank Redemption, The (1994)",Crime|Drama
1,356,4.052744,100296,Forrest Gump (1994),Comedy|Drama|Romance|War
2,296,4.196969,98409,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,2571,4.156437,93808,"Matrix, The (1999)",Action|Sci-Fi|Thriller
4,593,4.148367,90330,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
5,260,4.099824,85010,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
6,2959,4.22878,77332,Fight Club (1999),Action|Crime|Drama|Thriller
7,480,3.698623,75233,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller


In [4]:
top_movies = top_movies.iloc[[1, 2, 6]]

top_movies

Unnamed: 0,movieId,avg_rating,num_ratings,title,genres
1,356,4.052744,100296,Forrest Gump (1994),Comedy|Drama|Romance|War
2,296,4.196969,98409,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
6,2959,4.22878,77332,Fight Club (1999),Action|Crime|Drama|Thriller


In [5]:
# Merge with titles
top_movie_ids = top_movies["movieId"].tolist()
top_movie_names = top_movies['title'].tolist()

print(top_movie_names)
print(top_movie_ids)

['Forrest Gump (1994)', 'Pulp Fiction (1994)', 'Fight Club (1999)']
[356, 296, 2959]


In [6]:
# Step 1. Get all tags for top 3 movies
rel_top3 = tags[tags["movieId"].isin(top_movie_ids)].copy()
rel_top3

Unnamed: 0,userId,movieId,tag,timestamp
197,109,2959,Brad Pitt,1257989264
198,109,2959,dark,1257989274
199,109,2959,dark comedy,1257989253
200,109,2959,Edward Norton,1257989262
201,109,2959,mental illness,1257989255
...,...,...,...,...
1999380,162279,296,Bruce Willis,1284926303
1999381,162279,296,Quentin Tarantino,1284926298
1999382,162279,296,Samuel L. Jackson,1284926300
1999383,162279,296,Steve Buscemi,1284926322


In [7]:
unique_tags = rel_top3[["tag"]].drop_duplicates().reset_index(drop=True)
unique_tags["NewTagID"] = range(len(unique_tags))  # <-- start from 0
rel_tags = rel_top3.merge(unique_tags, on=["tag"], how="left")
rel_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp,NewTagID
0,109,2959,Brad Pitt,1257989264,0
1,109,2959,dark,1257989274,1
2,109,2959,dark comedy,1257989253,2
3,109,2959,Edward Norton,1257989262,3
4,109,2959,mental illness,1257989255,4


In [8]:
# Step 4. Build tag_lookup_array [(NewTagID, tag), ...]
tag_lookup_array = [
    (int(row.NewTagID), row.tag)
    for _, row in unique_tags.iterrows()
]

print(tag_lookup_array)

[(0, 'Brad Pitt'), (1, 'dark'), (2, 'dark comedy'), (3, 'Edward Norton'), (4, 'mental illness'), (5, 'thought-provoking'), (6, 'twist ending'), (7, 'action'), (8, 'Bruce Willis'), (9, 'comedy'), (10, 'crime'), (11, 'great dialogue'), (12, 'multiple storylines'), (13, 'very funny'), (14, 'clever'), (15, 'smart'), (16, 'classic'), (17, 'fun'), (18, 'atmospheric'), (19, 'disjointed timeline'), (20, 'drugs'), (21, 'gangsters'), (22, 'nonlinear narrative'), (23, 'stylized'), (24, 'cult classic'), (25, 'funny'), (26, 'good dialogue'), (27, 'Samuel L. Jackson'), (28, 'Uma Thurman'), (29, 'violence'), (30, 'blood'), (31, 'brutality'), (32, 'dialogue'), (33, 'foul language'), (34, 'style'), (35, 'acting'), (36, 'Helena Bonham Carter'), (37, 'powerful ending'), (38, 'black comedy'), (39, 'cult film'), (40, 'great soundtrack'), (41, 'Highly quotable'), (42, 'nonlinear'), (43, 'Quentin Tarantino'), (44, 'bittersweet'), (45, 'drama'), (46, 'emotional'), (47, 'great acting'), (48, 'heartwarming'), (

In [9]:
rel_tags_filtered = rel_tags.drop(columns=["userId", "timestamp"]).drop_duplicates()
rel_tags_filtered

Unnamed: 0,movieId,tag,NewTagID
0,2959,Brad Pitt,0
1,2959,dark,1
2,2959,dark comedy,2
3,2959,Edward Norton,3
4,2959,mental illness,4
...,...,...,...
15660,296,spaghetti western,2053
15662,356,mental disability,2054
15721,356,awesome mast good,2055
15753,356,heart follower,2056


In [10]:
# Step 5. Build movie_tag_tuples using NewTagID instead of original tagId
movie_tag_tuples = (
    rel_tags_filtered.groupby(["movieId"])
            .apply(lambda df: [
                (int(row.NewTagID), row.tag)
                for _, row in df.iterrows()
            ])
            .reset_index(name="TagTuples")
    .merge(movies[["movieId", "title"]], on="movieId", how="left")
)

# Restrict to top n movies
movie_tag_tuples = movie_tag_tuples[movie_tag_tuples["movieId"].isin(top_movie_ids)]

In [11]:
oldid_newid = {356:1, 296:0, 2959:2}
oldid_newid

{356: 1, 296: 0, 2959: 2}

In [12]:
original_tags = []
for i, tag in tag_lookup_array:
    original_tags.append(tag)
print(len(original_tags))

2058


### How we generated first moviegroups & movietags split

In [13]:
N = 23
top_n = 3

In [14]:
movietags = [set() for i in range(top_n)]
for _, row in movie_tag_tuples.iterrows():
    pos_movie = oldid_newid[row['movieId']]
    movietags[pos_movie] = list(set(row['TagTuples']))
    print(pos_movie, row['title'], len(movietags[pos_movie]))

0 Pulp Fiction (1994) 944
1 Forrest Gump (1994) 748
2 Fight Club (1999) 633


In [15]:
true_movie = 0
id_in_true_movie = np.zeros(len(original_tags), dtype=int) - 1   #will be -1 if not in true movie tags
for i in range(len(movietags[true_movie])):
    id_t, tag = movietags[true_movie][i]
    id_in_true_movie[id_t] = int(i)

In [16]:
get_group = {}
np.random.seed(42)
tmp_shuffled = np.random.RandomState(seed=42).permutation(len(original_tags))
tag_groups = [[] for i in range(N)]
for i in range(len(original_tags)):
    t = tmp_shuffled[i]
    if (id_in_true_movie[t] != -1):
        formula = id_in_true_movie[t] // 41
    else:
        formula = i // 89
    if (formula >= N):
        formula=N-1
    get_group[original_tags[t]] = formula
    tag_groups[formula].append(original_tags[t])

In [17]:
for i in range(N):
    print(f"Group {i} has {len(tag_groups[i])} tags.")

Group 0 has 92 tags.
Group 1 has 85 tags.
Group 2 has 93 tags.
Group 3 has 93 tags.
Group 4 has 81 tags.
Group 5 has 91 tags.
Group 6 has 94 tags.
Group 7 has 92 tags.
Group 8 has 93 tags.
Group 9 has 86 tags.
Group 10 has 91 tags.
Group 11 has 89 tags.
Group 12 has 81 tags.
Group 13 has 96 tags.
Group 14 has 79 tags.
Group 15 has 90 tags.
Group 16 has 86 tags.
Group 17 has 82 tags.
Group 18 has 92 tags.
Group 19 has 91 tags.
Group 20 has 102 tags.
Group 21 has 85 tags.
Group 22 has 94 tags.


In [18]:
existing_tags = [set() for i in range(len(tag_groups))]
movie_group = [
    [set() for _ in range(len(tag_groups))]  # empty list for each group
    for _ in range(top_n)       # for each movie
]
for _, row in movie_tag_tuples.iterrows():
    pos_movie = oldid_newid[row['movieId']]
    print(pos_movie, row['title'])
    for id_tag, tag in row['TagTuples']:
        existing_tags[get_group[tag]].add(tag)
        movie_group[pos_movie][get_group[tag]].add(id_tag)

0 Pulp Fiction (1994)
1 Forrest Gump (1994)
2 Fight Club (1999)


In [19]:
for m in range(top_n):
    for g in range(N):
        movie_group[m][g] = list(movie_group[m][g])
        print(len(movie_group[m][g]), end=" ")
    print()

41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 42 
35 33 37 35 24 33 36 31 38 32 33 31 28 34 25 25 35 32 30 34 43 25 39 
28 25 26 29 26 30 27 29 27 20 37 26 24 40 27 34 21 16 28 24 32 28 29 


Should be:  
41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 42  
34 30 37 34 28 33 37 30 38 35 37 32 25 34 20 27 35 31 29 38 43 28 33  
23 32 24 32 30 23 23 31 32 23 34 27 23 35 30 32 21 17 31 23 28 26 33

There is still randomness in both, because of set(...), so to acess the data just load from files

In [25]:
import pickle
with open("data/Moviegroups.pkl", "rb") as f:
    movie_group = pickle.load(f)

In [26]:
import pickle
with open("data/Movietags.pkl", "rb") as f:
    movietags = pickle.load(f)

In [27]:
for m in range(top_n):
    for g in range(N):
        movie_group[m][g] = list(movie_group[m][g])
        print(len(movie_group[m][g]), end=" ")
    print()

41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 42 
34 30 37 34 28 33 37 30 38 35 37 32 25 34 20 27 35 31 29 38 43 28 33 
23 32 24 32 30 23 23 31 32 23 34 27 23 35 30 32 21 17 31 23 28 26 33 


### Adding split tags

In [40]:
import random

# Split Pulp Fiction tags
split_tags_all = []
for tid, tag in movietags[0]:
    words = tag.split()
    if len(words) >= 2:
        for w in words:
            split_tags_all.append(w)

# Deduplicate
split_tags_all = list(set(split_tags_all))

# Deterministic sample of 300
random.seed(42)
sampled_words = random.sample(split_tags_all, 300)

sampled_split_tags = []
tag_to_id = {}
for i, w in enumerate(sampled_words):
    sampled_split_tags.append((len(original_tags) + i, w))
    tag_to_id[w] = i
    
# Assign IDs 2058–2357
sampled_split_tags = [(i,w) for i, w in enumerate(sampled_words)]

In [41]:
extended_tags = original_tags + [w for _, w in sampled_split_tags]

This gets the wrong result, because there is a set() again. So we just load the correct results from the files 

In [43]:
import pickle
with open("data/sampled_split_tags.pkl", "rb") as f:
    sampled_split_tags = pickle.load(f)

In [44]:
import pickle
with open("data/extended_tags.pkl", "rb") as f:
    extended_tags = pickle.load(f)