In [1]:
import pandas as pd
import numpy as np

In [81]:


# Enter path to data folder on your device. 
# Data can be downloaded from https://grouplens.org/datasets/movielens/ as "ml-25m.zip" under MovieLens 25M Dataset
# ~62K movies in total
movie_lens_path = "/Users/david/Documents/Research2324/Sanner/ml-25m/"

In [2]:
movie_lens_path = "C:/Users/anton/source/data/SIGIR24/MovieLens25M/"

In [3]:
movie_df = pd.read_csv(movie_lens_path + "movies.csv")

#Make 'genres' column comma seperated, not '|' seperated
movie_df['genres'] = movie_df['genres'].str.replace('|', ', ', regex=False)

movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy"
1,2,Jumanji (1995),"Adventure, Children, Fantasy"
2,3,Grumpier Old Men (1995),"Comedy, Romance"
3,4,Waiting to Exhale (1995),"Comedy, Drama, Romance"
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
tags_df = pd.read_csv(movie_lens_path + "tags.csv")
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


### Filter movies be in top 10% rating count and have 20+ tags

In [5]:
#select movies with top 10% rating count
ratings_df = pd.read_csv(movie_lens_path + 'ratings.csv')
rating_cnt_df = ratings_df['movieId'].value_counts()

top_10_perc_cnt = int(len(movie_df)*0.1)

top_10_perc_rating_cnts = rating_cnt_df.head(top_10_perc_cnt)

In [6]:
top_10_perc_rating_cnts 

movieId
356       81491
318       81482
296       79672
593       74127
2571      72674
          ...  
61255       366
71619       366
58839       366
134528      366
8799        366
Name: count, Length: 6242, dtype: int64

In [7]:
top_10_percent_movie_ids = set(top_10_perc_rating_cnts.index)

In [8]:
#select movies with at least 20 unique tags

# Filter 'tags_df' for movies in 'top_10_percent_movie_ids'
filtered_tags_df = tags_df[tags_df['movieId'].isin(top_10_percent_movie_ids)]

#Group by 'movieId' and count unique tags
unique_tag_counts = filtered_tags_df.groupby('movieId')['tag'].nunique()

#Filter for movies with at least 20 unique tags
movies_20_plus_tags = unique_tag_counts[unique_tag_counts >= 20].index.tolist()

In [9]:
#gives 4425 movies (using MovieLens25M)
len(movies_20_plus_tags)

4425

### Randomly select 100 movies and get their top 20 most frequent tags

In [27]:
import random

In [28]:
random.seed(42)
random_100_movie_ids = random.sample(movies_20_plus_tags, 100)

In [29]:
#Filter 'tags_df' for these 100 movies
filtered_100_tags_df = tags_df[tags_df['movieId'].isin(random_100_movie_ids)]

#tag counts for each movie:
grouped_tags = filtered_100_tags_df.groupby(['movieId', 'tag']).size().reset_index(name='count')

# for each movie, select the top 20 tags
top_20_tags_per_movie = grouped_tags.groupby('movieId').apply(lambda x: x.nlargest(20, 'count')).reset_index(drop = True)

In [30]:
top_20_tags_per_movie

Unnamed: 0,movieId,tag,count
0,123,stylized,17
1,123,dreamlike,12
2,123,loneliness,12
3,123,reflective,12
4,123,Hong Kong,9
...,...,...,...
1995,197175,badass girl,5
1996,197175,cliched,5
1997,197175,romance,5
1998,197175,uncanny valley,5


In [31]:
# Create a DataFrame from the 'top_20_tags_per_movie' with 'movieId' as index and concatenated tags as values
tags_concatenated = top_20_tags_per_movie.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index(name='top_20_tags')

#Merge the concatenated tags with the filtered_movie_df
filtered_movie_df = movie_df[movie_df['movieId'].isin(random_100_movie_ids)]
filtered_100_df = pd.merge(filtered_movie_df, tags_concatenated, on='movieId', how='left')

#randomize rows
filtered_100_df = filtered_100_df.sample(frac=1).reset_index(drop=True)

In [32]:
filtered_100_df

Unnamed: 0,movieId,title,genres,top_20_tags
0,74727,Gentlemen of Fortune (Dzhentlmeny udachi) (1972),"Comedy, Crime, Drama, Mystery","Soviet Union, classic, comedy, Russian, underc..."
1,7293,50 First Dates (2004),"Comedy, Romance","Adam Sandler, Drew Barrymore, Romance, Comedy,..."
2,1912,Out of Sight (1998),"Comedy, Crime, Drama, Romance, Thriller","George Clooney, Jennifer Lopez, comedy romance..."
3,6873,Intolerable Cruelty (2003),"Comedy, Romance","screwball comedy, divorce, lawyers, George Clo..."
4,64034,"Boy in the Striped Pajamas, The (Boy in the St...","Drama, War","World War II, Holocaust, touching, Friendship,..."
...,...,...,...,...
95,4519,"Land Before Time, The (1988)","Adventure, Animation, Children, Fantasy","dinosaurs, friendship, classic, animation, nos..."
96,4967,No Man's Land (2001),"Drama, War","No Happy End, complex characters, ethnic confl..."
97,91653,We Bought a Zoo (2011),"Comedy, Drama","feel-good, Family, predictable, Animals, Matt ..."
98,565,Cronos (1993),"Drama, Horror","Guillermo del Toro, slow, Criterion, atmospher..."


In [35]:
filtered_100_df[filtered_100_df['title'] == '5 Centimeters per Second (By\u00f4soku 5 senchim\u00eatoru) (2007)']

Unnamed: 0,movieId,title,genres,top_20_tags
84,64993,5 Centimeters per Second (Byôsoku 5 senchimêto...,"Animation, Drama, Romance","melancholic, bittersweet, visually stunning, a..."


### Make JSON File

In [19]:
movies_dict = {}

for idx, row in filtered_100_df.iterrows():
    description = f" Movie Title: {row['title']} \n Genres: {row['genres']} \n Tags: {row['top_20_tags']}"
    movies_dict[str(idx)] = {'description': description}

In [20]:
import json

with open('data/ml25M_100_movie_sample_n.json', 'w') as json_file:
    json.dump(movies_dict, json_file, indent=4)

### Trim 100 movies to 50

In [84]:
with open('data/ml25M_100_movie_sample.json', 'r') as json_file:
    jdata = json.load(json_file)

In [86]:
keep_ids = []
for val in range(50):
    keep_ids.append(str(val))

In [87]:
final_dict = {key: jdata[key] for key in keep_ids}
final_dict

{'0': {'description': ' -Movie Title: Tomb Raider (2018) \n -Genres: Action, Adventure, Fantasy \n -Tags: adventure, Alicia Vikander, video game adaptation, female protagonist, action, strong female lead, island, treasure hunt, boring, father daughter relationship, remake, based on a video game, reboot, predictable, action heroine, archaeology, dialogue, adaptation, bad remake, perfectly enjoyable'},
 '1': {'description': " -Movie Title: Pirates of the Caribbean: At World's End (2007) \n -Genres: Action, Adventure, Comedy, Fantasy \n -Tags: Johnny Depp, pirates, adventure, Keira Knightley, comedy, Orlando Bloom, anti-hero, treasure, quirky, seafaring, Geoffrey Rush, sequel, swashbuckler, funny, surreal, big budget, confusing, BORING!, stupid ending, Disney"},
 '2': {'description': ' -Movie Title: Laura (1944) \n -Genres: Crime, Film-Noir, Mystery \n -Tags: film noir, black and white, Vincent Price, murder, Oscar (Best Cinematography), murder mystery, noir, Dana Andrews, Gene Tierney, 1

In [89]:
import json

with open('data/ml25M_50_movie_sample.json', 'w') as json_file:
    json.dump(final_dict, json_file, indent=4)

### Randomly select 50 movies and get their top 20 most frequent tags

In [14]:
import random

In [15]:
random.seed(42)
random_50_movie_ids = random.sample(movies_20_plus_tags, 50)

In [16]:
#Filter 'tags_df' for these 50 movies
filtered_50_tags_df = tags_df[tags_df['movieId'].isin(random_50_movie_ids)]

#tag counts for each movie:
grouped_tags = filtered_50_tags_df.groupby(['movieId', 'tag']).size().reset_index(name='count')

# for each movie, select the top 20 tags
top_20_tags_per_movie = grouped_tags.groupby('movieId').apply(lambda x: x.nlargest(20, 'count')).reset_index(drop = True)

In [17]:
top_20_tags_per_movie

Unnamed: 0,movieId,tag,count
0,123,stylized,17
1,123,dreamlike,12
2,123,loneliness,12
3,123,reflective,12
4,123,Hong Kong,9
...,...,...,...
995,190555,missing child,2
996,190555,mystery,2
997,190555,New,1
998,190555,acting,1


In [18]:
# Create a DataFrame from the 'top_20_tags_per_movie' with 'movieId' as index and concatenated tags as values
tags_concatenated = top_20_tags_per_movie.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index(name='top_20_tags')

#Merge the concatenated tags with the filtered_movie_df
filtered_movie_df = movie_df[movie_df['movieId'].isin(random_50_movie_ids)]
filtered_50_df = pd.merge(filtered_movie_df, tags_concatenated, on='movieId', how='left')

#randomize rows
filtered_50_df = filtered_50_df.sample(frac=1).reset_index(drop=True)

In [20]:
filtered_50_df

Unnamed: 0,movieId,title,genres,top_20_tags
0,2710,"Blair Witch Project, The (1999)","Drama, Horror, Thriller","low budget, creepy, scary, found footage, fore..."
1,1927,All Quiet on the Western Front (1930),"Action, Drama, War","World War I, Oscar (Best Picture), anti-war, b..."
2,2146,St. Elmo's Fire (1985),"Drama, Romance","Brat Pack, friendship, Rob Lowe, 80's classic,..."
3,90249,Real Steel (2011),"Action, Drama, Sci-Fi, IMAX","robots, boxing, Science Fiction, father-son re..."
4,4010,Brewster's Millions (1985),Comedy,"richard pryor, Betamax, Can't remember, John C..."
5,2022,"Last Temptation of Christ, The (1988)",Drama,"jesus, christian, Martin Scorsese, based on a ..."
6,2020,Dangerous Liaisons (1988),"Drama, Romance","adultery, Paris, aristocracy, witty, John Malk..."
7,5065,"Mothman Prophecies, The (2002)","Drama, Fantasy, Horror, Mystery, Thriller","supernatural, Richard Gere, pointless, Debra M..."
8,3095,"Grapes of Wrath, The (1940)",Drama,"Great Depression, adapted from:book, Social Dr..."
9,3868,"Naked Gun: From the Files of Police Squad!, Th...","Action, Comedy, Crime, Romance","parody, Leslie Nielsen, Funny as hell, slapsti..."


### Make JSON File

In [21]:
movies_dict = {}

for idx, row in filtered_50_df.iterrows():
    description = f" -Movie Title: {row['title']} \n -Genres: {row['genres']} \n -Tags: {row['top_20_tags']}"
    movies_dict[str(idx)] = {'description': description}

In [22]:
import json

with open('data/ml25M_50_movie_sample.json', 'w') as json_file:
    json.dump(movies_dict, json_file, indent=4)

### Randomly select 16 movies and get their top 20 most frequent tags
These are a subset of the above 100 at seed 42

In [11]:
import random

In [12]:
random.seed(42)
random_16_movie_ids = random.sample(movies_20_plus_tags, 16)

In [13]:
#Filter 'tags_df' for these 100 movies
filtered_16_tags_df = tags_df[tags_df['movieId'].isin(random_16_movie_ids)]

#tag counts for each movie:
grouped_tags = filtered_16_tags_df.groupby(['movieId', 'tag']).size().reset_index(name='count')

# for each movie, select the top 20 tags
top_20_tags_per_movie = grouped_tags.groupby('movieId').apply(lambda x: x.nlargest(20, 'count')).reset_index(drop = True)

In [14]:
top_20_tags_per_movie

Unnamed: 0,movieId,tag,count
0,485,arnold,32
1,485,Arnold Schwarzenegger,12
2,485,parody,12
3,485,deconstruction,6
4,485,campy,5
...,...,...,...
315,135288,dementia,1
316,135288,detective,1
317,135288,growing old,1
318,135288,kid,1


In [15]:
# Create a DataFrame from the 'top_20_tags_per_movie' with 'movieId' as index and concatenated tags as values
tags_concatenated = top_20_tags_per_movie.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index(name='top_20_tags')

#Merge the concatenated tags with the filtered_movie_df
filtered_movie_df = movie_df[movie_df['movieId'].isin(random_16_movie_ids)]
filtered_16_df = pd.merge(filtered_movie_df, tags_concatenated, on='movieId', how='left')

#randomize rows
filtered_16_df = filtered_16_df.sample(frac=1).reset_index(drop=True)

In [17]:
filtered_16_df

Unnamed: 0,movieId,title,genres,top_20_tags
0,5540,Clash of the Titans (1981),"Action, Adventure, Fantasy, Romance","mythology, Claymation, Ancient Greeks, Greek m..."
1,1732,"Big Lebowski, The (1998)","Comedy, Crime","dark comedy, cult film, great dialogue, black ..."
2,1927,All Quiet on the Western Front (1930),"Action, Drama, War","World War I, Oscar (Best Picture), anti-war, b..."
3,613,Jane Eyre (1996),"Drama, Romance","adapted from:book, 19th century, Anna Paquin, ..."
4,4658,Santa Sangre (1989),"Drama, Horror, Mystery, Thriller","atmospheric, Alejandro Jodorowsky, weird, styl..."
5,509,"Piano, The (1993)","Drama, Romance","atmospheric, New Zealand, beautiful, 19th cent..."
6,6873,Intolerable Cruelty (2003),"Comedy, Romance","screwball comedy, divorce, lawyers, George Clo..."
7,565,Cronos (1993),"Drama, Horror","Guillermo del Toro, slow, Criterion, atmospher..."
8,2146,St. Elmo's Fire (1985),"Drama, Romance","Brat Pack, friendship, Rob Lowe, 80's classic,..."
9,74553,"Secret of Kells, The (2009)","Animation, Fantasy","stylized, animation, beautiful, medieval, Atmo..."


### Make JSON File

In [19]:
movies_dict = {}

for idx, row in filtered_16_df.iterrows():
    description = f" -Movie Title: {row['title']} \n -Genres: {row['genres']} \n -Tags: {row['top_20_tags']}"
    movies_dict[str(idx)] = {'description': description}

In [20]:
import json

with open('data/ml25M_16_movie_sample.json', 'w') as json_file:
    json.dump(movies_dict, json_file, indent=4)

### Few shot samples
Some non-test movies with tags for few shot purposes.

In [192]:
random.seed(100000)
random_50_movie_ids = random.sample(movies_20_plus_tags, 50)

In [194]:
#remove overlapping ids with test data
random_50_movie_ids = [id for id in random_50_movie_ids if id not in random_100_movie_ids]

In [198]:
#Filter 'tags_df' for these 100 movies
filtered_50_tags_df = tags_df[tags_df['movieId'].isin(random_50_movie_ids)]

#tag counts for each movie:
grouped_tags = filtered_50_tags_df.groupby(['movieId', 'tag']).size().reset_index(name='count')

# for each movie, select the top 20 tags
top_20_tags_per_movie = grouped_tags.groupby('movieId').apply(lambda x: x.nlargest(20, 'count')).reset_index(drop = True)

In [199]:
top_20_tags_per_movie

Unnamed: 0,movieId,tag,count
0,337,Johnny Depp,32
1,337,Leonardo DiCaprio,22
2,337,bittersweet,20
3,337,coming of age,19
4,337,mental illness,15
...,...,...,...
995,194016,sad,2
996,194016,wifi,2
997,194016,2020 relook,1
998,194016,Gal Gadot,1


In [200]:
# Create a DataFrame from the 'top_20_tags_per_movie' with 'movieId' as index and concatenated tags as values
tags_concatenated = top_20_tags_per_movie.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index(name='top_20_tags')

#Merge the concatenated tags with the filtered_movie_df
filtered_movie_df = movie_df[movie_df['movieId'].isin(random_50_movie_ids)]
filtered_50_df = pd.merge(filtered_movie_df, tags_concatenated, on='movieId', how='left')

#randomize rows
filtered_50_df = filtered_50_df.sample(frac=1).reset_index(drop=True)

In [201]:
filtered_50_df

Unnamed: 0,movieId,title,genres,top_20_tags
0,52604,Fracture (2007),"Crime, Drama, Mystery, Thriller","Anthony Hopkins, twist ending, courtroom drama..."
1,2087,Peter Pan (1953),"Animation, Children, Fantasy, Musical","Disney, Peter Pan, pirates, animation, sword f..."
2,96432,Lawless (2012),"Crime, Drama","Great Depression, Prohibition, Tom Hardy, Shia..."
3,6023,Band of Outsiders (Bande à part) (1964),"Comedy, Crime, Drama, Romance","Jean-Luc Godard, French New Wave, Criterion, A..."
4,72393,"Fourth Kind, The (2009)","Horror, Mystery, Sci-Fi, Thriller","Milla Jovovich, aliens, fake documentary, abdu..."
5,69526,Transformers: Revenge of the Fallen (2009),"Action, Adventure, Sci-Fi, IMAX","robots, Megan Fox, bad plot, Shia LaBeouf, rid..."
6,1513,Romy and Michele's High School Reunion (1997),Comedy,"high school, Janeane Garofalo, comedy, Lisa Ku..."
7,6101,Missing (1982),"Drama, Mystery, Thriller","politics, Chile, Criterion, Golden Palm, Jack ..."
8,3091,Kagemusha (1980),"Drama, War","Akira Kurosawa, Criterion, Japan, Tatsuya Naka..."
9,63808,"Class, The (Entre les murs) (2008)",Drama,"high school, Laurent Cantet, Palme d'Or, reali..."


### Make JSON File

In [202]:
movies_dict = {}

for idx, row in filtered_50_df.iterrows():
    description = f" -Movie Title: {row['title']} \n -Genres: {row['genres']} \n -Tags: {row['top_20_tags']}"
    movies_dict[str(idx)] = {'description': description}

In [203]:
import json

with open('data/ml25M_FS_movie_sample.json', 'w') as json_file:
    json.dump(movies_dict, json_file, indent=4)

### Old code

In [11]:
# USE THIS OPTION IF YOU WANT A RANDOM SAMPLE OF MOVIES WITH A SPECIFIC NUMBER OF TAGS

# Select 20 random movies from the trimmed set
np.random.seed(42)
random_movies = np.random.choice(trimmed_tags['movieId'].unique(), size=20)
movie_subset = random_movies


In [23]:
# USE THIS OPTION FOR HANDSELECTED MOVIES
selected_titles = [
    "Terminator, The (1984)", # 4 sci-fi movies
    "Matrix, The (1999)",
    "Interstellar (2014)",
    "Alien (1979)",
    "Tropic Thunder (2008)", # 4 comedy movies
    "Night at the Museum (2006)",
    "Shaun of the Dead (2004)",
    "Hot Fuzz (2007)",
    "Godfather, The (1972)", # 4 crime movies
    "Heat (1995)",
    "Goodfellas (1990)",
    "Reservoir Dogs (1992)",
    "Fantasia (1940)", # 4 animated movies
    "Toy Story 2 (1999)",
    "Aladdin (1992)",
    "Shrek (2001)",
]
movie_rows = movie_df[movie_df['title'].isin(selected_titles)]
movie_rows = movie_rows.drop_duplicates(subset=['title'])
movie_subset = movie_rows['movieId']
movie_rows

Unnamed: 0,movieId,title,genres
5,6,Heat (1995),Action|Crime|Thriller
580,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
840,858,"Godfather, The (1972)",Crime|Drama
1062,1089,Reservoir Dogs (1992),Crime|Mystery|Thriller
1182,1213,Goodfellas (1990),Crime|Drama
1183,1214,Alien (1979),Horror|Sci-Fi
1207,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller
1249,1282,Fantasia (1940),Animation|Children|Fantasy|Musical
2480,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy


In [24]:
final_tags = tags_df[tags_df['movieId'].isin(movie_subset)]
final_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
51,87,109487,good science,1522676693
52,87,109487,Hans Zimmer,1522676679
53,87,109487,philosophical issues,1522676687
54,87,109487,sci-fi,1522676660
55,87,109487,science fiction,1522676703


In [25]:
test_row = final_tags[final_tags['movieId'] == 1666]
for tag in test_row['tag']:
    print(tag)

In [28]:
item_descs = {}
for item_count, item_id in enumerate(movie_subset):
    movie_df_row = movie_df[movie_df['movieId'] == item_id]
    # Set description as title
    title = movie_df_row['title'].item()
    item_desc = title + ", "
    # Append genres to the description
    genres = movie_df_row['genres'].item()
    genres = genres.split("|")
    for genre in genres:
        item_desc += (genre + ", ")

    # Append tags to the description
    movie_tags = final_tags[final_tags['movieId'] == item_id].drop_duplicates("tag")
    for i, aspect in enumerate(movie_tags['tag']):
        item_desc += aspect
        if (i > 15): # Cap the number of aspects at 15
            break
        if not (i == (len(movie_tags) - 1)):
            item_desc += ", "
    item_descs[str(item_count)] = {"description": item_desc, "name": title} # Using item_count instead of item_id 
print(item_descs)


{'0': {'description': "Heat (1995), Action, Crime, Thriller, imdb top 250, great acting, realistic action, suspense, Al Pacino, atmospheric, bank robbery, crime, Robert De Niro, tense, Val Kilmer, bank job, dialogue, heist, heist movie, long, Los Angeles, Michael Mann, visceral, bleak, slick, somber, witty, betrayal, cat and mouse, chase, crime epic, criminal mastermind, cult film, detective, ex-con, gang, honor, loner, los angeles, murder, neo-noir, obsession, one last job, robbery, thief, Recommendz Top Pick, gunfight, 1, overrated, philosophy, Natalie Portman, soundtrack, al pacino, electronic soundtrack, robert de niro, slow paced, blu-ray, mine, realistic, cliched, dumbed down, hollywoodization, lame, 7.5-FilmAffinity, es un poc de lo de sempre, too long, Ashley Judd, Bechdel Test:Fail, ensemble cast, police, relationships, individualism, heists, complex characters, setting:LA, who cares DVDs, CLV, Action, Crime, Heist, Jon Voight, Realistic, Third Act Problems, Al Pacino Vs Rober

In [29]:
import json

with open("./data/movielens_16_trimmed.json", "w") as output_file:
    json.dump(item_descs, output_file)