In [184]:
import pandas as pd
import numpy as np

In [1]:


# Enter path to data folder on your device. 
# Data can be downloaded from https://grouplens.org/datasets/movielens/ as "ml-25m.zip" under MovieLens 25M Dataset
# ~62K movies in total
movie_lens_path = "/Users/david/Documents/Research2324/Sanner/ml-25m/"

In [73]:
movie_lens_path = "C:/Users/anton/source/data/SIGIR24/MovieLens25M/"

In [161]:
movie_df = pd.read_csv(movie_lens_path + "movies.csv")

#Make 'genres' column comma seperated, not '|' seperated
movie_df['genres'] = movie_df['genres'].str.replace('|', ', ', regex=False)

movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy"
1,2,Jumanji (1995),"Adventure, Children, Fantasy"
2,3,Grumpier Old Men (1995),"Comedy, Romance"
3,4,Waiting to Exhale (1995),"Comedy, Drama, Romance"
4,5,Father of the Bride Part II (1995),Comedy


In [75]:
tags_df = pd.read_csv(movie_lens_path + "tags.csv")
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


### Filter movies be in top 10% rating count and have 20+ tags

In [76]:
#select movies with top 10% rating count
ratings_df = pd.read_csv(movie_lens_path + 'ratings.csv')
rating_cnt_df = ratings_df['movieId'].value_counts()

top_10_perc_cnt = int(len(movie_df)*0.1)

top_10_perc_rating_cnts = rating_cnt_df.head(top_10_perc_cnt)

In [77]:
top_10_perc_rating_cnts 

movieId
356       81491
318       81482
296       79672
593       74127
2571      72674
          ...  
61255       366
71619       366
58839       366
134528      366
8799        366
Name: count, Length: 6242, dtype: int64

In [78]:
top_10_percent_movie_ids = set(top_10_perc_rating_cnts.index)

In [93]:
#select movies with at least 20 unique tags

# Filter 'tags_df' for movies in 'top_10_percent_movie_ids'
filtered_tags_df = tags_df[tags_df['movieId'].isin(top_10_percent_movie_ids)]

#Group by 'movieId' and count unique tags
unique_tag_counts = filtered_tags_df.groupby('movieId')['tag'].nunique()

#Filter for movies with at least 20 unique tags
movies_20_plus_tags = unique_tag_counts[unique_tag_counts >= 20].index.tolist()

In [82]:
#gives 4425 movies (using MovieLens25M)
len(movies_20_plus_tags)

4425

### Randomly select 100 movies and get their top 20 most frequent tags

In [83]:
import random

In [162]:
random.seed(42)
random_100_movie_ids = random.sample(movies_20_plus_tags, 100)

In [163]:
#Filter 'tags_df' for these 100 movies
filtered_100_tags_df = tags_df[tags_df['movieId'].isin(random_100_movie_ids)]

#tag counts for each movie:
grouped_tags = filtered_100_tags_df.groupby(['movieId', 'tag']).size().reset_index(name='count')

# for each movie, select the top 20 tags
top_20_tags_per_movie = grouped_tags.groupby('movieId').apply(lambda x: x.nlargest(20, 'count')).reset_index(drop = True)

In [164]:
top_20_tags_per_movie

Unnamed: 0,movieId,tag,count
0,123,stylized,17
1,123,dreamlike,12
2,123,loneliness,12
3,123,reflective,12
4,123,Hong Kong,9
...,...,...,...
1995,197175,badass girl,5
1996,197175,cliched,5
1997,197175,romance,5
1998,197175,uncanny valley,5


In [167]:
# Create a DataFrame from the 'top_20_tags_per_movie' with 'movieId' as index and concatenated tags as values
tags_concatenated = top_20_tags_per_movie.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index(name='top_20_tags')

#Merge the concatenated tags with the filtered_movie_df
filtered_movie_df = movie_df[movie_df['movieId'].isin(random_100_movie_ids)]
filtered_100_df = pd.merge(filtered_movie_df, tags_concatenated, on='movieId', how='left')

#randomize rows
filtered_100_df = filtered_100_df.sample(frac=1).reset_index(drop=True)

In [168]:
filtered_100_df

Unnamed: 0,movieId,title,genres,top_20_tags
0,184471,Tomb Raider (2018),"Action, Adventure, Fantasy","adventure, Alicia Vikander, video game adaptat..."
1,53125,Pirates of the Caribbean: At World's End (2007),"Action, Adventure, Comedy, Fantasy","Johnny Depp, pirates, adventure, Keira Knightl..."
2,942,Laura (1944),"Crime, Film-Noir, Mystery","film noir, black and white, Vincent Price, mur..."
3,2022,"Last Temptation of Christ, The (1988)",Drama,"jesus, christian, Martin Scorsese, based on a ..."
4,95207,Abraham Lincoln: Vampire Hunter (2012),"Action, Fantasy, Horror, Thriller","vampires, American Civil War, Predictable, Stu..."
...,...,...,...,...
95,4688,Black Robe (1991),"Adventure, Drama","17th century, Bruce Beresford, colonialism, Ca..."
96,49396,Tenacious D in The Pick of Destiny (2006),"Adventure, Comedy, Musical","Jack Black, Kyle Gass, comedy, rock and roll, ..."
97,64993,5 Centimeters per Second (Byôsoku 5 senchimêto...,"Animation, Drama, Romance","melancholic, bittersweet, visually stunning, a..."
98,49649,Eragon (2006),"Action, Adventure, Fantasy","dragons, bad acting, cliche, based on a book, ..."


### Make JSON File

In [180]:
movies_dict = {}

for idx, row in filtered_100_df.iterrows():
    description = f" -Movie Title: {row['title']} \n -Genres: {row['genres']} \n -Tags: {row['top_20_tags']}"
    movies_dict[str(idx)] = {'description': description}

In [183]:
import json

with open('data/ml25M_100_movie_sample.json', 'w') as json_file:
    json.dump(movies_dict, json_file, indent=4)

### Old code

In [11]:
# USE THIS OPTION IF YOU WANT A RANDOM SAMPLE OF MOVIES WITH A SPECIFIC NUMBER OF TAGS

# Select 20 random movies from the trimmed set
np.random.seed(42)
random_movies = np.random.choice(trimmed_tags['movieId'].unique(), size=20)
movie_subset = random_movies


In [23]:
# USE THIS OPTION FOR HANDSELECTED MOVIES
selected_titles = [
    "Terminator, The (1984)", # 4 sci-fi movies
    "Matrix, The (1999)",
    "Interstellar (2014)",
    "Alien (1979)",
    "Tropic Thunder (2008)", # 4 comedy movies
    "Night at the Museum (2006)",
    "Shaun of the Dead (2004)",
    "Hot Fuzz (2007)",
    "Godfather, The (1972)", # 4 crime movies
    "Heat (1995)",
    "Goodfellas (1990)",
    "Reservoir Dogs (1992)",
    "Fantasia (1940)", # 4 animated movies
    "Toy Story 2 (1999)",
    "Aladdin (1992)",
    "Shrek (2001)",
]
movie_rows = movie_df[movie_df['title'].isin(selected_titles)]
movie_rows = movie_rows.drop_duplicates(subset=['title'])
movie_subset = movie_rows['movieId']
movie_rows

Unnamed: 0,movieId,title,genres
5,6,Heat (1995),Action|Crime|Thriller
580,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
840,858,"Godfather, The (1972)",Crime|Drama
1062,1089,Reservoir Dogs (1992),Crime|Mystery|Thriller
1182,1213,Goodfellas (1990),Crime|Drama
1183,1214,Alien (1979),Horror|Sci-Fi
1207,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller
1249,1282,Fantasia (1940),Animation|Children|Fantasy|Musical
2480,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy


In [24]:
final_tags = tags_df[tags_df['movieId'].isin(movie_subset)]
final_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
51,87,109487,good science,1522676693
52,87,109487,Hans Zimmer,1522676679
53,87,109487,philosophical issues,1522676687
54,87,109487,sci-fi,1522676660
55,87,109487,science fiction,1522676703


In [25]:
test_row = final_tags[final_tags['movieId'] == 1666]
for tag in test_row['tag']:
    print(tag)

In [28]:
item_descs = {}
for item_count, item_id in enumerate(movie_subset):
    movie_df_row = movie_df[movie_df['movieId'] == item_id]
    # Set description as title
    title = movie_df_row['title'].item()
    item_desc = title + ", "
    # Append genres to the description
    genres = movie_df_row['genres'].item()
    genres = genres.split("|")
    for genre in genres:
        item_desc += (genre + ", ")

    # Append tags to the description
    movie_tags = final_tags[final_tags['movieId'] == item_id].drop_duplicates("tag")
    for i, aspect in enumerate(movie_tags['tag']):
        item_desc += aspect
        if (i > 15): # Cap the number of aspects at 15
            break
        if not (i == (len(movie_tags) - 1)):
            item_desc += ", "
    item_descs[str(item_count)] = {"description": item_desc, "name": title} # Using item_count instead of item_id 
print(item_descs)


{'0': {'description': "Heat (1995), Action, Crime, Thriller, imdb top 250, great acting, realistic action, suspense, Al Pacino, atmospheric, bank robbery, crime, Robert De Niro, tense, Val Kilmer, bank job, dialogue, heist, heist movie, long, Los Angeles, Michael Mann, visceral, bleak, slick, somber, witty, betrayal, cat and mouse, chase, crime epic, criminal mastermind, cult film, detective, ex-con, gang, honor, loner, los angeles, murder, neo-noir, obsession, one last job, robbery, thief, Recommendz Top Pick, gunfight, 1, overrated, philosophy, Natalie Portman, soundtrack, al pacino, electronic soundtrack, robert de niro, slow paced, blu-ray, mine, realistic, cliched, dumbed down, hollywoodization, lame, 7.5-FilmAffinity, es un poc de lo de sempre, too long, Ashley Judd, Bechdel Test:Fail, ensemble cast, police, relationships, individualism, heists, complex characters, setting:LA, who cares DVDs, CLV, Action, Crime, Heist, Jon Voight, Realistic, Third Act Problems, Al Pacino Vs Rober

In [29]:
import json

with open("./data/movielens_16_trimmed.json", "w") as output_file:
    json.dump(item_descs, output_file)