In [1]:
import pandas as pd
import numpy as np
movie_lens_path = "/Users/david/Documents/Research2324/Sanner/ml-25m/"

In [2]:
movie_df = pd.read_csv(movie_lens_path + "movies.csv")
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
tags_df = pd.read_csv(movie_lens_path + "tags.csv")
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [4]:
# Trim to movies with exactly 6 tags
counts = tags_df['movieId'].value_counts()
trimmed_tags = tags_df[tags_df['movieId'].isin(counts.index[counts == 6])]
trimmed_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
654,195,184685,impressive visuals,1545130234
655,195,184685,magic,1545130222
656,195,184685,mystery,1545130257
657,195,184685,tang dynasty,1545130187
1200,402,116803,china,1431427396


In [9]:
# USE THIS OPTION IF YOU WANT A RANDOM SAMPLE OF MOVIES WITH A SPECIFIC NUMBER OF TAGS

# Select 20 random movies from the trimmed set
np.random.seed(42)
random_movies = np.random.choice(trimmed_tags['movieId'].unique(), size=20)
movie_subset = random_movies


array([106156, 138912, 128279, 125607, 162232, 134095,  45942, 135037,
         7392, 152250, 188073,   1666, 164829,  95784,  82595, 155026,
       133443, 116584, 136992, 131745])

In [51]:
# USE THIS OPTION FOR HANDSELECTED MOVIES
selected_titles = [
    "Terminator, The (1984)", # 4 sci-fi movies
    "Matrix, The (1999)",
    "Interstellar (2014)",
    "Alien (1979)",
    "Tropic Thunder (2008)", # 4 comedy movies
    "Night at the Museum (2006)",
    "Shaun of the Dead (2004)",
    "Hot Fuzz (2007)",
]
movie_rows = movie_df[movie_df['title'].isin(selected_titles)]
movie_subset = movie_rows['movieId']
movie_rows

Unnamed: 0,movieId,title,genres
1183,1214,Alien (1979),Horror|Sci-Fi
1207,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller
2480,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
8159,8874,Shaun of the Dead (2004),Comedy|Horror
10943,46972,Night at the Museum (2006),Action|Comedy|Fantasy|IMAX
11407,51255,Hot Fuzz (2007),Action|Comedy|Crime|Mystery
12591,61132,Tropic Thunder (2008),Action|Adventure|Comedy|War
21199,109487,Interstellar (2014),Sci-Fi|IMAX


In [52]:
final_tags = tags_df[tags_df['movieId'].isin(movie_subset)]
final_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
51,87,109487,good science,1522676693
52,87,109487,Hans Zimmer,1522676679
53,87,109487,philosophical issues,1522676687
54,87,109487,sci-fi,1522676660
55,87,109487,science fiction,1522676703


In [53]:
test_row = final_tags[final_tags['movieId'] == 1666]
for tag in test_row['tag']:
    print(tag)

In [56]:
item_descs = {}
for item_id in movie_subset:
    item_desc = ""
    # Append genres to the description
    movie_df_row = movie_df[movie_df['movieId'] == item_id]
    title = movie_df_row['title'].item()
    genres = movie_df_row['genres'].item()
    genres = genres.split("|")
    for genre in genres:
        item_desc += (genre + ", ")

    # Append tags to the description
    movie_tags = final_tags[final_tags['movieId'] == item_id].drop_duplicates("tag")
    for i, aspect in enumerate(movie_tags['tag']):
        item_desc += aspect
        if (i > 10): # Cap the number of aspects at 10
            break
        if not (i == (len(movie_tags) - 1)):
            item_desc += ", "
    item_descs[str(item_id)] = {"description": item_desc, "name": title}
print(item_descs)


{'1214': {'description': 'Horror, Sci-Fi, imdb top 250, aliens, sci-fi, suspense, thriller, tense, horror, atmospheric, outer space, space, space travel, suspenseful', 'name': 'Alien (1979)'}, '1240': {'description': 'Action, Sci-Fi, Thriller, Action, artificial intelligence, future, time travel, imdb top 250, arnold, Arnold Schwarzenegger, classic, cyborgs, dystopic future, highly quotable, robots', 'name': 'Terminator, The (1984)'}, '2571': {'description': 'Action, Sci-Fi, Thriller, alternate reality, artificial intelligence, cyberpunk, dystopia, philosophical, philosophy, sci-fi, virtual reality, Futuristic, post apocalyptic, surreal, thought-provoking', 'name': 'Matrix, The (1999)'}, '8874': {'description': 'Comedy, Horror, Simon Pegg, Very British and very funny!, zombies, black comedy, British, british comedy, comedy, dark comedy, dark humor, Edgar Wright, funny, hilarious', 'name': 'Shaun of the Dead (2004)'}, '46972': {'description': "Action, Comedy, Fantasy, IMAX, based on chi

In [57]:
import json

with open("./data/movielens_2.json", "w") as output_file:
    json.dump(item_descs, output_file)