In [77]:
import pandas as pd

In [78]:
ml_m = pd.read_csv("./data/ml-20m/movies.csv")
ml_r = pd.read_csv("./data/ml-20m/ratings.csv")

In [79]:
ml_m.sample(1)

Unnamed: 0,movieId,title,genres
7300,7412,"Cat and the Canary, The (1978)",Comedy|Horror|Mystery


In [80]:
ml_r.sample(1)

Unnamed: 0,userId,movieId,rating,timestamp
9475061,65529,346,1.0,1296711070


In [81]:
# TODO: MovieLens single KG
'''
User: userId
Item: movieId & title
Attributes: genres, release_year
Relation: rating
'''
ml_kg = pd.merge(ml_r, ml_m, on=["movieId"], how="inner")

In [82]:
ml_kg.drop(labels=["timestamp"], axis=1, inplace=True)

In [83]:
ml_kg

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
3,29,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
4,34,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
...,...,...,...,...,...
20000258,138301,121017,3.5,The Gentleman from Epsom (1962),Comedy|Crime
20000259,138301,121019,4.5,The Great Spy Chase (1964),Action|Comedy|Thriller
20000260,138301,121021,4.5,Taxi for Tobruk (1961),Drama|War
20000261,138406,110167,4.5,"Judge and the Assassin, The (Juge et l'assassi...",Crime|Drama


In [84]:
ml_kg = ml_kg[~ml_kg["title"].str.contains(r"1975-1979", regex=True)]

In [85]:
ml_kg = ml_kg[~ml_kg["title"].str.contains(r"2007-", regex=True)]

In [86]:
ml_kg[ml_kg["title"].str.contains(r"Frankenstein$", regex=True)]

Unnamed: 0,userId,movieId,rating,title,genres
19957051,11288,115685,4.0,National Theatre Live: Frankenstein,Drama|Fantasy
19957052,32632,115685,5.0,National Theatre Live: Frankenstein,Drama|Fantasy
19957053,42506,115685,4.5,National Theatre Live: Frankenstein,Drama|Fantasy
19957054,90256,115685,5.0,National Theatre Live: Frankenstein,Drama|Fantasy
19957055,111963,115685,4.0,National Theatre Live: Frankenstein,Drama|Fantasy
19957056,119157,115685,1.0,National Theatre Live: Frankenstein,Drama|Fantasy
19957057,119465,115685,0.5,National Theatre Live: Frankenstein,Drama|Fantasy


In [87]:
def get_year(x:str):
    try:
        return str(int(x.split()[-1].strip("\(").strip("\)")) // 10 * 10) + "\'s"
    except:
        return ""

ml_kg["release_year"] = ml_kg["title"].apply(lambda x:get_year(x))

In [88]:
def p_title(x:str):
    try:
        return " ".join(x.split()[:-1])
    except:
        return x

ml_kg["title"] = ml_kg["title"].apply(lambda x:p_title(x))

In [89]:
ml_kg['genres'] = ml_kg['genres'].str.split("|")

In [90]:
ml_kg_neo4j_data = ml_kg.explode('genres')

In [91]:
ml_kg_neo4j_data.genres.unique()

array(['Adventure', 'Children', 'Fantasy', 'Drama', 'Mystery', 'Sci-Fi',
       'Thriller', 'Crime', 'Action', 'Comedy', 'Romance', 'War',
       'Horror', 'Musical', 'Western', 'Animation', 'IMAX', 'Film-Noir',
       'Documentary', '(no genres listed)'], dtype=object)

In [92]:
ml_kg_neo4j_data.sort_values(by=["userId", "movieId", "rating"], ascending=(True, True, False), inplace=True)

In [93]:
ml_kg_neo4j_data["userId"] = ml_kg_neo4j_data["userId"].apply(lambda x:"ML_" + str(x))

In [94]:
ml_kg_neo4j_data

Unnamed: 0,userId,movieId,rating,title,genres,release_year
0,ML_1,2,3.5,Jumanji,Adventure,1990's
0,ML_1,2,3.5,Jumanji,Children,1990's
0,ML_1,2,3.5,Jumanji,Fantasy,1990's
22243,ML_1,29,3.5,"City of Lost Children, The (Cité des enfants p...",Adventure,1990's
22243,ML_1,29,3.5,"City of Lost Children, The (Cité des enfants p...",Drama,1990's
...,...,...,...,...,...,...
17724424,ML_138493,69644,3.0,Ice Age: Dawn of the Dinosaurs,Romance,2000's
13662983,ML_138493,70286,5.0,District 9,Mystery,2000's
13662983,ML_138493,70286,5.0,District 9,Sci-Fi,2000's
13662983,ML_138493,70286,5.0,District 9,Thriller,2000's


In [95]:
ratio = 0.1

import random
random.seed(1004)

uid_list = list(ml_kg_neo4j_data.userId.unique())

sample_uid_list = random.sample(uid_list, round(len(uid_list) * ratio))

sampled_ml_kg = ml_kg_neo4j_data[ml_kg_neo4j_data['userId'].isin(sample_uid_list)]

sampled_ml_kg

Unnamed: 0,userId,movieId,rating,title,genres,release_year
3070000,ML_6,1,5.0,Toy Story,Adventure,1990's
3070000,ML_6,1,5.0,Toy Story,Animation,1990's
3070000,ML_6,1,5.0,Toy Story,Children,1990's
3070000,ML_6,1,5.0,Toy Story,Comedy,1990's
3070000,ML_6,1,5.0,Toy Story,Fantasy,1990's
...,...,...,...,...,...,...
16730404,ML_138471,97913,3.5,Wreck-It Ralph,Animation,2010's
16730404,ML_138471,97913,3.5,Wreck-It Ralph,Comedy,2010's
13772847,ML_138471,98809,3.5,"Hobbit: An Unexpected Journey, The",Adventure,2010's
13772847,ML_138471,98809,3.5,"Hobbit: An Unexpected Journey, The",Fantasy,2010's


In [96]:
sampled_ml_kg.to_csv("./data/ml_kg_neo4j_data.csv", index=False)

In [97]:
ratio = 0.00004

In [98]:
import random
random.seed(1004)

uid_list = list(ml_kg_neo4j_data.userId.unique())

sample_uid_list = random.sample(uid_list, round(len(uid_list) * ratio))

In [99]:
sample_ml_kg_neo4j_data = ml_kg_neo4j_data[ml_kg_neo4j_data['userId'].isin(sample_uid_list)]

In [100]:
sample_ml_kg_neo4j_data

Unnamed: 0,userId,movieId,rating,title,genres,release_year
3079982,ML_27876,1,3.0,Toy Story,Adventure,1990's
3079982,ML_27876,1,3.0,Toy Story,Animation,1990's
3079982,ML_27876,1,3.0,Toy Story,Children,1990's
3079982,ML_27876,1,3.0,Toy Story,Comedy,1990's
3079982,ML_27876,1,3.0,Toy Story,Fantasy,1990's
...,...,...,...,...,...,...
15856287,ML_133396,51255,4.5,Hot Fuzz,Mystery,2000's
13725867,ML_133396,81834,5.0,Harry Potter and the Deathly Hallows: Part 1,Action,2010's
13725867,ML_133396,81834,5.0,Harry Potter and the Deathly Hallows: Part 1,Adventure,2010's
13725867,ML_133396,81834,5.0,Harry Potter and the Deathly Hallows: Part 1,Fantasy,2010's


In [101]:
sample_ml_kg_neo4j_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1449 entries, 3079982 to 13725867
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   userId        1449 non-null   object 
 1   movieId       1449 non-null   int64  
 2   rating        1449 non-null   float64
 3   title         1449 non-null   object 
 4   genres        1449 non-null   object 
 5   release_year  1449 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 79.2+ KB


In [102]:
sample_ml_kg_neo4j_data.to_csv("./data/sample_ml_kg_neo4j_data.csv", index=False)

In [103]:
movielens_tmdb_df = pd.read_csv("./data/movielens_tmdb.csv")

In [104]:
movielens_tmdb_df.sample(1)

Unnamed: 0,movieId,title,release_year,origin_country,Writer,Original Story,Original Film Writer,Director,cast
27479,69744,The Dark Backward,1991.0,Null,Adam Rifkin,,,Adam Rifkin,James Caan


In [105]:
sampled_ml_kg

Unnamed: 0,userId,movieId,rating,title,genres,release_year
3070000,ML_6,1,5.0,Toy Story,Adventure,1990's
3070000,ML_6,1,5.0,Toy Story,Animation,1990's
3070000,ML_6,1,5.0,Toy Story,Children,1990's
3070000,ML_6,1,5.0,Toy Story,Comedy,1990's
3070000,ML_6,1,5.0,Toy Story,Fantasy,1990's
...,...,...,...,...,...,...
16730404,ML_138471,97913,3.5,Wreck-It Ralph,Animation,2010's
16730404,ML_138471,97913,3.5,Wreck-It Ralph,Comedy,2010's
13772847,ML_138471,98809,3.5,"Hobbit: An Unexpected Journey, The",Adventure,2010's
13772847,ML_138471,98809,3.5,"Hobbit: An Unexpected Journey, The",Fantasy,2010's


In [106]:
mt_kg_neo4j_data = pd.read_csv("./data/mt_kg_neo4j_data.csv")
sampled_ml_kg_neo4j_data = pd.read_csv("./data/ml_kg_neo4j_data.csv")
ym_kg_neo4j_data = pd.read_csv("./data/ym_kg_neo4j_data.csv")

In [107]:
ml_left_join_else_data = pd.merge(sampled_ml_kg_neo4j_data, mt_kg_neo4j_data, on=["title", "release_year"], how="left", suffixes=("_ml", "_mt"))

In [108]:
ml_left_join_else_data = ml_left_join_else_data[["userId_ml", "movieId_ml", "rating_ml", "title", "genres_ml", "genres_mt", "release_year"]]

In [109]:
ml_left_join_else_data

Unnamed: 0,userId_ml,movieId_ml,rating_ml,title,genres_ml,genres_mt,release_year
0,ML_6,1,5.0,Toy Story,Adventure,Animation,1990's
1,ML_6,1,5.0,Toy Story,Adventure,Adventure,1990's
2,ML_6,1,5.0,Toy Story,Adventure,Comedy,1990's
3,ML_6,1,5.0,Toy Story,Adventure,Family,1990's
4,ML_6,1,5.0,Toy Story,Adventure,Fantasy,1990's
...,...,...,...,...,...,...,...
413509765,ML_138471,97913,3.5,Wreck-It Ralph,Comedy,Comedy,2010's
413509766,ML_138471,97913,3.5,Wreck-It Ralph,Comedy,Family,2010's
413509767,ML_138471,98809,3.5,"Hobbit: An Unexpected Journey, The",Adventure,,2010's
413509768,ML_138471,98809,3.5,"Hobbit: An Unexpected Journey, The",Fantasy,,2010's


In [110]:
ml_left_join_else_data.userId_ml.nunique()

13849

In [111]:
ml_left_join_else_data.movieId_ml.nunique()

17045

In [112]:
ml_left_join_else_data.drop_duplicates(subset=["userId_ml", "movieId_ml"])
# 전체 20M 중에서 sampling한 interaction 수

Unnamed: 0,userId_ml,movieId_ml,rating_ml,title,genres_ml,genres_mt,release_year
0,ML_6,1,5.0,Toy Story,Adventure,Animation,1990's
1400,ML_6,3,3.0,Grumpier Old Men,Comedy,Comedy,1990's
1404,ML_6,7,5.0,Sabrina,Comedy,Comedy,1990's
1420,ML_6,17,5.0,Sense and Sensibility,Drama,Drama,1990's
1440,ML_6,52,5.0,Mighty Aphrodite,Comedy,Comedy,1990's
...,...,...,...,...,...,...,...
413500883,ML_138471,91630,4.0,Mission: Impossible - Ghost Protocol,Action,Action,2010's
413501539,ML_138471,94780,4.0,Snow White and the Huntsman,Action,Action,2010's
413502799,ML_138471,95167,4.0,Brave,Action,Animation,2010's
413506111,ML_138471,97913,3.5,Wreck-It Ralph,Animation,Animation,2010's


In [113]:
ml_left_join_else_data = ml_left_join_else_data.rename(columns={"userId_ml":"userId", "movieId_ml":"movieId", "rating_ml":"rating"})

In [114]:
ml_left_join_else_data

Unnamed: 0,userId,movieId,rating,title,genres_ml,genres_mt,release_year
0,ML_6,1,5.0,Toy Story,Adventure,Animation,1990's
1,ML_6,1,5.0,Toy Story,Adventure,Adventure,1990's
2,ML_6,1,5.0,Toy Story,Adventure,Comedy,1990's
3,ML_6,1,5.0,Toy Story,Adventure,Family,1990's
4,ML_6,1,5.0,Toy Story,Adventure,Fantasy,1990's
...,...,...,...,...,...,...,...
413509765,ML_138471,97913,3.5,Wreck-It Ralph,Comedy,Comedy,2010's
413509766,ML_138471,97913,3.5,Wreck-It Ralph,Comedy,Family,2010's
413509767,ML_138471,98809,3.5,"Hobbit: An Unexpected Journey, The",Adventure,,2010's
413509768,ML_138471,98809,3.5,"Hobbit: An Unexpected Journey, The",Fantasy,,2010's


In [115]:
ml_left_join_else_data.drop_duplicates(inplace=True)

KeyboardInterrupt: 

In [None]:
ml_left_join_else_data.genres_mt.isna().sum()

51758

In [None]:
ml_left_join_else_data = ml_left_join_else_data[~ml_left_join_else_data.genres_mt.isna()]

In [None]:
ml_left_join_else_data

Unnamed: 0,userId,movieId,rating,title,genres_ml,genres_mt,release_year
0,ML_1635,1,3.0,Toy Story,Adventure,Animation,1990's
1,ML_1635,1,3.0,Toy Story,Adventure,Adventure,1990's
2,ML_1635,1,3.0,Toy Story,Adventure,Comedy,1990's
3,ML_1635,1,3.0,Toy Story,Adventure,Family,1990's
4,ML_1635,1,3.0,Toy Story,Adventure,Fantasy,1990's
...,...,...,...,...,...,...,...
9974097,ML_137662,84944,3.5,Rango,Western,Western,2010's
9974306,ML_137662,85367,3.0,Just Go with It,Comedy,Comedy,2010's
9974307,ML_137662,85367,3.0,Just Go with It,Comedy,Romance,2010's
9974368,ML_137662,85367,3.0,Just Go with It,Romance,Comedy,2010's


In [None]:
ml_left_join_else_data = pd.merge(ml_left_join_else_data, ym_kg_neo4j_data, on=["title", "release_year"], how="left", suffixes=("_ml", "_ym"))

In [None]:
ml_left_join_else_data = ml_left_join_else_data[["userId_ml", "movieId_ml", "title", "rating_ml", "genres_ml", "genres_mt", "genres", "release_year", "distributor"]]

In [None]:
ml_left_join_else_data = ml_left_join_else_data.rename(columns={"userId_ml":"userId", "movieId_ml":"movieId", "rating_ml":"rating", "genres":"genres_ym"})

In [None]:
ml_left_join_else_data.drop_duplicates(inplace=True)

In [None]:
ml_left_join_else_data.genres_ym.isna().sum()

124580

In [None]:
ml_left_join_else_data = ml_left_join_else_data[~ml_left_join_else_data.genres_ym.isna()]

In [None]:
ml_left_join_else_data

Unnamed: 0,userId,movieId,title,rating,genres_ml,genres_mt,genres_ym,release_year,distributor
0,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa
1,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa
2,ML_1635,1,Toy Story,3.0,Adventure,Animation,Family,1990's,Buena Vista Distribution Compa
807,ML_1635,1,Toy Story,3.0,Adventure,Adventure,Comedy,1990's,Buena Vista Distribution Compa
808,ML_1635,1,Toy Story,3.0,Adventure,Adventure,Kids,1990's,Buena Vista Distribution Compa
...,...,...,...,...,...,...,...,...,...
38297304,ML_137662,3735,Serpico,4.5,Crime,Crime,Drama,1970's,
38297327,ML_137662,3735,Serpico,4.5,Crime,Drama,Drama,1970's,
38297350,ML_137662,3735,Serpico,4.5,Drama,Biography,Drama,1970's,
38297373,ML_137662,3735,Serpico,4.5,Drama,Crime,Drama,1970's,


In [None]:
ml_left_join_else_data.drop_duplicates(inplace=True)

In [None]:
ml_left_join_else_data

Unnamed: 0,userId,movieId,title,rating,genres_ml,genres_mt,genres_ym,release_year,distributor
0,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa
1,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa
2,ML_1635,1,Toy Story,3.0,Adventure,Animation,Family,1990's,Buena Vista Distribution Compa
807,ML_1635,1,Toy Story,3.0,Adventure,Adventure,Comedy,1990's,Buena Vista Distribution Compa
808,ML_1635,1,Toy Story,3.0,Adventure,Adventure,Kids,1990's,Buena Vista Distribution Compa
...,...,...,...,...,...,...,...,...,...
38297304,ML_137662,3735,Serpico,4.5,Crime,Crime,Drama,1970's,
38297327,ML_137662,3735,Serpico,4.5,Crime,Drama,Drama,1970's,
38297350,ML_137662,3735,Serpico,4.5,Drama,Biography,Drama,1970's,
38297373,ML_137662,3735,Serpico,4.5,Drama,Crime,Drama,1970's,


In [None]:
import copy
ml_left_join_else_data_ = copy.deepcopy(ml_left_join_else_data)

In [None]:
def remove_duplicate_rows(row):
    if row['genres_ml'] == row['genres_mt'] or row['genres_ml'] == row['genres_ym'] or row['genres_mt'] == row['genres_ym']:
        return 0
    return 1

ml_left_join_else_data['duplicate_flag'] = ml_left_join_else_data.apply(remove_duplicate_rows, axis=1)
df_filtered = ml_left_join_else_data[ml_left_join_else_data['duplicate_flag'] == 1]

In [None]:
df_filtered = df_filtered.drop(columns=['duplicate_flag'])
df_filtered

Unnamed: 0,userId,movieId,title,rating,genres_ml,genres_mt,genres_ym,release_year,distributor
0,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa
1,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa
2,ML_1635,1,Toy Story,3.0,Adventure,Animation,Family,1990's,Buena Vista Distribution Compa
1615,ML_1635,1,Toy Story,3.0,Adventure,Comedy,Kids,1990's,Buena Vista Distribution Compa
1616,ML_1635,1,Toy Story,3.0,Adventure,Comedy,Family,1990's,Buena Vista Distribution Compa
...,...,...,...,...,...,...,...,...,...
38296965,ML_137662,379,Timecop,2.0,Thriller,Romance,Fantasy,1990's,
38297002,ML_137662,379,Timecop,2.0,Thriller,Sci-Fi,Science Fiction,1990's,
38297003,ML_137662,379,Timecop,2.0,Thriller,Sci-Fi,Fantasy,1990's,
38297248,ML_137662,3247,Sister Act,3.5,Crime,Music,Comedy,1990's,


In [None]:
df_filtered["sorted_genres"] = df_filtered[['genres_ml', 'genres_mt', 'genres_ym']].apply(sorted, axis=1)

In [None]:
df_filtered

Unnamed: 0,userId,movieId,title,rating,genres_ml,genres_mt,genres_ym,release_year,distributor,sorted_genres
0,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa,"[Adventure, Animation, Comedy]"
1,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa,"[Adventure, Animation, Kids]"
2,ML_1635,1,Toy Story,3.0,Adventure,Animation,Family,1990's,Buena Vista Distribution Compa,"[Adventure, Animation, Family]"
1615,ML_1635,1,Toy Story,3.0,Adventure,Comedy,Kids,1990's,Buena Vista Distribution Compa,"[Adventure, Comedy, Kids]"
1616,ML_1635,1,Toy Story,3.0,Adventure,Comedy,Family,1990's,Buena Vista Distribution Compa,"[Adventure, Comedy, Family]"
...,...,...,...,...,...,...,...,...,...,...
38296965,ML_137662,379,Timecop,2.0,Thriller,Romance,Fantasy,1990's,,"[Fantasy, Romance, Thriller]"
38297002,ML_137662,379,Timecop,2.0,Thriller,Sci-Fi,Science Fiction,1990's,,"[Sci-Fi, Science Fiction, Thriller]"
38297003,ML_137662,379,Timecop,2.0,Thriller,Sci-Fi,Fantasy,1990's,,"[Fantasy, Sci-Fi, Thriller]"
38297248,ML_137662,3247,Sister Act,3.5,Crime,Music,Comedy,1990's,,"[Comedy, Crime, Music]"


In [None]:
# 중복 행을 제거하기 위한 함수 정의
def remove_duplicate_rows(row):
    return "|".join(row['sorted_genres'])

# 중복 행을 제거하는 코드 적용
df_filtered['unique_genres'] = df_filtered.apply(remove_duplicate_rows, axis=1)
df_filtered["sorted_genres"] = df_filtered["sorted_genres"].apply(lambda x:"|".join(x))
df_filtered = df_filtered.drop_duplicates(subset=['unique_genres', 'userId', 'movieId', 'title', 'rating', 'release_year'])

In [None]:
df_filtered

Unnamed: 0,userId,movieId,title,rating,genres_ml,genres_mt,genres_ym,release_year,distributor,sorted_genres,unique_genres
0,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa,Adventure|Animation|Comedy,Adventure|Animation|Comedy
1,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa,Adventure|Animation|Kids,Adventure|Animation|Kids
2,ML_1635,1,Toy Story,3.0,Adventure,Animation,Family,1990's,Buena Vista Distribution Compa,Adventure|Animation|Family,Adventure|Animation|Family
1615,ML_1635,1,Toy Story,3.0,Adventure,Comedy,Kids,1990's,Buena Vista Distribution Compa,Adventure|Comedy|Kids,Adventure|Comedy|Kids
1616,ML_1635,1,Toy Story,3.0,Adventure,Comedy,Family,1990's,Buena Vista Distribution Compa,Adventure|Comedy|Family,Adventure|Comedy|Family
...,...,...,...,...,...,...,...,...,...,...,...
38296889,ML_137662,379,Timecop,2.0,Sci-Fi,Thriller,Fantasy,1990's,,Fantasy|Sci-Fi|Thriller,Fantasy|Sci-Fi|Thriller
38296964,ML_137662,379,Timecop,2.0,Thriller,Romance,Science Fiction,1990's,,Romance|Science Fiction|Thriller,Romance|Science Fiction|Thriller
38296965,ML_137662,379,Timecop,2.0,Thriller,Romance,Fantasy,1990's,,Fantasy|Romance|Thriller,Fantasy|Romance|Thriller
38297248,ML_137662,3247,Sister Act,3.5,Crime,Music,Comedy,1990's,,Comedy|Crime|Music,Comedy|Crime|Music


In [None]:
df_filtered = df_filtered.drop(columns=['sorted_genres', 'unique_genres'])
df_filtered

Unnamed: 0,userId,movieId,title,rating,genres_ml,genres_mt,genres_ym,release_year,distributor
0,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa
1,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa
2,ML_1635,1,Toy Story,3.0,Adventure,Animation,Family,1990's,Buena Vista Distribution Compa
1615,ML_1635,1,Toy Story,3.0,Adventure,Comedy,Kids,1990's,Buena Vista Distribution Compa
1616,ML_1635,1,Toy Story,3.0,Adventure,Comedy,Family,1990's,Buena Vista Distribution Compa
...,...,...,...,...,...,...,...,...,...
38296889,ML_137662,379,Timecop,2.0,Sci-Fi,Thriller,Fantasy,1990's,
38296964,ML_137662,379,Timecop,2.0,Thriller,Romance,Science Fiction,1990's,
38296965,ML_137662,379,Timecop,2.0,Thriller,Romance,Fantasy,1990's,
38297248,ML_137662,3247,Sister Act,3.5,Crime,Music,Comedy,1990's,


In [None]:
df_filtered.to_csv("./data/ml_left_join_else_data.csv", index=False)

In [None]:
def ry(x):
    try:
        return str(int(x // 10 * 10)) + "\'s"
    except:
        return ""

movielens_tmdb_df['release_year'] = movielens_tmdb_df['release_year'].apply(lambda x:ry(x))

In [None]:
df_filtered

Unnamed: 0,userId,movieId,title,rating,genres_ml,genres_mt,genres_ym,release_year,distributor
0,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa
1,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa
2,ML_1635,1,Toy Story,3.0,Adventure,Animation,Family,1990's,Buena Vista Distribution Compa
1615,ML_1635,1,Toy Story,3.0,Adventure,Comedy,Kids,1990's,Buena Vista Distribution Compa
1616,ML_1635,1,Toy Story,3.0,Adventure,Comedy,Family,1990's,Buena Vista Distribution Compa
...,...,...,...,...,...,...,...,...,...
38296889,ML_137662,379,Timecop,2.0,Sci-Fi,Thriller,Fantasy,1990's,
38296964,ML_137662,379,Timecop,2.0,Thriller,Romance,Science Fiction,1990's,
38296965,ML_137662,379,Timecop,2.0,Thriller,Romance,Fantasy,1990's,
38297248,ML_137662,3247,Sister Act,3.5,Crime,Music,Comedy,1990's,


In [None]:
tmdb_ml_left_join_else_data = pd.merge(df_filtered, movielens_tmdb_df, how="left", on=["title", 'release_year']).drop_duplicates()

In [None]:
tmdb_ml_left_join_else_data.drop(labels=["movieId_y"], inplace=True, axis=1)
tmdb_ml_left_join_else_data.rename(columns={"movieId_x":"movieId"}, inplace=True)

In [None]:
tmdb_ml_left_join_else_data = tmdb_ml_left_join_else_data.drop_duplicates()

In [None]:
tmdb_ml_left_join_else_data

Unnamed: 0,userId,movieId,title,rating,genres_ml,genres_mt,genres_ym,release_year,distributor,origin_country,Writer,Original Story,Original Film Writer,Director,cast
0,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Tom Hanks
1,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Annie Potts
2,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Tom Hanks
3,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Annie Potts
4,ML_1635,1,Toy Story,3.0,Adventure,Animation,Family,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Tom Hanks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249619,ML_137662,379,Timecop,2.0,Thriller,Romance,Fantasy,1990's,,US,,,,Peter Hyams,Bruce McGill
249620,ML_137662,3247,Sister Act,3.5,Crime,Music,Comedy,1990's,,US,Paul Rudnick,,,Emile Ardolino,Harvey Keitel
249621,ML_137662,3247,Sister Act,3.5,Crime,Music,Comedy,1990's,,US,Paul Rudnick,,,Emile Ardolino,Maggie Smith
249622,ML_137662,3735,Serpico,4.5,Crime,Biography,Drama,1970's,,,,,,Sidney Lumet,Al Pacino


In [None]:
tmdb_ml_left_join_else_data.drop_duplicates(subset=["userId", "movieId"])

Unnamed: 0,userId,movieId,title,rating,genres_ml,genres_mt,genres_ym,release_year,distributor,origin_country,Writer,Original Story,Original Film Writer,Director,cast
0,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Tom Hanks
62,ML_1635,2,Jumanji,3.0,Adventure,Family,Kids,1990's,,US,,,,Joe Johnston,Robin Williams
92,ML_1635,22,Copycat,2.0,Crime,Mystery,Drama,1990's,,US,,,,Jon Amiel,Sigourney Weaver
104,ML_1635,44,Mortal Kombat,3.0,Action,Adventure,Science Fiction,1990's,,US,,,,Paul W. S. Anderson,Frank Welker
132,ML_1635,110,Braveheart,2.0,Action,Biography,Adventure,1990's,Paramount Pictures,GB,,,,Mel Gibson,Mel Gibson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249584,ML_137324,2338,I Still Know What You Did Last Summer,2.5,Horror,Mystery,Suspense,1990's,Columbia Tristar,,,,,Danny Cannon,Jack Black
249592,ML_137324,3155,Anna and the King,1.0,Drama,Comedy,Romance,1990's,Twentieth Century Fox,US,,,,Andy Tennant,Jodie Foster
249596,ML_137662,379,Timecop,2.0,Action,Romance,Science Fiction,1990's,,US,,,,Peter Hyams,Jean-Claude Van Damme
249620,ML_137662,3247,Sister Act,3.5,Crime,Music,Comedy,1990's,,US,Paul Rudnick,,,Emile Ardolino,Harvey Keitel


In [72]:
tmdb_ml_left_join_else_data[tmdb_ml_left_join_else_data.origin_country == "US"]

Unnamed: 0,userId,movieId,title,rating,genres_ml,genres_mt,genres_ym,release_year,distributor,origin_country,Writer,Original Story,Original Film Writer,Director,cast
0,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Tom Hanks
1,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Annie Potts
2,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Tom Hanks
3,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Annie Potts
4,ML_1635,1,Toy Story,3.0,Adventure,Animation,Family,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Tom Hanks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249617,ML_137662,379,Timecop,2.0,Thriller,Romance,Science Fiction,1990's,,US,,,,Peter Hyams,Bruce McGill
249618,ML_137662,379,Timecop,2.0,Thriller,Romance,Fantasy,1990's,,US,,,,Peter Hyams,Jean-Claude Van Damme
249619,ML_137662,379,Timecop,2.0,Thriller,Romance,Fantasy,1990's,,US,,,,Peter Hyams,Bruce McGill
249620,ML_137662,3247,Sister Act,3.5,Crime,Music,Comedy,1990's,,US,Paul Rudnick,,,Emile Ardolino,Harvey Keitel


In [73]:
sample_movie_list = ["Toy Story", "X-Men", "Mission: Impossible", "Schindler's List", "Forrest Gump"]

In [74]:
tmdb_ml_left_join_else_data[tmdb_ml_left_join_else_data.title.isin(sample_movie_list)][["movieId", "title", "genres_ml", "genres_mt", "genres_ym"]].drop_duplicates()

Unnamed: 0,movieId,title,genres_ml,genres_mt,genres_ym
0,1,Toy Story,Adventure,Animation,Comedy
2,1,Toy Story,Adventure,Animation,Kids
4,1,Toy Story,Adventure,Animation,Family
6,1,Toy Story,Adventure,Comedy,Kids
8,1,Toy Story,Adventure,Comedy,Family
10,1,Toy Story,Adventure,Family,Kids
12,1,Toy Story,Adventure,Fantasy,Comedy
14,1,Toy Story,Adventure,Fantasy,Kids
16,1,Toy Story,Adventure,Fantasy,Family
18,1,Toy Story,Animation,Comedy,Kids


In [138]:
sample_movie_list = ["Toy Story", "X-Men", "Mission: Impossible", "Forrest Gump"]
final_sample = tmdb_ml_left_join_else_data[tmdb_ml_left_join_else_data.title.isin(sample_movie_list)]

In [139]:
ratio = 0.02

In [140]:
uid_list = list(final_sample.userId.unique())

In [141]:
sample_uid_list = random.sample(uid_list, round(len(uid_list) * ratio))
sample_uid_list

['ML_113198', 'ML_8135', 'ML_120660', 'ML_128425']

In [142]:
final_sample

Unnamed: 0,userId,movieId,title,rating,genres_ml,genres_mt,genres_ym,release_year,distributor,origin_country,Writer,Original Story,Original Film Writer,Director,cast
0,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Tom Hanks
1,ML_1635,1,Toy Story,3.0,Adventure,Animation,Comedy,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Annie Potts
2,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Tom Hanks
3,ML_1635,1,Toy Story,3.0,Adventure,Animation,Kids,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Annie Potts
4,ML_1635,1,Toy Story,3.0,Adventure,Animation,Family,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Tom Hanks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249495,ML_137324,1,Toy Story,5.0,Comedy,Fantasy,Kids,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Annie Potts
249496,ML_137324,1,Toy Story,5.0,Comedy,Fantasy,Family,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Tom Hanks
249497,ML_137324,1,Toy Story,5.0,Comedy,Fantasy,Family,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Annie Potts
249498,ML_137324,1,Toy Story,5.0,Fantasy,Family,Kids,1990's,Buena Vista Distribution Compa,US,,Pete Docter,,John Lasseter,Tom Hanks


In [143]:
final_sample = final_sample[final_sample['userId'].isin(sample_uid_list)]
final_sample.to_csv("./data/final_sample_tmdb_ml_left_join_else_data.csv", index=False)