In [15]:
import os
import numpy as np
import pandas as pd

In [30]:
movie_data = pd.read_csv("raw_data/movie_data.csv")

In [31]:
"""
I noticed that there were duplicate entries for certain 
movies that were released in separate origins.  This 
removes those duplicates so we can use the rotten_tomatoes_link
as the primary index of the data.
"""
duplicates = movie_data.groupby("rotten_tomatoes_link").count() \
    .loc[np.any(movie_data.groupby("rotten_tomatoes_link") \
    .count() > 1, axis=1)].index

dup_indices = movie_data.loc[(movie_data["rotten_tomatoes_link"].isin(duplicates)) & \
    (movie_data["Origin/Ethnicity"] != "American")].index

movie_data = movie_data.loc[~movie_data.index.isin(dup_indices)].reset_index(drop=True)

In [32]:
def explode_column(input_df, column_name, new_column_name):
    """
    Function that will expand the string columns that have 
    values separated by commas.
    """
    df = input_df.copy(deep=True)
    exploded_df = df[column_name].astype(str) \
        .apply(lambda x: x.split(",")).explode() \
        .to_frame().rename(columns={column_name: new_column_name})
    df = df.merge(exploded_df[[new_column_name]], left_index=True, right_index=True)
    return df

In [38]:
explode_columns = {
    "genres": "genre",
    "directors": "director",
    "authors": "author",
    "actors": "actor"
}

exploded_dfs = {}
for c in explode_columns.keys():
    exploded_dfs[c] = explode_column(movie_data, c, explode_columns[c])
for c in exploded_dfs.keys():
    movie_data = movie_data.merge(
        exploded_dfs[c][[explode_columns[c]]], left_index=True, right_index=True
    )

In [41]:
relation_dict = {
    "Title":"hasTitle",
    "director":"directedBy",
    "author":"authoredBy",
    "actor":"featuredActor",
    "genre":"hadGenre",
    "Release Year":"releasedOn",
    "production_company":"producedBy",
}
relations = []
for r in relation_dict:
    relations.append(relation_dict[r])

In [44]:
tabular_df = movie_data.rename(columns=relation_dict)[relations] \
    .drop_duplicates() \
    .reset_index(drop=True)

In [45]:
tabular_df

Unnamed: 0,hasTitle,directedBy,authoredBy,featuredActor,hadGenre,releasedOn,producedBy
0,Percy Jackson & the Olympians: The Lightning T...,Chris Columbus,Craig Titley,Logan Lerman,Action & Adventure,2010,20th Century Fox
1,Percy Jackson & the Olympians: The Lightning T...,Chris Columbus,Craig Titley,Brandon T. Jackson,Action & Adventure,2010,20th Century Fox
2,Percy Jackson & the Olympians: The Lightning T...,Chris Columbus,Craig Titley,Alexandra Daddario,Action & Adventure,2010,20th Century Fox
3,Percy Jackson & the Olympians: The Lightning T...,Chris Columbus,Craig Titley,Jake Abel,Action & Adventure,2010,20th Century Fox
4,Percy Jackson & the Olympians: The Lightning T...,Chris Columbus,Craig Titley,Sean Bean,Action & Adventure,2010,20th Century Fox
...,...,...,...,...,...,...,...
1323229,Zulu,Cyril Endfield,John Prebble,Daniel Tshabalala,Drama,1964,Paramount Pictures
1323230,Zulu,Cyril Endfield,John Prebble,Ephraim Mbhele,Drama,1964,Paramount Pictures
1323231,Zulu,Cyril Endfield,John Prebble,Simon Sabela,Drama,1964,Paramount Pictures
1323232,Zulu,Cyril Endfield,John Prebble,David Kerman,Drama,1964,Paramount Pictures
