# Final Data Table
This notebook merges all the tables of the cine_ethics dataset.

## Import Statements

In [1]:
from google.cloud import storage
import os
import pandas as pd
import re
from io import StringIO

## The Data

### MPST Full Dataset

In [3]:
mpst_full_data_df = pd.read_csv("../raw_data/data_mpst_full_data.csv", sep=";", skiprows=1)

# Make titles lower case
mpst_full_data_df["title"] = mpst_full_data_df["title"].apply(lambda x: x.lower())

mpst_full_data_df.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source
0,tt0057603,i tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb
1,tt1733125,dungeons & dragons: the book of vile darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
2,tt0033045,the shop around the corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
3,tt0113862,mr. holland's opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
4,tt0086250,scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb


In [4]:
mpst_full_data_df.shape

(14828, 6)

### Screengrabs Dataset

In [2]:
!gcloud auth login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=A4l6MD7yHrQaJiJiU2T402475djrtj&access_type=offline&code_challenge=A-WxI5WWPyo8xmEwKeJgD7ArhW40fAQ8h6fywo5Y80I&code_challenge_method=S256


You are now logged in as [w.m.mulaudzi@gmail.com].
Your current project is [ornate-lens-411311].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


In [3]:
# Log into the cine_ethics project
project_id = "ornate-lens-411311"
client = storage.Client(project=project_id)

bucket_name = "cine_ethics"
bucket = client.get_bucket(bucket_name)

In [40]:
# Path to the movie on the bucket
movies_path = "data/resized_frames/"

In [8]:
# Get blobs within the movie folder
blobs = bucket.list_blobs(prefix=movies_path)

# Dictionary to store movie titles and paths
movie_info = {}

for blob in blobs:
    # Get movie name in format title (year)
    movie_name_year_split = os.path.dirname(blob.name).split("/")
    movie_name_year = movie_name_year_split[2]

    # Strip year from title
    movie_name = re.sub(r"\s*\(\d+\)", "", movie_name_year).lower()

    # Add movie title and path to the dictionary
    movie_info[movie_name] = "/".join(movie_name_year_split)

print(len(movie_info))

802


In [9]:
screengrabs_df = pd.DataFrame.from_dict(movie_info, orient='index', columns=["title"])

# Reset index to make titles a column
screengrabs_df.reset_index(inplace=True)
screengrabs_df.columns = ["title", "paths"]

screengrabs_df.head()

Unnamed: 0,title,paths
0,10 things i hate about you,data/resized_frames/10 Things I Hate About You...
1,12 monkeys,data/resized_frames/12 Monkeys (1995)
2,12 years a slave,data/resized_frames/12 Years a Slave (2013)
3,127 hours,data/resized_frames/127 Hours (2010)
4,13 hours the secret soldiers of benghazi,data/resized_frames/13 Hours The Secret Soldie...


### Merging The Two

In [10]:
synopsis_screengrabs_df = pd.merge(screengrabs_df, mpst_full_data_df, how='left', on="title")
synopsis_screengrabs_df.head()

Unnamed: 0,title,paths,imdb_id,plot_synopsis,tags,split,synopsis_source
0,10 things i hate about you,data/resized_frames/10 Things I Hate About You...,tt0147800,"Cameron James (Joseph Gordon-Levitt), the new ...","comedy, gothic, adult comedy, clever, romantic...",train,imdb
1,12 monkeys,data/resized_frames/12 Monkeys (1995),tt3148266,A deadly virus released in 1996 wipes out almo...,murder,train,wikipedia
2,12 years a slave,data/resized_frames/12 Years a Slave (2013),tt2024544,The movie opens with a group of slaves receivi...,"dramatic, boring, historical",test,imdb
3,127 hours,data/resized_frames/127 Hours (2010),tt1542344,Mountaineer and adventurer Aron Ralston begins...,"dramatic, suspenseful, psychological, flashback",test,wikipedia
4,13 hours the secret soldiers of benghazi,data/resized_frames/13 Hours The Secret Soldie...,,,,,


In [11]:
synopsis_screengrabs_df.shape

(915, 7)

## Cleaning the Merged Table

In [12]:
# Check percentage of missing values
synopsis_screengrabs_df.isna().sum() / len(synopsis_screengrabs_df)

title              0.000000
paths              0.000000
imdb_id            0.285246
plot_synopsis      0.285246
tags               0.285246
split              0.285246
synopsis_source    0.285246
dtype: float64

In [13]:
synopsis_screengrabs_df.dropna(subset="plot_synopsis", inplace=True)

In [14]:
# Check percentage of missing values
synopsis_screengrabs_df.isna().sum() / len(synopsis_screengrabs_df)

title              0.0
paths              0.0
imdb_id            0.0
plot_synopsis      0.0
tags               0.0
split              0.0
synopsis_source    0.0
dtype: float64

In [21]:
synopsis_screengrabs_df.drop_duplicates(subset="title", inplace=True)

In [22]:
synopsis_screengrabs_df.shape

(541, 7)

In [23]:
synopsis_screengrabs_df.to_csv("../raw_data/synopsis_screengrabs_final_table.csv", index=False)

## Adding Genres

In [4]:
# Load synopsis_screengrabs_df
synopsis_screengrabs_df = pd.read_csv("../raw_data/synopsis_screengrabs_final_table.csv")

In [5]:
genres_path = "data/movie-genre-prediction/"

In [6]:
# Load from google storage
# Get blobs within the genres folder
blobs = bucket.list_blobs(prefix=genres_path)

dataframes = []

for blob in blobs:
    if os.path.basename(blob.name) == "train.tsv":
        download = blob.download_as_text()

        # Convert the text content to a StringIO object
        csv_data = StringIO(download)

        dataframes.append(pd.read_csv(csv_data, sep='\t'))

In [7]:
train_df = dataframes[0]
print("Shape of dataframe:", train_df.shape)
train_df.head()

Shape of dataframe: (54000, 4)


Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [8]:
train_df["genre"].value_counts()

genre
fantasy      5400
horror       5400
family       5400
scifi        5400
action       5400
crime        5400
adventure    5400
mystery      5400
romance      5400
thriller     5400
Name: count, dtype: int64

In [9]:
# Rename the columns
train_df.rename(columns={"id": "imdb_id", "movie_name": "title"}, inplace=True)

In [10]:
# Map the tt string to the imbd_id
train_df["imdb_id"] = train_df["imdb_id"].map(lambda x: "tt"+str(x))
train_df.head()

Unnamed: 0,imdb_id,title,synopsis,genre
0,tt44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,tt50185,Entity Project,A director and her friends renting a haunted h...,horror
2,tt34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,tt78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,tt2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [11]:
train_df["title"] = train_df["title"].map(lambda x: x.lower())
train_df.head()

Unnamed: 0,imdb_id,title,synopsis,genre
0,tt44978,super me,A young scriptwriter starts bringing valuable ...,fantasy
1,tt50185,entity project,A director and her friends renting a haunted h...,horror
2,tt34131,behavioral family therapy for serious psychiat...,This is an educational video for families and ...,family
3,tt78522,blood glacier,Scientists working in the Austrian Alps discov...,scifi
4,tt2206,apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [12]:
final_table_df = synopsis_screengrabs_df.merge(train_df, how="left", on="title")
final_table_df.head()

Unnamed: 0,title,paths,imdb_id_x,plot_synopsis,tags,split,synopsis_source,imdb_id_y,synopsis,genre
0,10 things i hate about you,data/resized_frames/10 Things I Hate About You...,tt0147800,"Cameron James (Joseph Gordon-Levitt), the new ...","comedy, gothic, adult comedy, clever, romantic...",train,imdb,,,
1,12 monkeys,data/resized_frames/12 Monkeys (1995),tt3148266,A deadly virus released in 1996 wipes out almo...,murder,train,wikipedia,tt75933,"In a future world devastated by disease, a con...",scifi
2,12 years a slave,data/resized_frames/12 Years a Slave (2013),tt2024544,The movie opens with a group of slaves receivi...,"dramatic, boring, historical",test,imdb,,,
3,127 hours,data/resized_frames/127 Hours (2010),tt1542344,Mountaineer and adventurer Aron Ralston begins...,"dramatic, suspenseful, psychological, flashback",test,wikipedia,,,
4,21 grams,data/resized_frames/21 Grams (2003),tt0315733,The story is told in a non-linear manner. The ...,"suspenseful, boring, murder, realism, depressi...",test,wikipedia,tt18010,A freak accident brings together a critically ...,crime


In [13]:
final_table_df.drop(columns=["synopsis", "imdb_id_y", "tags", "split", "synopsis_source"], inplace=True)

In [14]:
final_table_df.rename(columns={"imdb_id_x": "imdb_id"}, inplace=True)

In [15]:
final_table_df.head()

Unnamed: 0,title,paths,imdb_id,plot_synopsis,genre
0,10 things i hate about you,data/resized_frames/10 Things I Hate About You...,tt0147800,"Cameron James (Joseph Gordon-Levitt), the new ...",
1,12 monkeys,data/resized_frames/12 Monkeys (1995),tt3148266,A deadly virus released in 1996 wipes out almo...,scifi
2,12 years a slave,data/resized_frames/12 Years a Slave (2013),tt2024544,The movie opens with a group of slaves receivi...,
3,127 hours,data/resized_frames/127 Hours (2010),tt1542344,Mountaineer and adventurer Aron Ralston begins...,
4,21 grams,data/resized_frames/21 Grams (2003),tt0315733,The story is told in a non-linear manner. The ...,crime


In [16]:
final_table_df.isna().sum() / len(final_table_df)

title            0.000000
paths            0.000000
imdb_id          0.000000
plot_synopsis    0.000000
genre            0.401408
dtype: float64

In [17]:
final_table_df.shape

(710, 5)

In [18]:
final_table_df.dropna(inplace=True)

In [19]:
final_table_df.drop_duplicates(subset=["title"], keep="first", inplace=True)

In [20]:
final_table_df.shape

(256, 5)

In [21]:
final_table_df.to_csv("../raw_data/genre_synopsis_screengrabs_table.csv", index=False)

## Adding Movie Posters

In [4]:
posters_path = "data/SampleMoviePosters/"

In [5]:
# Load from google storage
# Get blobs within the genres folder
blobs = bucket.list_blobs(prefix=posters_path)

for blob in blobs:
    if os.path.basename(blob.name) == "MovieGenre.csv":
        # Load contents of blobs as text
        download = blob.download_as_text(encoding='latin1')

        # Convert the text content to a StringIO object
        csv_data = StringIO(download)

        posters_df = pd.read_csv(csv_data)

posters_df.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [6]:
posters_df.drop(columns=["Imdb Link", "Genre"], inplace=True)
posters_df.rename(columns={"imdbId": "imdb_id", "Imdb Link": "link", "IMDB Score": "rating",
                           "Poster": "poster_link", "Title": "title"}, inplace=True)
posters_df.head()

Unnamed: 0,imdb_id,title,rating,poster_link
0,114709,Toy Story (1995),8.3,https://images-na.ssl-images-amazon.com/images...
1,113497,Jumanji (1995),6.9,https://images-na.ssl-images-amazon.com/images...
2,113228,Grumpier Old Men (1995),6.6,https://images-na.ssl-images-amazon.com/images...
3,114885,Waiting to Exhale (1995),5.7,https://images-na.ssl-images-amazon.com/images...
4,113041,Father of the Bride Part II (1995),5.9,https://images-na.ssl-images-amazon.com/images...


In [7]:
posters_df["imdb_id"] = posters_df["imdb_id"].map(lambda x: "tt"+str(x))
posters_df.head()

Unnamed: 0,imdb_id,title,rating,poster_link
0,tt114709,Toy Story (1995),8.3,https://images-na.ssl-images-amazon.com/images...
1,tt113497,Jumanji (1995),6.9,https://images-na.ssl-images-amazon.com/images...
2,tt113228,Grumpier Old Men (1995),6.6,https://images-na.ssl-images-amazon.com/images...
3,tt114885,Waiting to Exhale (1995),5.7,https://images-na.ssl-images-amazon.com/images...
4,tt113041,Father of the Bride Part II (1995),5.9,https://images-na.ssl-images-amazon.com/images...


In [8]:
posters_df["title"] = posters_df["title"].map(lambda x: re.sub(r"\s*\(\d+\)", "", x).lower())
posters_df.head()

Unnamed: 0,imdb_id,title,rating,poster_link
0,tt114709,toy story,8.3,https://images-na.ssl-images-amazon.com/images...
1,tt113497,jumanji,6.9,https://images-na.ssl-images-amazon.com/images...
2,tt113228,grumpier old men,6.6,https://images-na.ssl-images-amazon.com/images...
3,tt114885,waiting to exhale,5.7,https://images-na.ssl-images-amazon.com/images...
4,tt113041,father of the bride part ii,5.9,https://images-na.ssl-images-amazon.com/images...


In [9]:
# Merging
genre_synopsis_screengrabs_table_df = pd.read_csv("../raw_data/genre_synopsis_screengrabs_table.csv")

In [12]:
full_table_df = genre_synopsis_screengrabs_table_df.merge(posters_df, on="title", how="left")
print(full_table_df.shape)
full_table_df.head()

(343, 8)


Unnamed: 0,title,paths,imdb_id_x,plot_synopsis,genre,imdb_id_y,rating,poster_link
0,12 monkeys,data/resized_frames/12 Monkeys (1995),tt3148266,A deadly virus released in 1996 wipes out almo...,scifi,,,
1,21 grams,data/resized_frames/21 Grams (2003),tt0315733,The story is told in a non-linear manner. The ...,crime,tt315733,7.7,https://images-na.ssl-images-amazon.com/images...
2,a few good men,data/resized_frames/A Few Good Men (1992),tt0104257,"Late one evening, at the U.S. Naval Base in Gu...",thriller,tt104257,7.6,https://images-na.ssl-images-amazon.com/images...
3,a history of violence,data/resized_frames/A History of Violence (2005),tt0399146,Leland Jones and William 'Billy' Orser (Stephe...,thriller,tt399146,7.5,https://images-na.ssl-images-amazon.com/images...
4,a scanner darkly,data/resized_frames/A Scanner Darkly (2006),tt0405296,"In the future ""seven years from now,"" America ...",scifi,tt405296,7.1,https://images-na.ssl-images-amazon.com/images...


In [13]:
full_table_df.isna().sum() / len(full_table_df)

title            0.000000
paths            0.000000
imdb_id_x        0.000000
plot_synopsis    0.000000
genre            0.000000
imdb_id_y        0.020408
rating           0.020408
poster_link      0.020408
dtype: float64

In [16]:
full_table_df.drop(columns=["imdb_id_y"], inplace=True)
full_table_df.dropna(subset=["poster_link"], inplace=True)
full_table_df.head()

Unnamed: 0,title,paths,imdb_id_x,plot_synopsis,genre,rating,poster_link
1,21 grams,data/resized_frames/21 Grams (2003),tt0315733,The story is told in a non-linear manner. The ...,crime,7.7,https://images-na.ssl-images-amazon.com/images...
2,a few good men,data/resized_frames/A Few Good Men (1992),tt0104257,"Late one evening, at the U.S. Naval Base in Gu...",thriller,7.6,https://images-na.ssl-images-amazon.com/images...
3,a history of violence,data/resized_frames/A History of Violence (2005),tt0399146,Leland Jones and William 'Billy' Orser (Stephe...,thriller,7.5,https://images-na.ssl-images-amazon.com/images...
4,a scanner darkly,data/resized_frames/A Scanner Darkly (2006),tt0405296,"In the future ""seven years from now,"" America ...",scifi,7.1,https://images-na.ssl-images-amazon.com/images...
5,a single man,data/resized_frames/A Single Man (2009),tt1315981,George Falconer (Colin Firth) approaches a car...,crime,7.6,https://images-na.ssl-images-amazon.com/images...


In [21]:
full_table_df.rename(columns={"imdb_id_x": "imdb_id"}, inplace=True)

In [17]:
# Do final checks
full_table_df.duplicated().sum()

7

In [18]:
full_table_df.drop_duplicates(inplace=True)

In [19]:
full_table_df.isna().sum() / len(full_table_df)

title            0.0
paths            0.0
imdb_id_x        0.0
plot_synopsis    0.0
genre            0.0
rating           0.0
poster_link      0.0
dtype: float64

In [23]:
full_table_df.shape

(329, 7)

In [24]:
full_table_df.to_csv("../raw_data/poster_genre_synopsis_screengrabs.csv")