# Final Data Table
This notebook merges the two main tables of the cine_ethics dataset.

## Import Statements

In [2]:
from google.cloud import storage
import os
import pandas as pd
import re

## The Data

### MPST Full Dataset

In [3]:
# The MPST export is semicolon-delimited; skiprows=1 skips the file's first
# line so that the line after it is read as the header row
mpst_full_data_df = pd.read_csv("../raw_data/data_mpst_full_data.csv", sep=";", skiprows=1)

# Make titles lower case.
# The vectorised .str accessor leaves a missing title as NaN, whereas calling
# .lower() on each value raises AttributeError as soon as one title is NaN.
mpst_full_data_df["title"] = mpst_full_data_df["title"].str.lower()

mpst_full_data_df.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source
0,tt0057603,i tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb
1,tt1733125,dungeons & dragons: the book of vile darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
2,tt0033045,the shop around the corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
3,tt0113862,mr. holland's opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
4,tt0086250,scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb


In [4]:
# (rows, columns) of the MPST table after loading
mpst_full_data_df.shape

(14828, 6)

### Screengrabs Dataset

In [5]:
!gcloud auth login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=mqt56cH7B3DPsVF275QinluZVVCAHp&access_type=offline&code_challenge=WDyWFneJEsBFjx0ComGBqsqCxPrZonTqtelsrWJWzZg&code_challenge_method=S256


You are now logged in as [w.m.mulaudzi@gmail.com].
Your current project is [ornate-lens-411311].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


In [6]:
# Identifiers of the cine_ethics Google Cloud project and its storage bucket
project_id = "ornate-lens-411311"
bucket_name = "cine_ethics"

# Open a Storage client scoped to the project, then fetch the bucket handle
client = storage.Client(project=project_id)
bucket = client.get_bucket(bucket_name)

In [7]:
# Object-name prefix on the bucket under which the resized movie frames are stored
movies_path = "data/resized_frames/"

In [8]:
# Get blobs within the movie folder
blobs = bucket.list_blobs(prefix=movies_path)

# Dictionary mapping lower-cased movie title (year removed) -> movie folder path
movie_info = {}

# Position of the "<Title (Year)>" folder inside a split object name,
# i.e. the number of non-empty components in the prefix (2 for "data/resized_frames/")
movie_folder_index = len([part for part in movies_path.split("/") if part])

for blob in blobs:
    # Object names look like "<prefix>/<Title (Year)>/<frame file>"
    movie_name_year_split = os.path.dirname(blob.name).split("/")

    # Objects sitting directly under the prefix (e.g. a folder placeholder)
    # have no movie folder component; skip them instead of raising IndexError
    if len(movie_name_year_split) <= movie_folder_index:
        continue

    # Get movie name in format title (year)
    movie_name_year = movie_name_year_split[movie_folder_index]

    # Strip year from title
    movie_name = re.sub(r"\s*\(\d+\)", "", movie_name_year).lower()

    # Store the path up to and including the movie folder, so that frames kept
    # in a deeper sub-folder cannot change the recorded path.
    # NOTE: two folders whose titles differ only by year share one key here;
    # the one listed last wins.
    movie_info[movie_name] = "/".join(movie_name_year_split[: movie_folder_index + 1])

print(len(movie_info))

802


In [9]:
# One row per movie: the lower-cased title and the bucket path of its frame folder,
# in the same order in which the entries were added to movie_info
screengrabs_df = pd.DataFrame(list(movie_info.items()), columns=["title", "paths"])

screengrabs_df.head()

Unnamed: 0,title,paths
0,10 things i hate about you,data/resized_frames/10 Things I Hate About You...
1,12 monkeys,data/resized_frames/12 Monkeys (1995)
2,12 years a slave,data/resized_frames/12 Years a Slave (2013)
3,127 hours,data/resized_frames/127 Hours (2010)
4,13 hours the secret soldiers of benghazi,data/resized_frames/13 Hours The Secret Soldie...


### Merging The Two

In [10]:
# Left join: every screengrab row is kept and receives the MPST columns of the
# rows with the same title. Titles without a match get NaN in the MPST columns,
# and a title occurring several times in the MPST table yields one row per match.
synopsis_screengrabs_df = screengrabs_df.merge(mpst_full_data_df, on="title", how="left")
synopsis_screengrabs_df.head()

Unnamed: 0,title,paths,imdb_id,plot_synopsis,tags,split,synopsis_source
0,10 things i hate about you,data/resized_frames/10 Things I Hate About You...,tt0147800,"Cameron James (Joseph Gordon-Levitt), the new ...","comedy, gothic, adult comedy, clever, romantic...",train,imdb
1,12 monkeys,data/resized_frames/12 Monkeys (1995),tt3148266,A deadly virus released in 1996 wipes out almo...,murder,train,wikipedia
2,12 years a slave,data/resized_frames/12 Years a Slave (2013),tt2024544,The movie opens with a group of slaves receivi...,"dramatic, boring, historical",test,imdb
3,127 hours,data/resized_frames/127 Hours (2010),tt1542344,Mountaineer and adventurer Aron Ralston begins...,"dramatic, suspenseful, psychological, flashback",test,wikipedia
4,13 hours the secret soldiers of benghazi,data/resized_frames/13 Hours The Secret Soldie...,,,,,


In [11]:
# (rows, columns) of the merged table
synopsis_screengrabs_df.shape

(915, 7)

## Cleaning the Merged Table

In [12]:
# Fraction of missing values in each column (mean of the boolean NaN mask)
synopsis_screengrabs_df.isna().mean()

title              0.000000
paths              0.000000
imdb_id            0.285246
plot_synopsis      0.285246
tags               0.285246
split              0.285246
synopsis_source    0.285246
dtype: float64

In [13]:
# Keep only the rows that have a plot synopsis.
# subset is given as a list because a bare string is only accepted by newer pandas
# releases, and the result is reassigned rather than mutating the frame in place.
synopsis_screengrabs_df = synopsis_screengrabs_df.dropna(subset=["plot_synopsis"])

In [14]:
# Fraction of missing values in each column (mean of the boolean NaN mask)
synopsis_screengrabs_df.isna().mean()

title              0.0
paths              0.0
imdb_id            0.0
plot_synopsis      0.0
tags               0.0
split              0.0
synopsis_source    0.0
dtype: float64

In [21]:
# Collapse repeated titles to a single row each; keep="first" retains the
# first occurrence in the current row order and discards the rest
synopsis_screengrabs_df = synopsis_screengrabs_df.drop_duplicates(subset="title", keep="first")

In [22]:
# (rows, columns) of the table after dropping missing synopses and duplicate titles
synopsis_screengrabs_df.shape

(541, 7)

In [23]:
# Write the final table to CSV; index=False leaves the row index out of the file
synopsis_screengrabs_df.to_csv("../raw_data/synopsis_screengrabs_final_table.csv", index=False)