# Team: Omni Oracle
## ETL - Top 250 Movies

# Step 2: Transformation Process

After extracting the data from external sources on the web, some of it may be missing or "corrupted". In this notebook, we clean the data that we have stored in the DataFrames. 

In [3]:
# to manage json data
import json

# for pandas dataframes
import pandas as pd

import math

## Helper functions

The functions in the two cells below are helpers that convert values to the right data types and format the data.  

In [7]:
def convert_string_to_integer(input_str):
    """Convert a value to ``int``, returning ``None`` when that is impossible.

    Args:
        input_str: Value to convert, typically a numeric string taken from a
            DataFrame cell.

    Returns:
        The integer value, or ``None`` if ``int()`` rejects the input.
    """
    try:
        return int(input_str)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric strings and NaN floats.
        # TypeError: None or other non-numeric objects.
        # OverflowError: infinite floats.
        return None

def convert_string_to_float(input_str):
    """Convert a value to ``float``, returning ``None`` when that is impossible.

    Args:
        input_str: Value to convert, typically a numeric string taken from a
            DataFrame cell.

    Returns:
        The float value, or ``None`` if ``float()`` rejects the input.
    """
    try:
        return float(input_str)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric strings.
        # TypeError: None or other non-numeric objects.
        # OverflowError: integers too large to be represented as a float.
        return None

def convert_string_to_list(input_str):
    """Split a ``', '``-separated string into a list of its parts.

    Args:
        input_str: The value to split.

    Returns:
        The list produced by splitting on ``', '``, or ``None`` when the
        input is not a string (for example a NaN cell).
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses, which an exact type check would wrongly reject.
    if not isinstance(input_str, str):
        return None

    return input_str.split(', ')

In [5]:
def extract_oscar_wins(awards_str):
    """Return the number of Oscars won described by an awards fragment.

    The fragment is split on whitespace and the first occurrence of each of
    the words "Won", "Oscars" and "Oscar" is removed. If exactly one token
    remains and it is an integer, that integer is returned.

    Args:
        awards_str: A fragment such as ``"Won 3 Oscars"``.

    Returns:
        The integer count, or ``0`` when the fragment does not reduce to a
        single integer token (e.g. ``"Nominated for 3 Oscars"`` or ``""``).
    """
    tokens = awards_str.split()
    for keyword in ("Won", "Oscars", "Oscar"):
        if keyword in tokens:
            tokens.remove(keyword)

    # Anything other than exactly one leftover token is not a plain
    # "Won N Oscars" phrase. This also covers the empty case, which used to
    # raise IndexError on tokens[0].
    if len(tokens) != 1:
        return 0

    try:
        return int(tokens[0])
    except ValueError:
        return 0

def _count_before(tokens, labels):
    """Return the integer token that directly precedes the first label found.

    ``labels`` are tried in order; only the first one present in ``tokens``
    is considered. Returns 0 when no label is present, when the label is the
    very first token, or when the preceding token is not an integer.
    """
    for label in labels:
        if label in tokens:
            position = tokens.index(label)
            # A label at position 0 has nothing before it; indexing with -1
            # would wrap around to the last token and report a bogus count.
            if position == 0:
                return 0
            try:
                return int(tokens[position - 1])
            except ValueError:
                return 0
    return 0


def extract_wins_and_nominations(awards_str):
    """Extract the win and nomination counts from an awards fragment.

    Args:
        awards_str: A fragment such as ``"24 wins & 37 nominations total"``.

    Returns:
        A ``(num_of_wins, num_of_nominations)`` tuple. Each count is the
        integer immediately before "win"/"wins" or
        "nomination"/"nominations", and 0 when it cannot be determined.
    """
    tokens = awards_str.split()
    num_of_wins = _count_before(tokens, ("win", "wins"))
    num_of_nominations = _count_before(tokens, ("nomination", "nominations"))
    return num_of_wins, num_of_nominations

def format_date(date_str):
    """Reformat a ``"DD Mon YYYY"`` date string as ``"YYYY-MM-DD"``.

    The whitespace-separated parts are reversed and the three-letter English
    month abbreviation is replaced by its two-digit month number.

    Args:
        date_str: A date such as ``"25 Dec 1994"``.

    Returns:
        The reformatted date string, or ``None`` when the input is not a
        string or does not contain a recognised month in the expected
        position (for example ``"N/A"`` or a NaN cell). This mirrors the
        ``None``-on-failure behaviour of the conversion helpers.
    """
    if not isinstance(date_str, str):
        return None

    parts = date_str.split()
    parts.reverse()
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    try:
        month = months.index(parts[1]) + 1
    except (IndexError, ValueError):
        # IndexError: fewer than two parts; ValueError: unknown month name.
        return None

    parts[1] = str(month).zfill(2)
    return '-'.join(parts)

## Cleaning the movie's DataFrame

The movie data that we collected has many columns, and we keep only the variables needed for the data that we store in our database. 

The columns/variables that we are taking for the movies:
1. title
2. year
3. certification
4. release_date
5. runtime
6. genre
7. description
8. language
9. country
10. directors
11. actors
12. oscars
13. winnings
14. nominations
15. ratings
16. num_of_votes
17. budget
18. revenue

In [4]:
def _is_missing_value(value):
    """Return True for ``None`` or NaN; strings are never treated as missing."""
    if value is None:
        return True
    if isinstance(value, str):
        return False
    try:
        return math.isnan(value)
    except TypeError:
        return False


def _parse_actor_names(raw_actors):
    """Turn a stringified list such as ``"['A', 'B']"`` into ``['A', 'B']``.

    Non-string values are returned unchanged so every row yields exactly one
    entry and the column length always matches the DataFrame.
    """
    if not isinstance(raw_actors, str):
        return raw_actors
    # Drop the single quotes, strip the surrounding brackets, then split.
    return raw_actors.replace('\'', '')[1:-1].split(', ')


def _parse_awards(awards):
    """Return ``(oscars, wins, nominations)`` parsed from an awards string."""
    if not isinstance(awards, str):
        return 0, 0, 0

    marker = "BAFTA Award"
    if marker in awards:
        # Keep only the text that follows the BAFTA mention.
        awards = awards[awards.find(marker) + len(marker):]

    segments = awards.split('.')

    oscars = 0
    # A blank first segment (e.g. text starting with '.') carries no Oscar
    # information, so it is not handed to the Oscar parser.
    if len(segments) > 1 and segments[0].strip():
        oscars = extract_oscar_wins(segments[0])

    # Use the last non-blank segment so that a trailing period does not hide
    # the "N wins & M nominations" part.
    tail = next((segment for segment in reversed(segments) if segment.strip()), '')
    wins, nominations = extract_wins_and_nominations(tail)
    return oscars, wins, nominations


def clean_movies_data(movies_df):
    """Select, rename and type-convert the movie columns kept for the database.

    Args:
        movies_df: DataFrame containing at least the columns "Title", "Year",
            "Rated", "Released", "Runtime", "Genre", "Plot", "Language",
            "Country", "Awards", "Director", "Actors", "imdbRating",
            "imdbVotes", "BoxOffice" and "Production".

    Returns:
        A new DataFrame with the columns id, title, year, certification,
        release_date, runtime, genre, description, language, country,
        directors, oscars, winnings, nominations, actors, ratings,
        num_of_votes, revenue and budget. Values that cannot be converted
        become ``None``.
    """
    source_columns = [
        "Title", "Year", "Rated", "Released", "Runtime", "Genre", "Plot",
        "Language", "Country", "Awards", "Director", "Actors", "imdbRating",
        "imdbVotes", "BoxOffice", "Production",
    ]
    cleaned_movie_df = movies_df.loc[:, source_columns].copy()
    cleaned_movie_df = cleaned_movie_df.rename(columns={
        "Director": "directors",
        "Rated": "certification",
        "Released": "release_date",
        "Plot": "description",
        "imdbRating": "ratings",
        "imdbVotes": "num_of_votes",
        "BoxOffice": "revenue",
        "Production": "budget",
    })
    cleaned_movie_df.columns = [col.lower() for col in cleaned_movie_df.columns]

    # Awards are parsed before the raw column is dropped.
    award_counts = [_parse_awards(awards) for awards in cleaned_movie_df["awards"]]
    oscars_list = [counts[0] for counts in award_counts]
    winnings_list = [counts[1] for counts in award_counts]
    nominations_list = [counts[2] for counts in award_counts]

    release_dates = []
    for raw_date in cleaned_movie_df["release_date"]:
        try:
            release_dates.append(format_date(raw_date))
        except (AttributeError, IndexError, ValueError):
            # Missing or malformed dates become None instead of aborting.
            release_dates.append(None)

    runtimes = []
    for raw_runtime in cleaned_movie_df["runtime"]:
        # Only the first whitespace-separated token of the runtime is used.
        tokens = raw_runtime.split() if isinstance(raw_runtime, str) else []
        runtimes.append(convert_string_to_integer(tokens[0]) if tokens else None)

    votes_list = [
        # Thousands separators are removed before the integer conversion.
        convert_string_to_integer(votes.replace(',', '') if isinstance(votes, str) else votes)
        for votes in cleaned_movie_df["num_of_votes"]
    ]

    cleaned_movie_df = cleaned_movie_df.drop(["awards"], axis=1)
    cleaned_movie_df.insert(10, "oscars", oscars_list)
    cleaned_movie_df.insert(11, "winnings", winnings_list)
    cleaned_movie_df.insert(12, "nominations", nominations_list)

    cleaned_movie_df["actors"] = [
        _parse_actor_names(raw) for raw in cleaned_movie_df["actors"]
    ]
    cleaned_movie_df["certification"] = [
        "Not Rated" if _is_missing_value(cert) else cert
        for cert in cleaned_movie_df["certification"]
    ]
    cleaned_movie_df["release_date"] = release_dates
    cleaned_movie_df["runtime"] = runtimes
    cleaned_movie_df["num_of_votes"] = votes_list

    for column in ("genre", "language", "country", "directors"):
        cleaned_movie_df[column] = [
            convert_string_to_list(value) for value in cleaned_movie_df[column]
        ]

    # Each numeric column is converted from its own values; revenue and
    # budget were previously read from each other's column and ended up
    # swapped.
    for column in ("year", "revenue", "budget"):
        cleaned_movie_df[column] = [
            convert_string_to_integer(value) for value in cleaned_movie_df[column]
        ]
    cleaned_movie_df["ratings"] = [
        convert_string_to_float(value) for value in cleaned_movie_df["ratings"]
    ]

    cleaned_movie_df.insert(0, "id", range(1, len(cleaned_movie_df) + 1))

    return cleaned_movie_df

## Cleaning the actors' DataFrame
Just like the movies' data, the actors' data that we collected has some columns, and we will be taking just the needed variables for the data that we are storing in our database.

The columns/variables that we are taking for the actors:
1. name
2. date_of_birth
3. date_of_death
4. gender
5. num_of_acting_credits

In [13]:
def clean_actors_df(actors_df):
    """Select, rename and convert the actor columns kept for the database.

    Args:
        actors_df: DataFrame containing at least the columns "name",
            "birthday", "deathday", "gender" and "num_of_acting_credits".

    Returns:
        A new DataFrame with the columns id, name, date_of_birth,
        date_of_death, gender and num_of_acting_credits. Gender codes 0, 1
        and 2 are replaced by 'None', 'Female' and 'Male'; credits that
        cannot be converted to an integer become ``None``.
    """
    kept_columns = ["name", "birthday", "deathday", "gender", "num_of_acting_credits"]
    new_names = {"birthday": "date_of_birth", "deathday": "date_of_death"}

    cleaned_actor_df = actors_df.loc[:, kept_columns]
    cleaned_actor_df.rename(columns=new_names, inplace=True)

    gender_codes = [0, 1, 2]
    gender_labels = ['None', 'Female', 'Male']
    cleaned_actor_df["gender"] = cleaned_actor_df["gender"].replace(gender_codes, gender_labels)

    # Convert the credit counts row by row; unparseable values become None.
    cleaned_actor_df["num_of_acting_credits"] = [
        convert_string_to_integer(cleaned_actor_df.loc[row, "num_of_acting_credits"])
        for row in cleaned_actor_df.index
    ]

    # Sequential 1-based identifier placed as the first column.
    row_count = len(cleaned_actor_df)
    cleaned_actor_df.insert(0, "id", range(1, row_count + 1))

    return cleaned_actor_df

## Main transformation function

We are going to call the functions above in order:
1. Transformation function for the movies DataFrame
2. Transformation function for the actors DataFrame

In the end, just like the extraction process, there will be two DataFrames, `transformed_movies_df` and `transformed_actors_df`, which store the cleaned data for the top 250 movies and the actors who acted in them.

In [15]:
def transform_data(movies_df, actors_df):
    """Run both cleaning steps and return their results.

    Args:
        movies_df: Raw movies DataFrame, passed to ``clean_movies_data``.
        actors_df: Raw actors DataFrame, passed to ``clean_actors_df``.

    Returns:
        A ``(transformed_movies_df, transformed_actors_df)`` tuple.
    """
    # Movies are cleaned first, then actors.
    return clean_movies_data(movies_df), clean_actors_df(actors_df)