In [1]:
# Import Dependencies
import json
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine
import psycopg2
from config import postgres_pwd
import time

In [41]:
# Clean Movie
# Infobox keys that hold a title in another language / transliteration
alt_langs = [
    'Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French',
    'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally',
    'Mandarin', 'McCune–Reischauer', 'Original title', 'Polish',
    'Revised Romanization', 'Romanized', 'Russian', 'Simplified',
    'Traditional', 'Yiddish',
]


def clean_movie(x):
    """Return a cleaned copy of one movie record.

    Every key listed in ``alt_langs`` is moved into a single nested
    ``'Alternate Title'`` dict, and synonymous keys are renamed to one
    canonical key. The input mapping is copied, not modified.
    """
    movie = dict(x)

    # Gather all alternate-language titles under one key
    alt_titles = {lang: movie.pop(lang) for lang in alt_langs if lang in movie}
    if alt_titles:
        movie['Alternate Title'] = alt_titles

    # (old key, canonical key) pairs, applied in order; when several old keys
    # map to the same canonical key, the last one present wins.
    renames = (
        ('Adaptation by', 'Writer(s)'),
        ('Written by', 'Writer(s)'),
        ('Screen story by', 'Writer(s)'),
        ('Screenplay by', 'Writer(s)'),
        ('Story by', 'Writer(s)'),
        ('Country of origin', 'Country'),
        ('Directed by', 'Director'),
        ('Distributed by', 'Distributor'),
        ('Edited by', 'Editor(s)'),
        ('Length', 'Running time'),
        ('Music by', 'Composer(s)'),
        ('Theme music composer', 'Composer(s)'),
        ('Produced by', 'Producer(s)'),
        ('Producer', 'Producer(s)'),
        ('Productioncompanies ', 'Production company(s)'),
        ('Productioncompany ', 'Production company(s)'),
        ('Original release', 'Release date'),
        ('Released', 'Release date'),
        ('Release Date', 'Release date'),
    )
    for old_key, new_key in renames:
        if old_key in movie:
            movie[new_key] = movie.pop(old_key)

    return movie

In [79]:
# Extract Raw Data and Transform Wiki Data
def extract_transform(wiki,kaggle,ratings):
    """Load the three raw files and clean the Wikipedia movie records.

    The Wikipedia records are filtered, run through ``clean_movie`` and loaded
    into a DataFrame. An IMDb id, box office, budget, release date and running
    time are then parsed into new columns, and the raw 'Box office', 'Budget',
    'Release date' and 'Running time' columns are dropped. The two CSV files
    are read without further processing.

    Parameters
    ----------
    wiki : str or path-like
        Path to the Wikipedia JSON file (a JSON array of movie records).
    kaggle : str or path-like
        Path to the Kaggle metadata CSV.
    ratings : str or path-like
        Path to the ratings CSV.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(wiki_df, kaggle_df, ratings_df)``.
    """

    def join_list(value):
        # Some fields arrive as a list of fragments; flatten them to one string.
        return ' '.join(value) if isinstance(value, list) else value

    def parse_dollars(x):
        # Convert an extracted dollar string to a float; NaN if unrecognised.
        if type(x) != str:
            return np.nan
        if re.match(r'\$\s*\d+\.?\d*\s*milli?on', x, flags=re.IGNORECASE):
            return float(re.sub(r'\$|[a-zA-Z]|\s', '', x)) * 10**6
        if re.match(r'\$\s*\d+\.?\d*\s*billi?on', x, flags=re.IGNORECASE):
            return float(re.sub(r'\$|[a-zA-Z]|\s', '', x)) * 10**9
        if re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)', x, flags=re.IGNORECASE):
            # The pattern accepts both ',' and '.' as thousands separators, so
            # both must be stripped; leaving '.' in turns "$1.234.567" into an
            # invalid float and "$1.234" into 1.234.
            return float(re.sub(r'\$|\s|[,\.]', '', x))
        return np.nan

    # Extract Wiki Data from Json onto a DataFrame.
    # Use the `wiki` argument (not a notebook global) so the function works
    # for any path the caller passes.
    with open(wiki, 'r', encoding='utf-8') as wiki_json:
        wiki_raw = json.load(wiki_json)

    # Keep records that have a director and an IMDb link; records that list
    # an episode count are skipped.
    wiki_movies = [
        entry for entry in wiki_raw
        if ('Director' in entry or 'Directed by' in entry)
        and 'imdb_link' in entry
        and 'No. of episodes' not in entry
    ]
    wiki_df = pd.DataFrame([clean_movie(entry) for entry in wiki_movies])

    # Extract IMDB ID and drop rows that share one.
    # Best effort, as before: report the failure and carry on, but name the
    # actual error and let KeyboardInterrupt/SystemExit propagate.
    try:
        wiki_df['imdb_id'] = wiki_df['imdb_link'].str.extract(r'(tt\d{7})', expand=False)
        wiki_df = wiki_df.drop_duplicates(subset='imdb_id')
    except Exception as err:
        print(f'Error: could not extract IMDb ids from imdb_link ({err!r})')

    # Keep only columns that are non-null in at least 10% of rows.
    # .copy() so the column assignments below act on an independent frame.
    wiki_columns_to_keep = [col for col in wiki_df if wiki_df[col].count() >= (len(wiki_df) * 0.1)]
    wiki_df = wiki_df[wiki_columns_to_keep].copy()

    # Clean Box Office
    box_off = wiki_df['Box office'].dropna().apply(join_list)
    # Collapse "$a-$b" style ranges so only the upper figure remains
    box_off = box_off.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)
    box_worded = r'\$\s*\d+\.?\d*\s*[mb]illi?on'
    box_digits = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)'
    wiki_df['box_office'] = (
        box_off.str.extract(f'({box_worded}|{box_digits})', flags=re.IGNORECASE)[0]
        .apply(parse_dollars)
    )
    wiki_df = wiki_df.drop(columns='Box office')

    # Clean Budget
    bud = wiki_df['Budget'].dropna().apply(join_list)
    # Strip citation markers such as "[3]"; regex=True is required because
    # newer pandas treats the pattern as a literal string by default.
    bud = bud.str.replace(r'\[\d+\]\s*', '', regex=True)
    bud = bud.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)
    bud_worded = r'\$\s*\d+\.?\d*\s*mill?i?on'
    bud_digits = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\smillion)'
    wiki_df['budget'] = (
        bud.str.extract(f'({bud_worded}|{bud_digits})', flags=re.IGNORECASE)[0]
        .apply(parse_dollars)
    )
    wiki_df = wiki_df.drop(columns='Budget')

    # Clean Release Date
    rel_dt = wiki_df['Release date'].dropna().apply(join_list)
    month = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)'
    date_month_day_year = month + r'\s\d{1,2}\,\s\d{4}'
    date_day_month_year = r'(?:\d{1,2}\s)?' + month + r' \d{4}'
    date_month_year = r'(' + month + r',\s\d{4})'
    date_numeric = r'\d{4}.[01]\d.[0123]\d'
    date_year_only = r'(^\d{4})'
    date_pattern = (
        f'({date_month_day_year}|{date_day_month_year}|{date_month_year}'
        f'|{date_numeric}|{date_year_only})'
    )
    # NOTE(review): the extracted strings are in several formats; depending on
    # the installed pandas version to_datetime may need format='mixed' — confirm.
    wiki_df['release_date'] = pd.to_datetime(
        rel_dt.str.extract(date_pattern, flags=re.IGNORECASE)[0]
    )
    wiki_df = wiki_df.drop(columns='Release date')

    # Clean Run Time
    run = wiki_df['Running time'].dropna().apply(join_list)
    hours_minutes = r'(\d+)\s*ho?u?r?s?\s*(\d*)'
    minutes_only = r'^(\d+)\s*m?'
    # Columns: 0 = whole match, 1 = hours, 2 = minutes after hours, 3 = plain minutes
    run_parts = run.str.extract(f'({hours_minutes}|{minutes_only})', flags=re.IGNORECASE)
    run_parts = run_parts.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)
    # Prefer the plain-minutes capture; otherwise combine hours and minutes.
    # Selecting on column 3 (not column 2) keeps "2 hours" from collapsing to 0.
    wiki_df['running_time'] = run_parts[3].where(
        run_parts[3] != 0, run_parts[1] * 60 + run_parts[2]
    )
    wiki_df = wiki_df.drop(columns='Running time')

    # Extract Kaggle and Ratings Data from the paths passed in
    kaggle_df = pd.read_csv(kaggle)
    ratings_df = pd.read_csv(ratings)

    return wiki_df, kaggle_df, ratings_df

In [80]:
# Locations of the three raw input files, all inside one data directory
file_dir = '../Data/'
wiki_file, kaggle_file, ratings_file = (
    file_dir + file_name
    for file_name in ('wikipedia-movies.json', 'movies_metadata.csv', 'ratings.csv')
)

In [81]:
# Run the extract/transform step and keep the three DataFrames it returns
wiki_df, kaggle_df, ratings_df = extract_transform(
    wiki=wiki_file,
    kaggle=kaggle_file,
    ratings=ratings_file,
)

  if (await self.run_code(code, result,  async_=asy)):


In [83]:
# Preview the first five rows of the cleaned Wikipedia DataFrame
wiki_df.head(n=5)

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Cinematography,Country,Language,Writer(s),...,Distributor,Editor(s),Composer(s),Producer(s),Production company(s),imdb_id,box_office,budget,release_date,running_time
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990.0,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",Oliver Wood,United States,English,"[David Arnott, James Cappe]",...,20th Century Fox,Michael Tronick,"[Cliff Eidelman, Yello]","[Steve Perry, Joel Silver]",Silver Pictures,tt0098987,21400000.0,20000000.0,1990-07-11,102.0
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990.0,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",Mark Plummer,United States,English,"[James Foley, Robert Redlin]",...,Avenue Pictures,Howard E. Smith,Maurice Jarre,"[Ric Kidney, Robert Redlin]",Avenue Pictures,tt0098994,2700000.0,6000000.0,1990-05-17,114.0
2,https://en.wikipedia.org/wiki/Air_America_(film),1990.0,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",Roger Deakins,United States,"[English, Lao]","[John Eskow, Richard Rush]",...,TriStar Pictures,"[John Bloom, Lois Freeman-Fox]",Charles Gross,Daniel Melnick,"[Carolco Pictures, IndieProd Company]",tt0099005,57718089.0,35000000.0,1990-08-10,113.0
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990.0,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",Carlo Di Palma,United States,English,Woody Allen,...,Orion Pictures,Susan E. Morse,,Robert Greenhut,,tt0099012,7331647.0,12000000.0,1990-12-25,106.0
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990.0,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",Russell Boyd,US,English,Paul Hogan,...,Paramount Pictures,David Stiven,Maurice Jarre,John Cornell,,tt0099018,6939946.0,25000000.0,1990-12-19,95.0


In [85]:
# List the column names that survived cleaning
list(wiki_df.columns)

['url',
 'year',
 'imdb_link',
 'title',
 'Based on',
 'Starring',
 'Cinematography',
 'Country',
 'Language',
 'Writer(s)',
 'Director',
 'Distributor',
 'Editor(s)',
 'Composer(s)',
 'Producer(s)',
 'Production company(s)',
 'imdb_id',
 'box_office',
 'budget',
 'release_date',
 'running_time']