In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
############################################
# 8.2.1
# Extract the Wikipedia Movies JSON
############################################

In [3]:
# Load the scraped Wikipedia records (a JSON array of movie objects).
# The encoding is pinned to UTF-8: the records contain non-ASCII titles
# (Arabic, for example), and without it open() falls back to the platform's
# default encoding, which is not UTF-8 everywhere (notably on Windows).
with open('wikipedia-movies.json', mode='r', encoding='utf-8') as file:
    wiki_movies_raw = json.load(file)

In [4]:
len(wiki_movies_raw)

7311

In [5]:
# To see the first five records, 
wiki_movies_raw[:5]

[{'url': 'https://en.wikipedia.org/wiki/The_Adventures_of_Ford_Fairlane',
  'year': 1990,
  'imdb_link': 'https://www.imdb.com/title/tt0098987/',
  'title': 'The Adventures of Ford Fairlane',
  'Directed by': 'Renny Harlin',
  'Produced by': ['Steve Perry', 'Joel Silver'],
  'Screenplay by': ['David Arnott', 'James Cappe', 'Daniel Waters'],
  'Story by': ['David Arnott', 'James Cappe'],
  'Based on': ['Characters', 'by Rex Weiner'],
  'Starring': ['Andrew Dice Clay',
   'Wayne Newton',
   'Priscilla Presley',
   'Lauren Holly',
   'Morris Day',
   'Robert Englund',
   "Ed O'Neill"],
  'Narrated by': 'Andrew "Dice" Clay',
  'Music by': ['Cliff Eidelman', 'Yello'],
  'Cinematography': 'Oliver Wood',
  'Edited by': 'Michael Tronick',
  'Productioncompany ': 'Silver Pictures',
  'Distributed by': '20th Century Fox',
  'Release date': ['July 11, 1990', '(', '1990-07-11', ')'],
  'Running time': '102 minutes',
  'Country': 'United States',
  'Language': 'English',
  'Budget': '$20 million',


In [6]:
# To see the last five records,
wiki_movies_raw[-5:]

[{'url': 'https://en.wikipedia.org/wiki/Holmes_%26_Watson',
  'year': 2018,
  'imdb_link': 'https://www.imdb.com/title/tt1255919/',
  'title': 'Holmes & Watson',
  'Directed by': 'Etan Cohen',
  'Produced by': ['Will Ferrell',
   'Adam McKay',
   'Jimmy Miller',
   'Clayton Townsend'],
  'Screenplay by': 'Etan Cohen',
  'Based on': ['Sherlock Holmes',
   'and',
   'Dr. Watson',
   'by',
   'Sir Arthur Conan Doyle'],
  'Starring': ['Will Ferrell',
   'John C. Reilly',
   'Rebecca Hall',
   'Rob Brydon',
   'Steve Coogan',
   'Ralph Fiennes'],
  'Music by': 'Mark Mothersbaugh',
  'Cinematography': 'Oliver Wood',
  'Edited by': 'Dean Zimmerman',
  'Productioncompanies ': ['Columbia Pictures',
   'Gary Sanchez Productions',
   'Mosaic Media Group',
   'Mimran Schur Pictures'],
  'Distributed by': 'Sony Pictures Releasing',
  'Release date': ['December 25, 2018',
   '(',
   '2018-12-25',
   ')',
   '(United States)'],
  'Running time': '90 minutes',
  'Country': 'United States',
  'Language

In [7]:
# check records in the middle 
wiki_movies_raw[3600:3605]

[{'url': 'https://en.wikipedia.org/wiki/Benji:_Off_the_Leash!',
  'year': 2004,
  'imdb_link': 'https://www.imdb.com/title/tt0315273/',
  'title': 'Benji: Off the Leash!',
  'Directed by': 'Joe Camp',
  'Written by': 'Joe Camp',
  'Starring': ['Benji', 'Nick Whitaker', 'Shaggy', 'Gypsy the Cockatoo'],
  'Music by': 'Antonio di Lorenzo',
  'Productioncompany ': 'Mulberry Square Productions',
  'Distributed by': 'Mulberry Square Productions',
  'Release date': ['March 26, 2004', '(', '2004-03-26', ')'],
  'Running time': '97 min',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$3,817,362'},
 {'url': 'https://en.wikipedia.org/wiki/The_Best_Thief_in_the_World',
  'year': 2004,
  'imdb_link': 'https://www.imdb.com/title/tt0389796/',
  'title': 'The Best Thief in the World',
  'Directed by': 'Jacob Kornbluth',
  'Produced by': ['Tim Perrell', 'Nicola Usborne'],
  'Written by': 'Jacob Kornbluth',
  'Starring': ['Marc Rozendaal',
   'Michael Silverman',
   'David Warsh

In [8]:
############################################
# 8.2.2
# Extract the Kaggle Data
############################################

In [9]:
# Kaggle movie metadata. low_memory=False makes pandas parse the file in one pass
# instead of in chunks, which avoids mixed-dtype inference within a column.
k_metadata=pd.read_csv('../copy_all_files_here/movies_metadata.csv',low_memory=False)
# Per-user ratings (userId, movieId, rating, timestamp - see the sample output below).
ratings=pd.read_csv('../copy_all_files_here/ratings.csv')

In [10]:
#  View the new DFs with .head() method, .tail() method and/or .sample(n=x) method
# k_metadata.head()
ratings.sample(n=10)

Unnamed: 0,userId,movieId,rating,timestamp
9578045,98787,111377,5.0,1466889248
782059,7995,80219,3.5,1322576509
23822563,247380,2000,3.0,1283035568
787318,8083,539,4.0,1109877777
19701265,204615,3556,3.0,1028930687
8483940,87350,2993,5.0,996901453
3877577,40320,1061,3.0,1017511584
17973413,186651,3461,4.5,1099727974
6817067,70389,86,4.0,862178355
12774457,132566,830,3.0,850399912


In [11]:
############################################
# 8.3.1
# Data Cleaning Strategies
############################################

In [12]:
# Bad data comes in three states:
# Beyond repair  -  all we can do is delete it 
# Badly damaged  -  Fill in missing data OR standardize units of measure OR consolidate from multiple columns
# Wrong form     -  convert data types OR parse text data to correct format OR split columns

# Data cleaning requires a lot of improvising.
#  always put in comments to show what you've done so you can refer to those comments in the future

In [13]:
############################################
# 8.3.2
# Iterative Process for Cleaning Data
############################################

In [14]:
# The iterative process for cleaning data can be broken down as follows:
# 1) INSPECT  -  we need to inspect our data and identify a problem.
# 2) PLAN  -  Once we've identified the problem, we need to make a plan and 
#             decide whether it is worth the time and effort to fix it.
# 3) EXECUTE  -  Finally, we execute the repair.

In [15]:
# early iterations to cleaning data include:
#     removing unneeded rows and columns
#     removing dupes
#     consolidating columns
#     reshaping data

#  later iterations to cleaning data shift towards more subtle issues

In [16]:
# 1) INSPECT

# Before we can do anything, we have to look at our data. The first thing we want to know is whether or not the data was imported correctly. The simplest way to confirm this is to print out the first few data points and examine the first few rows for irregularities, e.g., data in the wrong columns, all missing values, column headers that don't make sense, or garbled characters.

# If the data doesn't look correct, we know it wasn't imported correctly. Sometimes the beginning of the data looks fine, but if the import went wrong somewhere in the middle of the process, the rest of the data can be affected.

# Therefore, it's good practice to check the last few rows and a random sample of rows. We can also start to answer some simple questions about the data:

# Does it have a consistent structure (like a CSV table) or is it unstructured (like a collection of email messages)?
# How is each data point identified—is there an explicit, unique ID for each data point, or will one need to be built?
# However, most usable data contains too many data points to review every single one, so we'll need to use strategies that tell us about the whole dataset.

# First, count how many data points or rows exist. If the data is structured, count the number of columns and missing values in each column. If possible, count the number of unique values in each column and how frequently each unique value appears. To determine if this is possible, we'll need to investigate the data types for each column.

# When investigating the data type for a column, we want to know what the data type is and what the data type should be. For example, if we see "True" and "False" as entries for a column, we expect that the data type will be a Boolean. If the data type is a string, we need to investigate further.

# If a column's data type is numeric, we can summarize its data with some basic statistics, such as measures of central tendency (e.g., mean and/or median) and measures of spread (e.g., standard deviation, interquartile range, minimum/maximum). We can also investigate columns with statistical plots, like scatter plots and histograms.

In [17]:
# 2) PLAN

# After we've investigated our data and started to identify problem areas, we can make decisions about how to fix the problems. This requires articulating the problems clearly—even if that is simply expressing the problems to ourselves—and devising a plan to modify the data and fix the problem. In this step, we'll answer several questions, including:

# If a column doesn't have the right data type, is it a problem with the whole column? Or are just a handful of rows causing the issues?
# Do rows have outliers due to spurious data? Or are they valid data points?
# When values are missing, will they need to be removed, replaced, or interpolated?
# The answers to these questions will tell us how we need to modify our data. Keep in mind, there are two main ways: we can modify values and we can modify structure.

# Modifying data values includes removing rows or columns, replacing values, or generating new columns from old ones. We might remove rows with missing or corrupted data, columns with only one value, or columns mostly missing data. There are many ways we might replace data. Instead of dropping missing values, we might replace them with zeros or empty strings. We might have a column that contains nonstandard values, such as percentages that are stored as whole numbers from 0 to 100 and also as fractions from 0 to 1, and we would replace them with one standard form.

# Converting a column to a new data type is also a form of replacing values. We can also bin data (like rounding to the nearest hundred), replacing numeric data (e.g., income) with categorical data (e.g., income brackets). We might generate new columns by splitting an existing column into several new columns—by splitting an address column to street, city, state, and zip code columns, for example—or by calculating a new column from multiple existing columns, like calculating total price by multiplying item prices by quantities.

# Modifying data structure includes pivoting the values of one column into multiple columns, aggregating rows, and merging multiple data sets. It can also include aggregating large amounts of data into summary data or summary statistics.

# With clearly stated steps to fix the problem, we can make an informed decision about whether implementing the plan is worth the effort. Sometimes there are multiple viable resolutions to choose from. To decide, we weigh trade-offs and ultimately choose the best option.

In [18]:
# 3) EXECUTE

# Once we have a detailed list of steps to modify our dataset, it's time to implement it. We'll start writing code to fix the problem we're focusing on.

# As we write, we might discover that the problem is more difficult than initially expected. This is a normal part of the process. As you implement your changes, try to take into account any unintended consequences you could introduce.

# After implementing your changes, the next step is to return and inspect the data in a new iteration. This step is important, especially when modifying data structure, which can introduce missing data points, or inadvertently create more bad data.

In [19]:
############################################
# 8.3.3
# Investigate the Wikipedia Data
############################################

In [20]:
# Build a DataFrame straight from the raw records: every key that appears in
# any record becomes a column, and records without that key get NaN there.
wiki_movies_df = pd.DataFrame(wiki_movies_raw)
# Look at random rows (not just the head) so problems away from the start of the data can show up.
wiki_movies_df.sample(n=3)

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Predecessor,Founders,Area served,Products,Services,Russian,Hebrew,Revenue,Operating income,Polish
1594,https://en.wikipedia.org/wiki/Dragonheart,1996.0,https://www.imdb.com/title/tt0116136/,Dragonheart,Rob Cohen,Raffaella De Laurentiis,Charles Edward Pogue,"[Charles Edward Pogue, Patrick Read Johnson]",,"[Dennis Quaid, David Thewlis, Pete Postlethwai...",...,,,,,,,,,,
6253,https://en.wikipedia.org/wiki/Spirit_of_the_Ma...,2013.0,,Spirit of the Marathon II,Jon Dunham,"[Mark Jonathan Harris, Gwendolen Twist, Jon Du...",,,,"[Cliff Scott, Domenico Anzini, Domenico ""Mimmo...",...,,,,,,,,,,
6872,https://en.wikipedia.org/wiki/The_Discovery_(f...,2017.0,https://www.imdb.com/title/tt5155780/,The Discovery,Charlie McDowell,"[Alex Orlovsky, James D. Stern]",,,,"[Jason Segel, Rooney Mara, Jesse Plemons, Rile...",...,,,,,,,,,,


In [48]:
# 193 columns, that's a lot; let's see them all
sorted(wiki_movies_df.columns.tolist())

['Actor control',
 'Adaptation by',
 'Alias',
 'Alma mater',
 'Also known as',
 'Animation by',
 'Arabic',
 'Area',
 'Area served',
 'Artist(s)',
 'Attraction type',
 'Audio format',
 'Author',
 'Based on',
 'Biographical data',
 'Bopomofo',
 'Born',
 'Box office',
 'Budget',
 'Camera setup',
 'Cantonese',
 'Characters',
 'Children',
 'Chinese',
 'Cinematography',
 'Closing date',
 'Color process',
 'Comics',
 'Composer(s)',
 'Coordinates',
 'Country',
 'Country of origin',
 'Cover artist',
 'Created by',
 'Date premiered',
 'Designer(s)',
 'Developed by',
 'Developer(s)',
 'Dewey Decimal',
 'Died',
 'Directed by',
 'Director',
 'Distributed by',
 'Distributor',
 'Divisions',
 'Duration',
 'Edited by',
 'Editor(s)',
 'Ending theme',
 'Engine',
 'Engine(s)',
 'Executive producer(s)',
 'Family',
 'Fate',
 'Film(s)',
 'Followed by',
 'Format(s)',
 'Formerly',
 'Founded',
 'Founder',
 'Founders',
 'French',
 'Full name',
 'Gender',
 'Genre',
 'Genre(s)',
 'Genres',
 'Gwoyeu Romatzyh',
 'Ha

In [23]:
# Use List Comprehensions to Filter Data
# general format:    [expression for element in source_list]
# conditional:       [expression for element in source_list if filter_expression]

# find movies with a Director AND an imdb.com link
# NB: there are 2 director columns
# fyi, wiki_movies_raw is the json file
# Keep a record only if it has an IMDb link and a director under either column name.
wiki2_movies = [
    record
    for record in wiki_movies_raw
    if 'imdb_link' in record
    and any(key in record for key in ('Director', 'Directed by'))
]
len(wiki2_movies)

7080

In [24]:
# by keeping only movies with a Director and an imdb link, we've cut the list down from 7311 to 7080, not a massive win
# BUT...
# make a DF and look (below) - the number of columns fell from 193 to 78!!!
# that's a big win - 78 is still a lot, but 193 was way more
# why did that happen?
# there may be columns that only apply to rows that do not have data in the columns we’re targeting. 
# In those cases, the columns will be eliminated, as seen here.
# This is why it's easier to load the JSON in first and then convert it to a DataFrame. 
# Instead of trying to identify which columns in our DataFrame don't belong, 
# we just remove the bad data points, and the bad columns never get imported in.

In [25]:
wiki2_movies_df=pd.DataFrame(wiki2_movies)
#wiki2_movies_df
wiki2_movies_df.head(1)

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Hepburn,Literally,Cantonese,Chinese,Yiddish,Arabic,Romanized,Russian,Hebrew,Polish
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,Renny Harlin,"[Steve Perry, Joel Silver]","[David Arnott, James Cappe, Daniel Waters]","[David Arnott, James Cappe]","[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",...,,,,,,,,,,


In [26]:
# there are TV shows in the data - let's remove those with another filter
# Same director + IMDb-link filter as before, additionally dropping anything
# that carries an episode count (i.e. TV shows).
wiki3_movies = [
    record
    for record in wiki_movies_raw
    if 'No. of episodes' not in record
    and 'imdb_link' in record
    and any(key in record for key in ('Director', 'Directed by'))
]
len(wiki3_movies)

7076

In [49]:
wiki3_movies_df=pd.DataFrame(wiki3_movies)
wiki3_movies_df.head(1)

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Hepburn,Literally,Cantonese,Chinese,Yiddish,Arabic,Romanized,Russian,Hebrew,Polish
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,Renny Harlin,"[Steve Perry, Joel Silver]","[David Arnott, James Cappe, Daniel Waters]","[David Arnott, James Cappe]","[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",...,,,,,,,,,,


In [None]:
# this only removed 4 items from our list.
# i'm following the coursework, but i probably would NOT have done it like this. 
# look at the DF above, there are tons of languages that appear to be boolean
# i probably would've started by eliminating those

In [None]:
############################################
# 8.3.4
# Revisit Functions
############################################

In [None]:
# Remember, functions are blocks of code within a script or algorithm that perform a specific task. 
# There are four basic parts to a function:
    # Name
    # Parameters
    # Code block
    # Return value
    
# First, we need to talk about scope. 
# Inside the code block of a function, we can use variables 
#     that were created outside the function and initialize new variables inside the function.

# This is called the "scope" of the variables:
    # Variables created outside the function are called global variables.
    # New variables created inside the function are local variables. 
            #- local variables only work inside the function in which they're created
    # The hierarchy of variables is called the scope.

In [31]:
# Scope Example -->

# module-level (global) binding
x='global value'

def var():
    """Bind a local x that shadows the global x, then print the local one."""
    # plain assignment inside a function creates a new local name;
    # the global x is not modified
    x='local value'
    print(x)

var()       # prints the local binding
print(x)    # the global binding is unchanged

local value
global value


In [None]:
# In that example, x is 'local value' only inside the function call; the global x is untouched.
# However, lists and dictionaries are mutable: if a function modifies one in place
# (rather than rebinding the name), the caller's object changes too.
# Best practice is to make a copy of any list or dictionary that you're about to modify,
# so you don't mess up the original and can always go back to it.
# general format:
    # new_list = list(old_list)
    # new_dict = dict(old_dict)

In [32]:
# Lambda Functions
# Lambda functions are written in one line and automatically return a value without using the return keyword. 
# Lambda functions have no name and are also known as "anonymous functions."
# basic syntax:
#     lambda arguments: expression
# example:
#     lambda x: x * x
# example:
# The lambda is bound to a name only so the example can be called;
# running these two lines is what produces the recorded output of 25.
square = lambda x: x * x
square(5)

25

In [None]:
############################################
# 8.3.5
# Create a Function to Clean the Data, Part 1
############################################

In [34]:
# we're going to make a big function to clean movies, we will add to this code block as we go along

###

def clean_movie(movie):
    """Return a cleaned copy of one Wikipedia movie record.

    Every alternate-title field (foreign-language titles, romanizations,
    'Also known as', ...) is moved out of the top level of the record and
    gathered under a single 'alt_titles' key. This function is meant to grow:
    further cleaning steps get added to it as the notebook goes along.

    Parameters
    ----------
    movie : dict
        One raw record from the Wikipedia JSON. It is not modified.

    Returns
    -------
    dict
        A shallow copy of `movie` without the alternate-title keys. If at
        least one of them was present, the copy carries an 'alt_titles' dict
        mapping each original key to its value; otherwise no 'alt_titles'
        key is added.
    """
    # dict() builds a new (shallow) copy, so the edits below are
    # non-destructive: the caller's record keeps all of its keys.
    # Rebinding the local name `movie` does not affect the caller either.
    movie = dict(movie)

    # These must match the scraped column names exactly. Two easy mistakes:
    # the column is 'Arabic' (not 'Arabice'), and 'McCune–Reischauer' is
    # spelled with an en dash (–), not an ASCII hyphen.
    alt_title_keys = (
        'Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French',
        'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally', 'Mandarin',
        'McCune–Reischauer', 'Original title', 'Polish',
        'Revised Romanization', 'Romanized', 'Russian', 'Simplified',
        'Traditional', 'Yiddish',
    )

    # Move each alternate title that this record has into its own dict.
    alt_titles = {}
    for key in alt_title_keys:
        if key in movie:
            alt_titles[key] = movie.pop(key)

    # Only attach the dict when there is something in it.
    if alt_titles:
        movie['alt_titles'] = alt_titles

    return movie

#################################
# ended here 8.3.5 Step 3!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#################################

In [50]:
# as stated above, i would have started by working on those different language columns...
# so lets do that
# The first one on the list is Arabic, so let's see which movies have a value for "Arabic."
wiki3_movies_df.loc[wiki3_movies_df['Arabic'].notnull()]

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Hepburn,Literally,Cantonese,Chinese,Yiddish,Arabic,Romanized,Russian,Hebrew,Polish
6834,https://en.wikipedia.org/wiki/The_Insult_(film),2018,https://www.imdb.com/title/tt7048622/,The Insult,Ziad Doueiri,"[Rachid Bouchareb, Jean Bréhat, Julie Gayet, A...",,,,"[Adel Karam, Kamel El Basha]",...,,Case No. 23,,,,قضية رقم ٢٣,Qadiyya raqm 23,,,
7058,https://en.wikipedia.org/wiki/Capernaum_(film),2018,https://www.imdb.com/title/tt8267604/,Capernaum,Nadine Labaki,"[Michel Merkt, Khaled Mouzanar]","[Nadine Labaki, Jihad Hojaily, Michelle Keserw...","[Georges Khabbaz, Nadine Labaki, Michelle Kese...",,"[Zain Al Rafeea, Yordanos Shiferaw, Boluwatife...",...,,,,,,کفرناحوم‎,,,,


In [51]:
# coursework shows this - you don't have to use .loc (???) OK
wiki3_movies_df[wiki3_movies_df['Arabic'].notnull()]

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Hepburn,Literally,Cantonese,Chinese,Yiddish,Arabic,Romanized,Russian,Hebrew,Polish
6834,https://en.wikipedia.org/wiki/The_Insult_(film),2018,https://www.imdb.com/title/tt7048622/,The Insult,Ziad Doueiri,"[Rachid Bouchareb, Jean Bréhat, Julie Gayet, A...",,,,"[Adel Karam, Kamel El Basha]",...,,Case No. 23,,,,قضية رقم ٢٣,Qadiyya raqm 23,,,
7058,https://en.wikipedia.org/wiki/Capernaum_(film),2018,https://www.imdb.com/title/tt8267604/,Capernaum,Nadine Labaki,"[Michel Merkt, Khaled Mouzanar]","[Nadine Labaki, Jihad Hojaily, Michelle Keserw...","[Georges Khabbaz, Nadine Labaki, Michelle Kese...",,"[Zain Al Rafeea, Yordanos Shiferaw, Boluwatife...",...,,,,,,کفرناحوم‎,,,,


In [52]:
# urls of the movies that have a value in the 'Arabic' column (row mask and column picked in one .loc call)
wiki3_movies_df.loc[wiki3_movies_df['Arabic'].notnull(), 'url']

6834    https://en.wikipedia.org/wiki/The_Insult_(film)
7058     https://en.wikipedia.org/wiki/Capernaum_(film)
Name: url, dtype: object

In [47]:
# The different language columns are for alternate titles of the movie. 
# Let's combine all of them into one dictionary that has all the alternate titles.
# To do that, we need to go through each of the columns, one by one, 
#     and determine which are alternate titles. Some might be tricky. 
#     If you're not sure what a column name means, google it

In [53]:
# SKILL DRILL
# Go through each of the columns, one by one, and determine which columns hold alternate titles.
sorted(wiki3_movies_df.columns.tolist())

['Adaptation by',
 'Also known as',
 'Animation by',
 'Arabic',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cantonese',
 'Chinese',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Country of origin',
 'Created by',
 'Directed by',
 'Director',
 'Distributed by',
 'Distributor',
 'Edited by',
 'Editor(s)',
 'Executive producer(s)',
 'Followed by',
 'French',
 'Genre',
 'Hangul',
 'Hebrew',
 'Hepburn',
 'Japanese',
 'Label',
 'Language',
 'Length',
 'Literally',
 'Mandarin',
 'McCune–Reischauer',
 'Music by',
 'Narrated by',
 'Original language(s)',
 'Original network',
 'Original release',
 'Original title',
 'Picture format',
 'Polish',
 'Preceded by',
 'Produced by',
 'Producer',
 'Producer(s)',
 'Production company(s)',
 'Production location(s)',
 'Productioncompanies ',
 'Productioncompany ',
 'Recorded',
 'Release date',
 'Released',
 'Revised Romanization',
 'Romanized',
 'Running time',
 'Russian',
 'Screen story by',
 'Screenplay by',
 'Simplifie

In [55]:
# What's Hangul?
# urls of the movies that have a value in the 'Hangul' column
wiki3_movies_df.loc[wiki3_movies_df['Hangul'].notnull(), 'url']

# Clicking on the Shark Bait url, I see it's Korean script (Hangul is the Korean alphabet)

3413    https://en.wikipedia.org/wiki/Wonderful_Days_(...
4238             https://en.wikipedia.org/wiki/Shark_Bait
Name: url, dtype: object

In [58]:
# What's Hepburn?
# wiki3_movies_df['Hepburn'].value_counts()

# urls of the movies that have a value in the 'Hepburn' column
wiki3_movies_df.loc[wiki3_movies_df['Hepburn'].notnull(), 'url']

# Hepburn is a romanization system for Japanese, so these are romanized Japanese titles. see: Shoplifters

6393    https://en.wikipedia.org/wiki/When_Marnie_Was_...
6829    https://en.wikipedia.org/wiki/Mary_and_the_Wit...
7009    https://en.wikipedia.org/wiki/My_Hero_Academia...
7054            https://en.wikipedia.org/wiki/Shoplifters
Name: url, dtype: object

In [60]:
#  What's 'McCune–Reischauer'
# wiki3_movies_df['McCune–Reischauer'].value_counts()

# urls of the movies that have a value in the 'McCune–Reischauer' column (note the en dash in the name)
wiki3_movies_df.loc[wiki3_movies_df['McCune–Reischauer'].notnull(), 'url']

# some other sort of language?
# Google it, and you'll learn it's a romanization system for Korean. 

3413    https://en.wikipedia.org/wiki/Wonderful_Days_(...
4238             https://en.wikipedia.org/wiki/Shark_Bait
Name: url, dtype: object

In [None]:
# Now we can add in code to handle the alternative titles. The logic we need to implement follows:
    # Make an empty dict to hold all of the alternative titles.
    # Loop through a list of all alternative title keys:
        # Check if the current key exists in the movie object.
        # If so, remove the key-value pair and add to the alternative titles dict.
    # After looping through every key, add the alternative titles dict to the movie object.

In [None]:
# NB: the following investigation of columns is not from the module work, its my own investigation

###

# Looking at the sample data above, the 'Predecessor' column, amongst others, appears to be all NaNs.
# First, do a notnull().sum() methods chain to see if its every row
wiki_movies_df['Predecessor'].notnull().sum()

In [None]:
#  OK so there are 3 rows where it's populated...
# Use loc function with notnull() method to find them
a= wiki_movies_df.loc[wiki_movies_df['Predecessor'].notnull()]
a

In [None]:
# Rows whose 'title' is missing, kept in `b` so they can be inspected
b= wiki_movies_df.loc[wiki_movies_df['title'].isnull()]
b