In [2]:
import pickle
import re

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [3]:
# Load the movie table; the CSV path is relative to the notebook's working directory.
data = pd.read_csv('../data/wiki_movies/filtered_wiki_movies_scraped.csv')

In [4]:
# Clean the loaded frame in a single non-inplace chain so the cell can be
# re-run safely:
#   * keep only the first row for each distinct "Title",
#   * shorten the verbose column names,
#   * drop the leftover "Unnamed: 0" column; errors="ignore" makes this a
#     no-op when the column is already gone (e.g. on a second run).
# The row index is intentionally left as-is (not reset), so labels keep
# pointing at the original CSV rows.
data = (
    data
    .drop_duplicates(subset=['Title'])
    .rename(columns={"Release Year": "Year", "Origin/Ethnicity": "Origin", "scraped_summary": "Plot Summary"})
    .drop(columns="Unnamed: 0", errors="ignore")
)

In [5]:
# Preview the first five rows of the current frame.
data.head()

Unnamed: 0,Year,Title,Origin,Director,Cast,Genre,Wiki Page,Plot,Plot Summary
0,1907,Daniel Boone,American,Wallace McCutcheon and Ediwin S. Porter,"William Craven, Florence Lawrence",biographical,https://en.wikipedia.org/wiki/Daniel_Boone_(19...,Boone's daughter befriends an Indian maiden as...,"Daniel Boone; or, Pioneer Days in America is a..."
1,1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(fi...,The plot is that of a black woman going to the...,Laughing Gas is the title of several American ...
2,1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,https://en.wikipedia.org/wiki/The_Adventures_o...,On a beautiful summer day a father and mother ...,The Adventures of Dollie is a 1908 American si...
3,1908,The Black Viper,American,D. W. Griffith,D. W. Griffith,drama,https://en.wikipedia.org/wiki/The_Black_Viper,A thug accosts a girl as she leaves her workpl...,The Black Viper (aka La vipère noire in France...
4,1908,A Calamitous Elopement,American,D.W. Griffith,"Harry Solter, Linda Arvidson",comedy,https://en.wikipedia.org/wiki/A_Calamitous_Elo...,A young couple decides to elope after being ca...,A Calamitous Elopement is a 1908 American sile...


In [6]:
# Print the shape of the array of distinct titles, i.e. a 1-tuple holding their count.
print(data['Title'].unique().shape)

(26198,)


In [7]:
# Number of rows per title, as a positionally indexed Series named "count".
count_series = (
    data.groupby(['Title'])
    .size()
    .rename('count')
    .reset_index(drop=True)
)
# Any entry shown here is a title that occurs more than once.
count_series.loc[count_series.gt(1)]

Series([], Name: count, dtype: int64)

In [8]:
# Inspect a single row by *position*. .iloc ignores index labels, so the
# displayed row label can differ from 13916 (the index is not reset after
# duplicates are dropped).
data.iloc[13916]

Year                                                         2007
Title                                                        Numb
Origin                                                   American
Director                                          Harris Goldberg
Cast                                                Matthew Perry
Genre                                                comedy-drama
Wiki Page               https://en.wikipedia.org/wiki/Numb_(film)
Plot            When modestly successful screenwriter Hudson M...
Plot Summary                                                  NaN
Name: 14567, dtype: object

In [9]:
# Persist the current frame as a pickle file; the context manager closes the
# file handle even if pickling fails.
with open('../data/wiki_movies/filtered_data_wiki_movies.pickle', 'wb') as f:
    pickle.dump(data, f)

In [10]:
def parse_cols(df, parse_func_dict):
    """Print the parsed value of each configured column, row by row.

    Args:
        df: Frame whose rows are visited in positional order.
        parse_func_dict: Mapping of column name -> callable applied to that
            column's value in the current row.

    Returns:
        None. Each parser result is written to stdout with ``print``; a tqdm
        progress bar tracks the row loop.
    """
    n_rows = df.shape[0]
    for row_pos in tqdm(range(n_rows)):
        for column, parser in parse_func_dict.items():
            cell_value = df[column].iloc[row_pos]
            print(parser(cell_value))

def parse_year(year):
    """Return ``year`` converted to its ``str`` representation."""
    year_text = str(year)
    return year_text

def parse_origin(origin):
    """Return ``origin`` converted to its ``str`` representation."""
    origin_text = str(origin)
    return origin_text

# Whole-word matchers. Plain substring checks would also fire inside names
# such as "Ruby", "Bobby" or "Anderson" and corrupt them.
_BY_WORD = re.compile(r"\bby\b")
_AND_WORD = re.compile(r"\band\b")


def _parse_name_list(raw):
    """Split a free-text credit string into normalised, space-free tokens.

    Processing steps:
      1. Lower-case the text and delete newline characters.
      2. Split on ``", "``.
      3. In each piece, keep only what follows the last whole word "by"
         (e.g. "directed by john ford" -> " john ford").
      4. Split the remainder on the whole word "and".
      5. Remove all spaces from every part and discard empty parts.

    Args:
        raw: The cell value. Anything that is not a ``str`` (for example the
            float NaN pandas uses for missing cells) is treated as empty.

    Returns:
        list[str]: Tokens in their original order; ``[]`` when nothing usable
        is found.
    """
    if not isinstance(raw, str):
        return []

    text = raw.lower().replace("\n", "")
    tokens = []
    for piece in text.split(", "):
        # Drop a credit prefix such as "directed by"; a no-op when the word
        # "by" does not occur.
        piece = _BY_WORD.split(piece)[-1]
        # Splitting on the word "and" also removes a leading "and" left over
        # from an Oxford comma ("a, b, and c").
        for part in _AND_WORD.split(piece):
            token = part.replace(" ", "")
            if token:
                tokens.append(token)
    return tokens


def parse_director(director_str):
    """Parse a director credit string into a list of lower-case, space-free names.

    Args:
        director_str: Raw cell value; non-string values yield ``[]``.

    Returns:
        list[str]: One token per director, e.g.
        ``"A B and C D" -> ["ab", "cd"]``.
    """
    return _parse_name_list(director_str)


def parse_genre(genre_str):
    """Parse a genre string into a list of lower-case, space-free genre tokens.

    Args:
        genre_str: Raw cell value; non-string values yield ``[]``.

    Returns:
        list[str]: One token per genre, e.g. ``"comedy, drama" -> ["comedy", "drama"]``.
    """
    return _parse_name_list(genre_str)


def parse_casts(cast_str):
    """Parse a cast string into a list of lower-case, space-free actor names.

    Args:
        cast_str: Raw cell value; non-string values yield ``[]``.

    Returns:
        list[str]: One token per cast member, e.g.
        ``"William Craven, Florence Lawrence" -> ["williamcraven", "florencelawrence"]``.
    """
    return _parse_name_list(cast_str)

# Column name -> parser to apply; only the "Cast" column is processed here.
parse_func_dict = {"Cast": parse_casts}

# Walk every row and print the parsed cast list.
parse_cols(data, parse_func_dict)


  0%|          | 0/26198 [00:00<?, ?it/s]

['williamcraven', 'florencelawrence']
['bertharegustus', 'edwardboulden']
['arthurv.johnson', 'lindaarvidson']
['d.w.griffith']
['harrysolter', 'lindaarvidson']
['charlesinslee']
['florenceauer', 'johng.adolfi']
['marionleonard']
['arthurv.johnson']
['marypickford', 'macksennett']
['henryb.walthall']
['sidneyolcott', 'genegauntier', "thomaso'connor"]
['marypickford', 'henryb.walthall']
['charlesogle', 'nataliejerome']
['edgarg.wynn']
['marieeline', 'florencelabadie', 'mignonanderson', 'williamrussell']
['marypickford', 'kingbaggot']
['jamescruze']
['elmerbooth', 'lilliangish']
['williamgarwood', 'margueritesnow']
['macksennett', 'mabelnormand', 'fordsterling', 'barneyoldfield']
['fordsterling', 'mabelnormand']
['williamgarwood', 'marieeline']
['louiseglaum']
['j.warrenkerrigan', 'paulinebush']
['maemarsh', 'clairemcdowell']
['henryb.walthall', 'blanchesweet']
['donaldcrisp', 'liliangish']
['edwardabeles', 'sydneydeane', 'josephsingleton']
['charliechaplin', 'mackswain', 'phyllisallen']