In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import json

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pydotplus

In [2]:
# The Wikipedia list is split alphabetically across four pages; only the
# page-range suffix inside the parentheses differs between them.
_WIKI_LIST_URL = 'https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_({})'
_PAGE_RANGES = ('0%E2%80%939_and_A%E2%80%93C', 'D%E2%80%93J', 'K%E2%80%93R', 'S%E2%80%93Z')
urls = [_WIKI_LIST_URL.format(page_range) for page_range in _PAGE_RANGES]

Scrape Wikipedia for works of fiction that have been turned into feature films

In [37]:
# Load the local screenplay.json file (path relative to the notebook's
# working directory) into a DataFrame.
origDF = pd.read_json('screenplay.json')

In [38]:
# Preview the first rows to see which columns the JSON file provides.
origDF.head()

Unnamed: 0,box_office,budget,director,genre,name,rating,runtime,url,user_rating,year
0,"$45,055,776","$20,000,000",Tom McCarthy,"[' Crime', ' Drama', ' History']",Spotlight,R,128 min,http://www.imdb.com/title/tt1895587/,8.1,2015
1,"$47,695,120","$8,500,000",Kenneth Lonergan,[' Drama'],Manchester by the Sea,R,137 min,http://www.imdb.com/title/tt4034228/,7.9,2016
2,,"$350,000",Preston Sturges,[' Comedy'],The Great McGinty,Passed,82 min,http://www.imdb.com/title/tt0032554/,7.5,1940
3,"$1,585,634","$839,727",Orson Welles,"[' Drama', ' Mystery']",Citizen Kane,Approved,119 min,http://www.imdb.com/title/tt0033467/,8.4,1941
4,,,George Stevens,"[' Comedy', ' Drama', ' Romance', ' Sport']",Woman of the Year,Not Rated,114 min,http://www.imdb.com/title/tt0035567/,7.3,1942


In [40]:
# Column dtypes and non-null counts, to spot columns with missing values.
origDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76 entries, 0 to 75
Data columns (total 10 columns):
box_office     59 non-null object
budget         58 non-null object
director       76 non-null object
genre          76 non-null object
name           76 non-null object
rating         76 non-null object
runtime        75 non-null object
url            76 non-null object
user_rating    76 non-null float64
year           76 non-null int64
dtypes: float64(1), int64(1), object(8)
memory usage: 6.5+ KB


In [42]:
# Number of rows with no box_office value.
origDF['box_office'].isna().sum()

17

In [3]:
def return_dataframe(url):
    '''Scrape one Wikipedia "List of fiction works made into feature films"
    page and return its table rows as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the list page to download.

    Returns
    -------
    pandas.DataFrame
        Columns 'fiction_work' and 'film_adaptation', one row per <tr>
        element after the first two on the page. Each value is the text of
        the corresponding <td> cell; a row with fewer than two <td> cells is
        padded with None.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not respond within the timeout.
    '''
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page as if it were the list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    columns = ['fiction_work', 'film_adaptation']
    bookMovie = []
    # The first two <tr> elements are skipped. find_all() only yields Tag
    # objects, so no per-row None check is needed (the old comparison against
    # the string 'NoneType' could never be true).
    for tr in soup.find_all('tr')[2:]:
        cells = [td.text for td in tr.find_all('td')]
        # Pad short rows explicitly so the frame always has both columns.
        cells += [None] * (len(columns) - len(cells))
        bookMovie.append(cells)

    return pd.DataFrame(bookMovie, columns=columns)

In [4]:
# Try the scraper on a single page first: urls[3] is the S–Z list.
df = return_dataframe(urls[3])

In [5]:
# Inspect the first scraped rows of the single-page result.
df.head()

Unnamed: 0,fiction_work,film_adaptation
0,"The Saga of Pecos Bill (1923), Edward O'Reilly",Melody Time (1948)\nTall Tale (1995)
1,"Sahara (1992), Clive Cussler",Sahara (2005)
2,The Sailor Who Fell from Grace with the Sea (1...,The Sailor Who Fell from Grace with the Sea (1...
3,"Saint Jack (1973), Paul Theroux",Saint Jack (1979)
4,"Saint Johnson (1930), W. R. Burnett",Law and Order (1932)\nWild West Days (1937)


In [6]:
# Empty placeholder frame for the combined Wikipedia results.
wikiDF = pd.DataFrame()

In [7]:
# Scrape every list page, then combine the per-page frames in one step.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside the loop re-copied the accumulated frame on every pass.
page_frames = []
for url in urls:
    df = return_dataframe(url)
    print(df.head(2))  # quick visual check of each page's first rows
    page_frames.append(df)
# Building wikiDF from the collected frames (rather than extending whatever
# it held before) keeps this cell idempotent when it is re-run.
wikiDF = pd.concat(page_frames, ignore_index=True)

                                fiction_work       film_adaptation
0          100 Rifles (1958), Robert MacLeod     100 Rifles (1969)
1  101 Reykjavík (1996), Hallgrímur Helgason  101 Reykjavík (2000)
                          fiction_work           film_adaptation
0  The Da Vinci Code (2003), Dan Brown  The Da Vinci Code (2006)
1          Dad (1981), William Wharton                Dad (1989)
  fiction_work                                    film_adaptation
0         \n\n  This article does not cite any sources. Please...
1         None                                               None
                                     fiction_work  \
0  The Saga of Pecos Bill (1923), Edward O'Reilly   
1                    Sahara (1992), Clive Cussler   

                        film_adaptation  
0  Melody Time (1948)\nTall Tale (1995)  
1                         Sahara (2005)  


In [8]:
# First rows of the combined frame built from all pages.
wikiDF.head()

Unnamed: 0,fiction_work,film_adaptation
0,"100 Rifles (1958), Robert MacLeod",100 Rifles (1969)
1,"101 Reykjavík (1996), Hallgrímur Helgason",101 Reykjavík (2000)
2,The 120 Days of Sodom (Les 120 journées de Sod...,Salò (1975)
3,"The 25th Hour (2001), David Benioff",25th Hour (2002)
4,"2010: Odyssey Two (1982), Arthur C. Clarke",2010 (1984)


In [9]:
# (rows, columns) of the combined frame before any cleaning.
wikiDF.shape

(1665, 2)

In [10]:
# Keep only the rows that have no film_adaptation value, for inspection.
nullWiki = wikiDF.loc[wikiDF['film_adaptation'].isna()]

In [11]:
# How many rows are missing a film_adaptation value.
nullWiki.shape

(24, 2)

In [12]:
# Look at a few of the rows with a missing film_adaptation.
nullWiki.head()

Unnamed: 0,fiction_work,film_adaptation
14,,
139,,
307,,
315,Bicho de Sete Cabeças (The Great Brain Storm) ...,
541,,


Drop the rows whose film_adaptation value is None

In [13]:
# Remove the rows with a missing film_adaptation. inplace=True mutates
# wikiDF itself, so the unfiltered frame is no longer available afterwards.
wikiDF.dropna(subset=["film_adaptation"],axis=0,inplace=True)

In [14]:
# Row count after dropping the rows with a missing film_adaptation.
wikiDF.shape

(1641, 2)

In [15]:
#wikiDF[['film_adaptation','film_year']] = wikiDF['film_adaptation'].str.split('(',expand=True)
#df[['V','allele']] = df['V'].str.split('-',expand=True)

In [16]:
def find_year(name_year):
    '''Return every four-digit year that follows an opening parenthesis.

    Parameters
    ----------
    name_year : str
        Text containing one or more "Title (YYYY)" entries, for example
        "100 Rifles (1969)".

    Returns
    -------
    list of str
        The years as strings, in order of appearance (e.g. ['1969']); an
        empty list when the text contains no parenthesised year.
    '''
    # Raw string: '\s' and '\d' are invalid escapes in a plain literal.
    # '\(' anchors on the opening parenthesis -- the previous pattern used '.'
    # there, which accepted any character (" 12345" produced a bogus "2345"),
    # and required leading whitespace, which missed "Title(1999)".
    # The lookahead rejects digit runs longer than four.
    return re.findall(r'\((\d{4})(?!\d)', name_year)

In [17]:
# One list of year strings per row, extracted from the film_adaptation text.
wikiDF['film_year'] = wikiDF['film_adaptation'].apply(find_year)

In [18]:
# Check the extracted film_year lists against the film_adaptation text.
wikiDF.head(10)

Unnamed: 0,fiction_work,film_adaptation,film_year
0,"100 Rifles (1958), Robert MacLeod",100 Rifles (1969),[1969]
1,"101 Reykjavík (1996), Hallgrímur Helgason",101 Reykjavík (2000),[2000]
2,The 120 Days of Sodom (Les 120 journées de Sod...,Salò (1975),[1975]
3,"The 25th Hour (2001), David Benioff",25th Hour (2002),[2002]
4,"2010: Odyssey Two (1982), Arthur C. Clarke",2010 (1984),[1984]
5,"4.50 From Paddington (1957), Agatha Christie","Murder, She Said (1961)\nLe crime est notre af...","[1961, 2008]"
6,"42nd Street (1930), Bradford Ropes",42nd Street (1933),[1933]
7,"48 Shades of Brown (1999), Nick Earls",48 Shades (2006),[2006]
8,"491 (1961), Lars Görling",491 (1964),[1964]
9,"50 Shades of Grey (2011), E.L. James",50 Shades of Grey (2015),[2015]


In [19]:
# NOTE: film_year holds lists of year strings, so max() compares whole lists
# element by element as strings; the result is the "largest" list, not
# necessarily the latest individual year in the column.
wikiDF.film_year.max()

['2015']

In [20]:
# Column dtypes and non-null counts after adding film_year.
wikiDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1641 entries, 0 to 1664
Data columns (total 3 columns):
fiction_work       1641 non-null object
film_adaptation    1641 non-null object
film_year          1641 non-null object
dtypes: object(3)
memory usage: 51.3+ KB


In [21]:
# Save the frame to movie_year.csv in the working directory; with the default
# settings the index is written as the first column.
wikiDF.to_csv('movie_year.csv')

In [22]:
# w['female'] = w['female'].map({'female': 1, 'male': 0})
#wikiDF.film_year.replace(to_replace='[]',value=None,inplace=True)

In [23]:
# min() returns the smallest list in the column; a length of 0 means at least
# one row has an empty film_year list (no year was found in its text).
len(wikiDF.film_year.min())

0

In [24]:
#wikiDF.dropna(subset=["film_year"],axis=0,inplace=True)

In [25]:
#wikiDF.film_year.min()

In [26]:
# def drop_empty_list(elist):
#     if len(elist) == 0:
#         return None

In [27]:
# wikiDF['film_year'] = wikiDF['film_year'].apply(lambda x: drop_empty_list(x))

In [28]:
# Count null film_year entries; an empty list is not null, so rows where no
# year was found are not counted here.
wikiDF['film_year'].isna().sum()

0