In [4]:
import csv
from pathlib import Path

import requests
from bs4 import BeautifulSoup


In [1]:
# Single-year page fetched and parsed by the exploratory cells that follow
url = 'https://en.wikipedia.org/wiki/List_of_American_films_of_2020' 

In [73]:
# Build one list-page URL per release year so the scraper can be tested across several years
url_base = 'https://en.wikipedia.org/wiki/List_of_American_films_of_'
years = list(range(2018, 2024))  # 2018 through 2023 inclusive

url_list = [f'{url_base}{year}' for year in years]

In [74]:
# Display the generated URLs to confirm there is one entry per year
url_list

['https://en.wikipedia.org/wiki/List_of_American_films_of_2018',
 'https://en.wikipedia.org/wiki/List_of_American_films_of_2019',
 'https://en.wikipedia.org/wiki/List_of_American_films_of_2020',
 'https://en.wikipedia.org/wiki/List_of_American_films_of_2021',
 'https://en.wikipedia.org/wiki/List_of_American_films_of_2022',
 'https://en.wikipedia.org/wiki/List_of_American_films_of_2023']

In [5]:
# Fetch the single-year page. The timeout (seconds) keeps the cell from
# hanging indefinitely if the server stops responding.
response = requests.get(url, timeout=30)

In [16]:
# Stop here with a clear HTTPError on a 4xx/5xx reply. Previously `soup` was
# only assigned on status 200, so a failed request surfaced later as a
# confusing NameError on the `soup.find_all` line.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Every <table> on the page, including non-data tables such as sidebars
tables = soup.find_all('table')

In [18]:
# Collect the <tr> rows of every table, keeping one sub-list per table
rows = []

for table in tables:
    # find() returns None when a table has no explicit <tbody>; search the
    # table itself in that case instead of failing with AttributeError.
    table_body = table.find('tbody')
    if table_body is None:
        table_body = table
    current_table_rows = table_body.find_all('tr')
    rows.append(current_table_rows)

[[<tr><th class="sidebar-title" style="background:#ccccff; font-size:100%; line-height: 1.5em; border-bottom: 2px solid white"><a href="/wiki/Cinema_of_the_United_States" title="Cinema of the United States">American films<br/>by year</a><br/><span typeof="mw:File"><a class="mw-file-description" href="/wiki/File:United_States_film_clapperboard.svg"><img class="mw-file-element" data-file-height="101" data-file-width="114" decoding="async" height="71" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/77/United_States_film_clapperboard.svg/80px-United_States_film_clapperboard.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/77/United_States_film_clapperboard.svg/120px-United_States_film_clapperboard.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/7/77/United_States_film_clapperboard.svg/160px-United_States_film_clapperboard.svg.png 2x" width="80"/></a></span></th></tr>, <tr><th class="sidebar-heading" style="background:#ccccff; border-top:2px solid white; 

In [21]:
# `rows` holds a separate list of rows for each table; merge them into one flat list
rows_flattened = []
for table_rows in rows:
    rows_flattened.extend(table_rows)

In [61]:
# Initialize processing parameters.
# Each field gets an output list plus the cell index to read it from, keyed by
# how many <td> cells the row has. Rows with 5 cells carry one extra leading
# cell, so every field sits one position further right than in 4-cell rows.
table_processing_params = {
    field_name: {
        'data': [],
        'column_indexes': {
            'length_4': position,
            'length_5': position + 1
        }
    }
    for position, field_name in enumerate(
        ('movie_titles', 'production_companies', 'cast_and_crew_members')
    )
}

for table_row in rows_flattened:
    row_cells = table_row.find_all('td')
    width_key = f'length_{len(row_cells)}'

    # Rows whose cell count has no entry in 'column_indexes' contribute nothing
    for field_spec in table_processing_params.values():
        cell_position = field_spec['column_indexes'].get(width_key)
        if cell_position is not None:
            field_spec['data'].append(row_cells[cell_position].get_text(strip=True))

# The first rows are related to a separate table of top movies. We want to filter those out
movie_titles, production_companies, cast_and_crew_members = (
    table_processing_params[field_name]['data'][8:]
    for field_name in ('movie_titles', 'production_companies', 'cast_and_crew_members')
)

In [76]:
def scrape_table_from_url(url, timeout=30):
    """Scrape film titles, production companies and cast/crew from one list page.

    Downloads ``url``, looks at every ``<table class="wikitable">`` and reads
    three text fields from each row that has exactly 4 or 5 ``<td>`` cells.
    Rows with any other cell count (for example header rows, which have no
    ``<td>``) are skipped.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``. Defaults to 30.

    Returns:
        A dict with the keys ``'movie_titles'``, ``'production_companies'``
        and ``'cast_and_crew_members'``. Each maps to a list of stripped
        strings, and the three lists always have the same length.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Index of the title cell, keyed by the number of <td> cells in the row.
    # 5-cell rows carry one extra leading cell, so their fields shift right by one.
    title_index_by_row_length = {4: 0, 5: 1}

    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than parsing an error page into empty results
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Create a dictionary to store data for this specific URL
    url_data = {
        'movie_titles': [],
        'production_companies': [],
        'cast_and_crew_members': []
    }

    for table in soup.find_all('table', {'class': 'wikitable'}):
        # find() returns None for a table without an explicit <tbody>;
        # fall back to the table itself instead of raising AttributeError.
        table_body = table.find('tbody')
        if table_body is None:
            table_body = table

        for row in table_body.find_all('tr'):
            cells = row.find_all('td')
            title_index = title_index_by_row_length.get(len(cells))
            if title_index is None:
                continue  # Skip rows with unexpected length

            # Title, production company and cast/crew occupy three consecutive cells
            title, company, cast = (
                cell.get_text(strip=True)
                for cell in cells[title_index:title_index + 3]
            )
            url_data['movie_titles'].append(title)
            url_data['production_companies'].append(company)
            url_data['cast_and_crew_members'].append(cast)

    return url_data


all_data = {}

# Loop through each URL and scrape data, storing it in the all_data dictionary.
# The loop variable is deliberately not called `url`, so the single-year `url`
# used by the exploratory cells is not silently replaced by the last list entry.
for year_url in url_list:
    year = year_url.split('_')[-1]  # Extract the year from the URL for labeling
    all_data[year] = scrape_table_from_url(year_url)

# Drop the first 8 entries of every 2020 list because of inconsistency in that page's HTML.
# NOTE(review): this comment used to say "first 3 rows" while the code drops 8 - confirm the intended count.
# Iterating over the keys alone avoids a loop variable named `list`, which
# previously shadowed the builtin for every cell executed afterwards.
for key in all_data['2020']:
    all_data['2020'][key] = all_data['2020'][key][8:]

In [77]:
# Print every scraped field for each year, with a blank gap between years
for year, data in all_data.items():
    print(f"Data for {year}:")
    for label, field_key in (
        ("Movie Titles:", 'movie_titles'),
        ("Production Companies:", 'production_companies'),
        ("Cast and Crew Members:", 'cast_and_crew_members'),
    ):
        print(label, data[field_key])
    print("\n")

Data for 2018:
Movie Titles: ['Insidious: The Last Key', 'The Strange Ones', 'The Commuter', 'Proud Mary', 'Acts of Violence', 'Freak Show', 'Humor Me', '12 Strong', 'Den of Thieves', 'Forever My Girl', 'Thane of East County', 'Maze Runner: The Death Cure', 'Please Stand By', 'Winchester', 'A Fantastic Woman', 'Armed', 'The Cloverfield Paradox', 'Bad Apples', 'Peter Rabbit', 'Pad Man', 'Fifty Shades Freed', 'The 15:17 to Paris', 'Permission', 'Golden Exits', 'Black Panther', 'Nostalgia', 'Samson', 'Game Night', 'Annihilation', 'Every Day', 'The Cured', 'Red Sparrow', 'Death Wish', 'The Vanishing of Sidney Hall', 'Pickings', 'A Wrinkle in Time', 'Gringo', 'Thoroughbreds', 'The Strangers: Prey at Night', 'The Hurricane Heist', 'Tomb Raider', 'Love, Simon', 'I Can Only Imagine', 'Entebbe', 'Furlough', 'Josie', 'Flower', 'Pacific Rim Uprising', 'Isle of Dogs', 'Sherlock Gnomes', 'Unsane', 'Paul, Apostle of Christ', 'Final Portrait', 'Midnight Sun', 'Ready Player One', 'Acrimony', "God's No

In [87]:
import csv

In [91]:
# Check which fields are stored per year before flattening the data for export
all_data['2020'].keys()

dict_keys(['movie_titles', 'production_companies', 'cast_and_crew_members'])

In [92]:
# Create a CSV from our data: one row per film, labelled with its year.
# The header is spelled out here rather than read from the first row, so the
# export still works (header only) when nothing was scraped.
csv_fieldnames = ['Year', 'Movie Title', 'Production Company', 'Cast and Crew Members']

flattened_data = []
for year, info in all_data.items():
    # The three lists are appended to together by the scraper, so zip pairs
    # up the fields belonging to the same film.
    film_fields = zip(
        info['movie_titles'],
        info['production_companies'],
        info['cast_and_crew_members'],
    )
    for title, company, cast in film_fields:
        flattened_data.append({
            'Year': year,
            'Movie Title': title,
            'Production Company': company,
            'Cast and Crew Members': cast,
        })

# Make sure the export folder exists so the cell also runs on a fresh checkout
output_path = Path('CSV exports') / 'output_test.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit UTF-8 keeps non-ASCII characters in titles and names from failing
# on platforms whose default text encoding is not UTF-8.
with open(output_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_fieldnames)
    writer.writeheader()
    writer.writerows(flattened_data)