In [100]:
# Import packages
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [101]:
# Pull rotten tomatoes data
# Download the editorial list page once; the cells below scrape years, titles
# and scores out of the parsed document held in `bs`.
url = 'https://editorial.rottentomatoes.com/guide/140-essential-action-movies-to-watch-now/'
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
request = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response rather than parsing an error page, which
# would silently yield empty or mismatched columns further down.
request.raise_for_status()
# Name the parser explicitly so the parse does not depend on which optional
# parser library happens to be installed (and to avoid bs4's parser warning).
bs = BeautifulSoup(request.content, 'html.parser')

In [102]:
# Years column
# Each matching span's text has its surrounding parentheses trimmed and is
# then converted to an int.
year_elements = bs.find_all('span', class_="subtle start-year")
years = [int(tag.text.strip('()')) for tag in year_elements]

In [103]:
# Title column
# Collect every anchor whose href matches '/m/', then drop the anchors that
# carry no title text: blank/whitespace-only links and the '[More]' links.
title_elements = bs.find_all('a', href=re.compile('/m/'))
titles = [element.text for element in title_elements]
# The filter compares the *stripped* text, so whitespace-only strings (including
# bare newlines) already collapse to ''. The kept titles are left unstripped.
titles = [title for title in titles if title.strip() not in ('[More]', '')]

In [104]:
# Score column
# Each matching span's text has its '%' sign trimmed and is converted to an int.
score_elements = bs.find_all('span', class_ = 'tMeterScore', style = 'margin-right: 10px;')
scores = [int(tag.text.strip('%')) for tag in score_elements]

In [105]:
# Make dataframe
# Assemble the three scraped lists into one table; the dict order fixes the
# column order (Title, Year, Score). Lists of unequal length raise ValueError.
df = pd.DataFrame({
    'Title': titles,
    'Year': years,
    'Score': scores,
})

In [106]:
# Add rank column
# Rank 1 goes to the highest score; tied scores all receive the smallest rank
# of their group (method='min').
score_rank = df['Score'].rank(method='min', ascending=False)
df['Rank'] = score_rank.astype(int)

In [107]:
# Add Category column
# Left-closed bins (right=False): Score < 60 -> Rotten, 60 <= Score < 75 -> Fresh,
# Score >= 75 -> Certified Fresh.
category_edges = [-float('inf'), 60, 75, float('inf')]
category_labels = ['Rotten', 'Fresh', 'Certified Fresh']
df['Category'] = pd.cut(
    df['Score'],
    bins=category_edges,
    labels=category_labels,
    right=False,
)

In [108]:
# Sort by Score
# Lowest scores first; ascending order is the default and is spelled out here.
df = df.sort_values('Score', ascending=True)

In [109]:
# Reset the indexing
# Replace the row labels with a fresh 0..n-1 index; drop=True discards the old
# labels instead of keeping them as an extra column.
df = df.reset_index(drop=True)

**a**

In [110]:
# a
# Mean of the Score column over all rows of the table.
df['Score'].mean()

81.42857142857143

**b**

In [111]:
# b
# Correlation between Score and Rank (Series.corr uses Pearson unless a
# different `method` is given). Rank 1 was assigned to the highest score, so a
# strongly negative coefficient is expected.
cor_coef = df['Score'].corr(df['Rank'])
cor_coef

-0.9446540262404389

**c**

In [112]:
# c
# Average Rank within each Category; observed=False keeps every category label
# in the result, including any that contain no rows.
rank_by_category = df.groupby('Category', observed=False)['Rank']
mean_rank_by_category = rank_by_category.mean()
mean_rank_by_category

Category
Rotten             134.727273
Fresh              113.931034
Certified Fresh     48.240000
Name: Rank, dtype: float64

**d**

In [113]:
# d
# Round each year down to the start of its decade by removing the final digit's
# remainder (e.g. 1994 -> 1990).
df['Decade'] = df['Year'] - df['Year'] % 10

**e**

In [114]:
# e
# Group once by decade, then derive both per-decade summaries from that grouping:
# the mean Score and the number of titles. The title counts are displayed.
by_decade = df.groupby('Decade', observed=False)
avg_score_by_decade = by_decade['Score'].mean()
moviecount_by_decade = by_decade['Title'].count()
moviecount_by_decade


Decade
1960     2
1970     9
1980    27
1990    37
2000    30
2010    35
Name: Title, dtype: int64

In [115]:
# e
# Display the per-decade mean Score computed in the previous cell.
avg_score_by_decade

Decade
1960    98.500000
1970    91.000000
1980    79.777778
1990    76.918919
2000    78.933333
2010    86.171429
Name: Score, dtype: float64

**f**

In [116]:
# f
# Write to csv file
# index=False leaves the row index out of the file, so only the data columns
# are written.
output_path = 'action.csv'
df.to_csv(output_path, index=False)