In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

#### Raspagem: lista de mulheres nomeadas e ganhadoras do Oscar em categorias em que gêneros competem.

**Obs.: Essa foi uma raspagem inicial de uma página da Wikipedia. No projeto final, a fonte usada foi a base de dados do Oscar mostrada mais abaixo.**

Como alguns elementos ocupam mais que uma linha, não é possível fazer a raspagem iterando entre os múltiplos de cada célula. A solução encontrada foi:

- Para o ano ('year'): tentar converter os 4 primeiros caracteres da célula em inteiro. Se não houver erro, é porque a célula é um ano válido; ele é salvo na variável `new_year`.
- Para o status ('Nominated' ou 'Won'): todas as células de status possuem `<td class="no table-no2" (...)>` para _Nominated_ ou `<td class="yes table-yes2" (...)>` para _Won_. Verifica-se se a célula atual é uma dessas e, se sim, o texto dela, da categoria (título da tabela) e do ano `new_year` é adicionado em suas respectivas listas.


In [7]:
url = "https://en.wikipedia.org/wiki/List_of_female_Academy_Award_winners_and_nominees_for_non-gendered_categories"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

year = []
category = []
status = []
all_tables = soup.find_all(class_='wikitable')

for table in all_tables:
	td = table.find_all('td')
	new_year = 0
	for i in range(0, len(td)):
		try:
			old_year = new_year
			new_year = int(td[i].text[0:4])
			if new_year < 1000:
				new_year = old_year
		except ValueError:
			pass
		if td[i].get('class'):
			status.append(td[i].text[0:-1])
			category.append(td[0].text[18:-1])
			if new_year != 0:
				year.append(new_year)

oscar_women_df = pd.DataFrame({
	'year': year,
	'category': category,
	'status': status
})

#	renomeia as categorias que possuíam sobrescrito no título.
oscar_women_df.category = oscar_women_df.category.replace({
	'Best Sound [note 12]': 'Best Sound Mixing',
	'Best Original Score [note 7]': 'Best Original Score'
})

#	corrige um typo na tabela original.
oscar_women_df.status = oscar_women_df.status.replace({'Nominated}': 'Nominated'})

oscar_women_df


Unnamed: 0,year,category,status
0,2007,Best Animated Feature,Nominated
1,2011,Best Animated Feature,Nominated
2,2012,Best Animated Feature,Won
3,2013,Best Animated Feature,Won
4,2013,Best Animated Feature,Nominated
...,...,...,...
1418,2017,Best Original Screenplay,Nominated
1419,2017,Best Original Screenplay,Nominated
1420,2018,Best Original Screenplay,Nominated
1421,1917,Best Original Screenplay,Nominated


# Raspagem: [base de dados da Academia de Artes e Ciências Cinematográficas](https://awardsdatabase.oscars.org/).
Devido à dinamicidade do site, para a automação completa seria necessária o uso do Selenium. Para os fins do projeto, o código da página de busca avançada foi salvo nos arquivos `nominees-men.html` e `nominees-women.html`. As páginas possuem as seguintes categorias:

**Páginas salvas:**
1. Best Picture
2. Directing
3. Writing
4. Cinematography
5. Costume Design
6. Produdction Design
7. Documentary Feature
8. Film Editing
9. Makeup
10. International Feature Film
11. Visual Effects

### A FAZER:
[ ] Pesquisar um jeito mais simples de usar a split com múltiplos delimitadores, sem precisar usar o `.replace()` (linha 14)
	- Separadores: vírgula, ponto e vírgula, '&', 'and'
	- Remover nomes 'Producer', 'Producers', 'Written by' e 'Screenplay by'
[ ] Quando um diretor é indicado a dois filmes, somente o primeiro filme é minerado (e terminado por ';')

In [24]:
# Cria o dataframe para páginas de busca com apenas uma categoria.
def get_oscar_df(file):
	f = open(file)
	soup = BeautifulSoup(f, 'html.parser')
	df = pd.DataFrame({'year': [], 'category': [], 'film': [], 'name': [], 'status': []})

	all_groups = soup.find_all(class_='awards-result-chron result-group group-awardcategory-chron')
	for group in all_groups:
		year = group.find(class_="result-group-title").text.strip()[0:4]
		category = group.find(class_="result-subgroup-header").text.strip()
		all_films = group.find_all(class_="result-details awards-result-actingorsimilar")
		for row in all_films:
			film = row.find(class_="awards-result-film-title").text.strip()
			if row.find(class_="glyphicon glyphicon-star"):
				status = 'winner'
			else:
				status = 'nominated'

			all_names = row.find(class_="awards-result-nominationstatement").text.replace(' and ', ',').replace(';', ',').split(',')

			all_names = [i for i in all_names if 'Producer' not in i]
			for name in all_names:
				name = name.strip()
				row = pd.DataFrame({'year': [year],
									'category': [category],
									'film': [film],
									'name': [name],
									'status': [status]
									  })
				df = pd.concat([df, row], ignore_index=True)
				df = df[df.name != 'Jr.']
	return df


In [None]:
df = get_oscar_df('oscar-pages/directing-men.html')
df

In [69]:
# Cria o dataframe para páginas de buscas com múltiplas categorias.
def get_oscar_df_all(file):
	f = open(file)
	soup = BeautifulSoup(f, 'html.parser')
	df = pd.DataFrame({'year': [], 'category': [], 'film': [], 'name': [], 'status': []})

	all_groups = soup.find_all(class_='awards-result-chron result-group group-awardcategory-chron')
	for group in all_groups:
		year = group.find(class_="result-group-title").text.strip()[0:4]
		all_categories = group.find_all(class_="result-subgroup subgroup-awardcategory-chron")
		for category in all_categories:
			category = category.find(class_="result-subgroup-title").text.strip()
			all_films = group.find_all(class_="awards-result-nomination awards-result-nomination-actingorsimilar")
			for row in all_films:
				film = row.find(class_="awards-result-film-title")
				try:
					film = row.find(class_="awards-result-film-title").text.strip()
				except AttributeError:
					film = None
				if row.find(class_="glyphicon glyphicon-star"):
					status = 'winner'
				else:
					status = 'nominated'

				all_names = row.find(class_="awards-result-nominationstatement").text.replace(' and ', ',').replace(';', ',').split(',')

				all_names = [i for i in all_names if 'Producer' not in i]
				for name in all_names:
					name = name.strip()
					row = pd.DataFrame({'year': [year],
										'category': [category],
										'film': [film],
										'name': [name],
										'status': [status]
										  })
					df = pd.concat([df, row], ignore_index=True)
					df = df[df.name != 'Jr.']
	return df

In [71]:
df_men = get_oscar_df_all("oscar-pages/nominees-men.html")
df_men

Unnamed: 0,year,category,film,name,status
0,1927,ART DIRECTION,Sunrise,Rochus Gliese,nominated
1,1927,ART DIRECTION,The Dove;,William Cameron Menzies,nominated
2,1927,ART DIRECTION,7th Heaven,Harry Oliver,nominated
3,1927,ART DIRECTION,The Devil Dancer;,George Barnes,nominated
4,1927,ART DIRECTION,Sunrise,Charles Rosher,nominated
...,...,...,...,...,...
72145,2021,WRITING (Original Screenplay),Don't Look Up,Story by Adam McKay & David Sirota,nominated
72146,2021,WRITING (Original Screenplay),King Richard,Written by Zach Baylin,nominated
72147,2021,WRITING (Original Screenplay),Licorice Pizza,Written by Paul Thomas Anderson,nominated
72148,2021,WRITING (Original Screenplay),The Worst Person in the World,Written by Eskil Vogt,nominated


In [66]:
df_women = get_oscar_df_all("oscar-pages/nominees-women.html")
df_women

Unnamed: 0,year,category,film,name,status
0,1928,WRITING,Our Dancing Daughters,Josephine Lovett,nominated
1,1928,WRITING,A Woman of Affairs;,Bess Meredyth,nominated
2,1929,WRITING,The Big House,Frances Marion,nominated
3,1931,WRITING (Original Story),The Champ,Frances Marion,nominated
4,1931,WRITING (Original Story),What Price Hollywood?,Adela Rogers St. Johns,nominated
...,...,...,...,...,...
10344,2021,WRITING (Adapted Screenplay),West Side Story,Steven Spielberg,nominated
10345,2021,WRITING (Adapted Screenplay),West Side Story,Kristie Macosko Krieger,nominated
10346,2021,WRITING (Adapted Screenplay),CODA,Screenplay by Siân Heder,nominated
10347,2021,WRITING (Adapted Screenplay),The Lost Daughter,Written by Maggie Gyllenhaal,nominated


In [72]:
df_women.category.value_counts()

COSTUME DESIGN                                                                                                                               1289
FILM EDITING                                                                                                                                 1211
DOCUMENTARY (Feature)                                                                                                                        1194
BEST PICTURE                                                                                                                                 1085
ART DIRECTION                                                                                                                                 806
MAKEUP                                                                                                                                        617
WRITING (Adapted Screenplay)                                                                                                

In [79]:
df_men.to_csv('oscar-men.csv', index=False)

In [77]:
df_women.to_csv('oscar-women.csv', index=False)

In [80]:
df = pd.read_csv('oscar-men.csv')
df

Unnamed: 0,year,category,film,name,status
0,1927,ART DIRECTION,Sunrise,Rochus Gliese,nominated
1,1927,ART DIRECTION,The Dove;,William Cameron Menzies,nominated
2,1927,ART DIRECTION,7th Heaven,Harry Oliver,nominated
3,1927,ART DIRECTION,The Devil Dancer;,George Barnes,nominated
4,1927,ART DIRECTION,Sunrise,Charles Rosher,nominated
...,...,...,...,...,...
72145,2021,WRITING (Original Screenplay),Don't Look Up,Story by Adam McKay & David Sirota,nominated
72146,2021,WRITING (Original Screenplay),King Richard,Written by Zach Baylin,nominated
72147,2021,WRITING (Original Screenplay),Licorice Pizza,Written by Paul Thomas Anderson,nominated
72148,2021,WRITING (Original Screenplay),The Worst Person in the World,Written by Eskil Vogt,nominated
