In [1]:
import pandas as pd
from bs4 import BeautifulSoup

# Load the saved Oscar results page and parse it into a tree of tags.
# The file is opened in a `with` block so the handle is closed as soon as
# BeautifulSoup has consumed it (the bare open() call leaked the handle).
with open("../oscar.html") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")


## Separando/organizando os dados

Todas as edições do Oscar são divididas, respectivamente, em:

**Ano > Categorias > Indicados > Vencedor**

### Variáveis
* **HTML_editions:** HTML com edições
* **categories:** DataFrame com todas as categorias | *categorie-year-winner*
* **unique_categorie:** DataFrame com cada categoria única | *categorie-first_year-count*
* **nominees:** DataFrame com todos os indicados | *name-year-movie-type-category-is_winner-is_movie*
* **winners:** Dataframe com todos os vencedores | *nome-categoria-tipo-ano*
    

In [2]:
# Every matching <div> is one Oscar edition; each edition contains category
# sub-groups, and each category contains one "result-details" div per nominee.
HTML_editions = soup.find_all("div", class_="awards-result-chron result-group group-awardcategory-chron")

# Raw rows collected while walking the page (turned into DataFrames in later cells):
#   editions   -> [year, number_of_categories]
#   categories -> [year, category_name]
#   nominees   -> [year, category_name, movie_label_or_False, nominee_name_or_False, is_winner]
editions = []

categories = []
nominees = []

# Categories listed here are recorded in `categories`, but their nominees are skipped.
special_categories = ['HONORARY FOREIGN LANGUAGE FILM AWARD', 'SPECIAL AWARD', 'HONORARY AWARD',
                      'IRVING G. THALBERG MEMORIAL AWARD','JEAN HERSHOLT HUMANITARIAN AWARD',
                      'SPECIAL FOREIGN LANGUAGE FILM AWARD']

# CSS classes of the two pieces of a nominee entry (loop-invariant, so set once).
stat = "awards-result-nominationstatement"
title = "awards-result-film-title"

for edition in HTML_editions:
    # The group title starts with a year; the stored year is that value + 1
    # (presumably film year -> ceremony year; confirm against the source page).
    year = int(edition.find(class_="result-group-title").find('a').contents[0].split(' ')[0]) + 1
    HTML_category = edition.find_all("div", class_="subgroup-awardcategory-chron")

    editions.append([year, len(HTML_category)])

    for category in HTML_category:
        category_name = category.find(class_="result-subgroup-title").find('a').contents[0].rstrip()
        categories.append([year, category_name])

        if category_name in special_categories:
            continue

        HTML_nominee = category.find_all("div", class_="result-details")

        for nominee in HTML_nominee:
            # A star glyph marks the winning entry. The flag is called `is_winner`
            # (not `winner`) so it cannot overwrite the winner() helper function
            # when this cell is re-run after the helpers have been defined.
            is_winner = nominee.find(class_='glyphicon-star') is not None

            # Look each element up once instead of repeating find() in every branch.
            stat_tag = nominee.find(class_=stat)
            title_tag = nominee.find(class_=title)

            if stat_tag is None and title_tag is None:
                # Neither a nominee name nor a film title: nothing to record.
                continue

            # Missing pieces are stored as False, the placeholder later cells test for.
            nominee_name = False
            if stat_tag is not None:
                nominee_name = stat_tag.find('a').contents[0].rstrip()

            movie_name = False
            if title_tag is not None:
                # The year is appended after a '|' so that same-named films from
                # different years get distinct labels; splitter() strips it again.
                movie_name = title_tag.find('a').contents[0].rstrip() + '| ('+str(year)+')'

            nominees.append([year, category_name, movie_name, nominee_name, is_winner])


In [3]:
# Convert the scraped nominee rows into a DataFrame and label its five columns.
nominee_columns = ['year', 'category', 'movie', 'nominee', 'is_winner']
nominees = pd.DataFrame(nominees)
nominees.columns = nominee_columns
nominees.head()

Unnamed: 0,year,category,movie,nominee,is_winner
0,1929,ACTOR,The Noose| (1929),Richard Barthelmess,False
1,1929,ACTOR,The Last Command| (1929),Emil Jannings,True
2,1929,ACTRESS,A Ship Comes In| (1929),Louise Dresser,False
3,1929,ACTRESS,7th Heaven| (1929),Janet Gaynor,True
4,1929,ACTRESS,Sadie Thompson| (1929),Gloria Swanson,False


In [4]:
# Winners only: keep the rows flagged is_winner and renumber them from zero
# (drop=True discards the old index instead of keeping it as a column).
won = nominees.loc[nominees['is_winner']].reset_index(drop=True)
won.head()

Unnamed: 0,year,category,movie,nominee,is_winner
0,1929,ACTOR,The Last Command| (1929),Emil Jannings,True
1,1929,ACTRESS,7th Heaven| (1929),Janet Gaynor,True
2,1929,ART DIRECTION,The Dove;| (1929),William Cameron Menzies,True
3,1929,CINEMATOGRAPHY,Sunrise| (1929),Charles Rosher,True
4,1929,CINEMATOGRAPHY,Sunrise| (1929),Karl Struss,True


In [5]:
import re

def typeMovie(category):
    """Classify a category name into a coarse category type.

    The rules are tried in order and the first regular expression found
    anywhere in ``category`` decides the result, so a name matching several
    rules gets the earliest one. Matching is case-sensitive.

    Parameters
    ----------
    category : str
        Category name as scraped (e.g. ``'FILM EDITING'``).

    Returns
    -------
    str
        One of 'DIRECTION', 'ACTORS', 'MOVIE', 'MUSIC', 'CINEMATOGRAPHY',
        'WRITING', 'PRODUCTION', or 'OTHER' when no rule matches.
    """
    ordered_rules = (
        ("DIRECTING|DIRECTION|DIRECTOR", 'DIRECTION'),
        ("ACTOR|ACTRESS", 'ACTORS'),
        ("FILM|SHORT|DOCUMENTARY", 'MOVIE'),
        ("SOUND|MUSIC", 'MUSIC'),
        ("CINEMATOGRAPHY", 'CINEMATOGRAPHY'),
        ("WRITING", 'WRITING'),
        ("PRODUCTION", 'PRODUCTION'),
    )
    for pattern, label in ordered_rules:
        if re.search(pattern, category):
            return label
    return 'OTHER'

def fist_year(category):
    """Return the earliest year in which ``category`` appears.

    Reads the module-level ``ct_year`` DataFrame (columns 'category' and
    'year'), so that frame must exist before this function is called.
    The result is NaN when no row matches.
    """
    same_category = ct_year['category'] == category
    return ct_year.loc[same_category, 'year'].min()

def indications(movie):
    """Count the rows of the global ``nominees`` frame whose 'movie' equals ``movie``."""
    matches = nominees['movie'] == movie
    return int(matches.sum())

def winner(movie):
    """Count the winning rows of the global ``nominees`` frame for ``movie``.

    Parameters
    ----------
    movie : str
        Movie label exactly as stored in ``nominees['movie']``.

    Returns
    -------
    int
        Number of rows for that movie whose 'is_winner' flag is set.
    """
    movie_rows = nominees.loc[nominees['movie'] == movie]
    # Build the mask from the subset itself. Masking with the full-frame
    # nominees['is_winner'] made pandas reindex the key and emit
    # "Boolean Series key will be reindexed to match DataFrame index".
    return len(movie_rows[movie_rows['is_winner']])

def movie_year(movie):
    """Return the smallest 'year' among the global ``nominees`` rows for ``movie``.

    The result is NaN when no row matches.
    """
    same_movie = nominees['movie'] == movie
    return nominees.loc[same_movie, 'year'].min()

def splitter(movie):
    """Return the part of a movie label before the first '|'.

    ``False`` (the placeholder used when a row has no movie) yields ``None``.
    """
    if movie is False:
        return None
    name, _separator, _suffix = movie.partition('|')
    return name

In [6]:
# Turn the scraped [year, category_name] rows into a DataFrame.
categories = pd.DataFrame(categories)
categories.columns = ['year', 'category']

# ct_year is a copy of `categories`; fist_year() reads it as a global,
# so it has to be defined before that helper is used in later cells.
ct_year = categories.copy()
# Per-category minimum year computed in one grouped pass. This gives the same
# values as applying fist_year() to every row, without re-filtering the whole
# frame once per row.
ct_year['first_year'] = ct_year.groupby('category')['year'].transform('min')
ct_year.head()

Unnamed: 0,year,category,first_year
0,1929,ACTOR,1929
1,1929,ACTRESS,1929
2,1929,ART DIRECTION,1929
3,1929,CINEMATOGRAPHY,1929
4,1929,DIRECTING (Comedy Picture),1929


In [7]:
# One row per category. After count(), the 'year' column holds the number of
# rows recorded for that category (not a year); most frequent categories first.
unique_categorie = (
    categories
    .groupby('category')
    .count()
    .reset_index()
    .sort_values('year', ascending=False)
)

# Add the first year each category appears and its coarse type, then index by name.
unique_categorie['first_year'] = unique_categorie['category'].apply(fist_year)
unique_categorie['type'] = unique_categorie['category'].apply(typeMovie)
unique_categorie = unique_categorie.set_index('category')

unique_categorie.head()

Unnamed: 0_level_0,year,first_year,type
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DIRECTING,88,1930,DIRECTION
FILM EDITING,83,1935,MOVIE
ACTOR IN A SUPPORTING ROLE,81,1937,ACTORS
ACTRESS IN A SUPPORTING ROLE,81,1937,ACTORS
DOCUMENTARY (Short Subject),74,1942,MOVIE


## Banco de pessoas e filmes

Com todas as categorias, indicados e vencedores em **categories**, **nominees** e **winners**, está na hora de buscar mais informações sobre os filmes, atores e diretores.

Usando a API da Wikipedia para pegar mais informações e as imagens dos atores e filmes



In [8]:
# Convert the [year, category_count] rows into a DataFrame and number the
# editions 1..N in row order.
editions = pd.DataFrame(editions)
editions.columns = ['year', 'participation']
editions['edition'] = list(range(1, len(editions) + 1))
editions.head()

Unnamed: 0,year,participation,edition
0,1929,13,1
1,1930,7,2
2,1931,8,3
3,1932,9,4
4,1933,13,5


In [9]:
import numpy as np
# One row per distinct movie label, with its earliest year and the number of
# nominee rows that reference it.
unique_movie_labels = np.unique(nominees['movie'].tolist())
movies = pd.DataFrame(unique_movie_labels, columns=['movie'])
movies['year'] = movies['movie'].apply(movie_year)
movies['indications'] = movies['movie'].apply(indications)

# KNOWN ISSUE (Titanic): two movies share the same name

In [10]:
# Count wins per movie, order by number of nominations (most first), and only
# then replace each label by the text before its '|' separator.
win_counts = movies['movie'].apply(winner)
movies = movies.assign(win=win_counts).sort_values('indications', ascending=False)
movies['movie'] = movies['movie'].apply(splitter)



In [11]:
def getWinner(year, category):
    """Return the first recorded winner of ``category`` in ``year``.

    Looks the winner up in the global ``nominees`` frame. The filter is
    evaluated once; previously the same selection was computed twice.

    Parameters
    ----------
    year : int
        Edition year as stored in ``nominees['year']``.
    category : str
        Category name as stored in ``nominees['category']``.

    Returns
    -------
    object
        ``None`` when there is no winning row. For categories whose name
        matches only the film-oriented pattern, the movie label with its
        '|' suffix removed by ``splitter``. Otherwise the value of the
        'nominee' column.
    """
    winning_rows = nominees.loc[
        (nominees['year'] == year)
        & (nominees['category'] == category)
        & (nominees['is_winner'] == True)
    ]
    if winning_rows.empty:
        return None

    # The person-oriented pattern takes precedence: a category matching both
    # patterns (e.g. 'ART DIRECTION' style names) reports the nominee.
    is_person_award = re.search("DIRECTING|DIRECTION|DIRECTOR|ACTOR|ACTRESS", category) is not None
    is_film_award = re.search("FILM|SHORT|DOCUMENTARY|CINEMATOGRAPHY|WRITING|SOUND|MUSIC|PRODUCTION", category) is not None

    if is_film_award and not is_person_award:
        return splitter(winning_rows['movie'].iloc[0])
    # Person awards and categories matching neither pattern both report the nominee.
    return winning_rows['nominee'].iloc[0]

# Add one column per category to `editions`, holding that year's winner
# (None where getWinner finds no winning row).
for category in unique_categorie.index.values:
    winners_by_year = editions['year'].apply(lambda edition_year: getWinner(edition_year, category))
    editions[category] = winners_by_year


editions

Unnamed: 0,year,participation,edition,DIRECTING,FILM EDITING,ACTOR IN A SUPPORTING ROLE,ACTRESS IN A SUPPORTING ROLE,DOCUMENTARY (Short Subject),DOCUMENTARY (Feature),CINEMATOGRAPHY,...,SPECIAL ACHIEVEMENT AWARD (Sound Effects),SPECIAL ACHIEVEMENT AWARD (Sound Editing),DOCUMENTARY,SHORT FILM (Dramatic Live Action),MUSIC (Original Song Score or Adaptation Score),MUSIC (Music Score of a Dramatic Picture),MUSIC (Adaptation Score),DIRECTING (Comedy Picture),DIRECTING (Dramatic Picture),WRITING (Title Writing)
0,1929,13,1,,,,,,,Sunrise,...,,,,,,,,Lewis Milestone,Frank Borzage,
1,1930,7,2,Frank Lloyd,,,,,,White Shadows in the South Seas,...,,,,,,,,,,
2,1931,8,3,Lewis Milestone,,,,,,With Byrd at the South Pole,...,,,,,,,,,,
3,1932,9,4,Norman Taurog,,,,,,Tabu,...,,,,,,,,,,
4,1933,13,5,Frank Borzage,,,,,,Shanghai Express,...,,,,,,,,,,
5,1934,13,6,Frank Lloyd,,,,,,A Farewell to Arms,...,,,,,,,,,,
6,1935,17,7,Frank Capra,Eskimo,,,,,Cleopatra,...,,,,,,,,,,
7,1936,18,8,John Ford,A Midsummer Night's Dream,,,,,A Midsummer Night's Dream,...,,,,,,,,,,
8,1937,21,9,Frank Capra,Anthony Adverse,Walter Brennan,Gale Sondergaard,,,Anthony Adverse,...,,,,,,,,,,
9,1938,22,10,Leo McCarey,Lost Horizon,Joseph Schildkraut,Alice Brady,,,The Good Earth,...,,,,,,,,,,


In [12]:
# Index the table by movie name, then drop every row carrying the label of
# the final row.
# NOTE(review): presumably this removes the placeholder entry produced by
# rows without a movie, which sorts last with zero nominations - confirm.
movies_by_name = movies.set_index('movie')
last_label = movies_by_name.index[-1]
movies = movies_by_name.drop(last_label)
movies.head()

Unnamed: 0_level_0,year,indications,win
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
All about Eve,1951.0,14,6
Titanic,1998.0,14,11
La La Land,2017.0,14,6
Gone with the Wind,1940.0,13,8
The Curious Case of Benjamin Button,2009.0,13,3


# Salvando

Salvando as informações em arquivos CSV (por exemplo, **movies_local.csv**) para usar em outro notebook.

In [13]:
# Write each table to its CSV file under ./database, in this order.
csv_exports = [
    (nominees, './database/nominees_local.csv'),
    (unique_categorie, './database/unique_categorie.csv'),
    (movies, './database/movies_local.csv'),
    (editions, './database/all_editions.csv'),
]
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path)

In [14]:
# Write all four tables into one workbook, one sheet per table.
# The writer is used as a context manager so the file is saved and closed on
# exit, even if a sheet fails to write; ExcelWriter.save() no longer exists in
# pandas 2.x. sheet_name is passed by keyword because positional use is deprecated.
with pd.ExcelWriter('./database/oscar_local.xlsx') as excel:
    editions.to_excel(excel, sheet_name='Edições')
    nominees.to_excel(excel, sheet_name='Todos os indicados')
    unique_categorie.to_excel(excel, sheet_name='Todas as Categorias')
    movies.to_excel(excel, sheet_name='Filmes indicados')

In [15]:
# from tmdbv3api import Movie
# from tmdbv3api import TMDb

# movie = Movie()
# tmdb = TMDb()

# tmdb.api_key = os.environ['TMDB_API_KEY']  # needs `import os`; never commit the key itself

# def find_movie(mv):
#     search = movie.search(mv)
#     return len(search)

In [16]:
# movies['same_titles'] = [find_movie(x) for x in movies.index.values]

In [17]:
# no_match = movies.loc[movies['same_titles'] == 0]

In [18]:
# no_match

In [19]:
# bigger_then = movies.loc[movies['same_titles'] > 1].sort_values('same_titles', ascending=False)
# bigger_then

In [20]:
# unique = movies.loc[movies['same_titles'] == 1]
# unique

In [21]:
# from tmdbv3api import Movie
# from tmdbv3api import TMDb

# movie = Movie()
# tmdb = TMDb()

# tmdb.api_key = os.environ['TMDB_API_KEY']  # needs `import os`; never commit the key itself

# teste = movie.search('Titanic')

# teste

In [22]:
#tmdbv3api
#omdb
#imdb
# year | director | image | imdb_rate | categories | duration | budget | return | country | language



In [23]:
# unique