In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import packages.csv_to_df as csv_to_df
import requests
import re

In [2]:
# Función para extraer y ordenar todas las incidencias que encuentra de un partido en base a su URL
def get_incidences(url):
    incidences = []
    
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        incidence_table = soup.find_all('table', class_='datosPartido')
        
        if incidence_table:
            for table in incidence_table:
                rows = table.find_all('tr')
                for row in rows:
                    columns = row.find_all('td')
                    if len(columns) >= 2:
                        minute = columns[0].get_text().strip()
                        text = columns[1].get_text().strip()
                        if 'Gol' in text:
                            split_text = text.split(' de ')
                            if len(split_text) == 2:
                                incidence_type = split_text[0]
                                pattern = r'\(([^()]{0,4})\)'
                                match = re.search(pattern, split_text[1])
                                if match:
                                    goalscorer = split_text[1]
                                    goalscorer = re.sub(f' {pattern}', '', goalscorer)
                                    team = 'Rival'
                                else:
                                    goalscorer = split_text[1]
                                    team = 'Boca'
                                incidences.append([minute, goalscorer, incidence_type, team])
                                
                        elif 'le atajó un penal' in text and minute != '':
                            pattern = r'^(.*?) le (.*)$'
                            coincidence = re.match(pattern, text)
                            goalkeeper = coincidence.group(1).strip()
                            gk_pattern = r'\(([^()]{0,4})\)'
                            match = re.search(gk_pattern, goalkeeper)
                            if match:
                                goalkeeper = re.sub(f' {gk_pattern}', '', goalkeeper)
                                team = 'Rival'
                            else: team = 'Boca'
                            incidence_type = coincidence.group(2).strip()
                            incidence_match = re.search(gk_pattern, incidence_type)
                            if incidence_match:
                                incidence_type = re.sub(f' {gk_pattern}', '', incidence_type)
                            incidences.append([minute, goalkeeper, incidence_type, team])
                            
                        elif 'expulsado' in text:
                            player = re.sub(' expulsado', '', text)
                            pattern = r'\(([^()]{0,4})\)'
                            match = re.search(pattern, player)
                            if match:
                                player = re.sub(f' {pattern}', '', player)
                                team = 'Rival'
                            else: team = 'Boca'
                            incidence_type = 'Expulsado'
                            incidences.append([minute, player, incidence_type, team])
                        
                        elif 'desvió un penal' in text and minute != '':
                            player = re.sub(' desvió un penal', '', text)
                            pattern = r'\(([^()]{0,4})\)'
                            match = re.search(pattern, player)
                            if match:
                                player = re.sub(f' {pattern}', '', player)
                                team = 'Rival'
                            else: team = 'Boca'
                            incidence_type = 'Erró un penal'
                            incidences.append([minute, player, incidence_type, team])
                        if minute == '':
                            minute = 'Penales'
                            if 'Definición por penales:' in text:
                                text = re.sub('Definición por penales: ', '', text)

                            if 'atajó un penal' in text:
                                pattern = r'^(.*?) le (.*)$'
                                coincidence = re.match(pattern, text)
                                goalkeeper = coincidence.group(1).strip()
                                gk_pattern = r'\(([^()]{0,4})\)'
                                match = re.search(gk_pattern, goalkeeper)
                                if match:
                                    goalkeeper = re.sub(f' {gk_pattern}', '', goalkeeper)
                                    team = 'Boca'
                                else: team = 'Rival'
                                incidence_type = coincidence.group(2).strip()
                                incidence_match = re.search(gk_pattern, incidence_type)
                                if incidence_match:
                                    incidence_type = re.sub(f' {gk_pattern}', '', incidence_type)
                                incidences.append([minute, goalkeeper, incidence_type, team])
                                    
                            elif 'Convirtió' in text:
                                player = re.sub('Convirtió ', '', text)
                                pattern = r'\(([^()]{0,4})\)'
                                match = re.search(pattern, player)
                                if match:
                                    player = re.sub(f' {pattern}', '', player)
                                    team = 'Rival'
                                else: team = 'Boca'
                                incidence_type = 'Convirtió su penal'
                                incidences.append([minute, player, incidence_type, team])
                                
                            elif 'desvió un penal' in text:
                                player = re.sub(' desvió un penal', '', text)
                                pattern = r'\(([^()]{0,4})\)'
                                match = re.search(pattern, player)
                                if match:
                                    player = re.sub(f' {pattern}', '', player)
                                    team = 'Rival'
                                else: team = 'Boca'
                                incidence_type = 'Erró su penal'
                                incidences.append([minute, player, incidence_type, team])
                        
        else: incidences = ['None', 'None', 'No hubo incidencencias en el partido', 'None']
        
        return incidences
    
    else: return f'Error {response.status_code}'

In [3]:
# Extraer todas las incidencias elegidas en una competición seleccionado
# (La competición puede ser de un año en particular o de toda la historia)
# (El nombre de la competición debe coincidir en cómo está escrita en Boca_DF)
def get_incidences_type_list(competition, incidence_type):
    boca_df = csv_to_df.to_df('Boca_DF.csv')
    df = boca_df[boca_df['Competición'].str.contains(competition)]
    
    incidences_type_list = []

    for index, row in df.iterrows():
        game = row['URL']
        url = f'https://www.historiadeboca.com.ar/{game}'
        incidences = get_incidences(url)
        for incidence in incidences:
            if incidence_type in incidence[2] and 'Boca' in incidence[-1]:
                pattern = r'\((.*?)\)'
                match = re.search(pattern, incidence[1])
                if match:
                    player = re.sub(pattern, '', incidence[1])
                    player = re.sub(r'\s+', '', player)
                    player = re.sub(r'\.', '. ', player)
                    incidences_type_list.append(player)
                else:
                    player = incidence[1]
                    player = re.sub(r'\s+', '', player)
                    player = re.sub(r'\.', '. ', player)
                    incidences_type_list.append(player)
            else: continue
            
    return incidences_type_list

In [4]:
# Ordena la lista del incidente seleccionado
def sort_incendes_list(list):
    table = []

    for element in list:
        found = False
        for item in table:
            if item[0] == element:
                item[1] += 1
                found = True
                break
        if not found:
            table.append([element, 1])
    
    incidences_table = sorted(table, key=lambda x: x[1], reverse=True)
    
    return incidences_table

In [5]:
competition = 'Libertadores'
incidence_type = 'Gol'

list = get_incidences_type_list(competition, incidence_type)
sorted_list = sort_incendes_list(list)
df = pd.DataFrame(sorted_list, columns=['Jugador', incidence_type])
df.head(15)

Unnamed: 0,Jugador,Gol
0,J. R. Riquelme,25
1,M. Palermo,23
2,C. Tevez,22
3,Gmo. BarrosSchelotto,18
4,M. Delgado,18
5,R. Palacio,15
6,A. C. Rojas,12
7,D. Benedetto,11
8,N. Menéndez,10
9,A. H. Rojas,10
