## Things left to do:

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.common import exceptions
from selenium.webdriver.common.keys import Keys
from datetime import datetime

### Get players stats match by match

In [81]:
def unfold_stats(driver):
    """Click 'Show More' repeatedly to reveal all hidden stats.

    Waits up to 5 seconds for an element with class 'stats-see-more-btn'
    to be present, clicks it, and repeats until the wait times out.

    :param driver: selenium webdriver with the stats page loaded.
    :returns None
    """

    locator = ('class name', 'stats-see-more-btn')
    while True:
        try:
            # The wait returns the element it located, so click that one
            # rather than looking the button up a second time (the
            # find_element_by_* shortcuts no longer exist in current
            # Selenium releases).
            show_more_btn = WebDriverWait(driver, 5).until(
                ec.presence_of_element_located(locator))
        except exceptions.TimeoutException:
            # The wait timed out: no (further) button showed up.
            return
        show_more_btn.click()
        
            
def unfold_players(driver):
    """Click every 'Full List' label to reveal all hidden players.

    :param driver: selenium webdriver with the stats page loaded.
    :raises selenium.common.exceptions.TimeoutException: if no element with
    class 'stats-category-full-list-label' is present within 5 seconds.
    """

    locator = ('class name', 'stats-category-full-list-label')
    WebDriverWait(driver, 5).until(ec.presence_of_element_located(locator))
    # Locator-based lookup: the find_elements_by_* shortcuts no longer exist
    # in current Selenium releases, while find_elements(by, value) does.
    for btn in driver.find_elements(*locator):
        btn.click()


def get_stat_label(elem):
    """Return the label text of a stat table element.

    The label is the text of the second child node of the table's first
    direct 'div' child.
    """

    direct_divs = elem.find_all('div', recursive=False)
    header = direct_divs[0]
    return header.contents[1].text


def get_player_id_score(player):
    """Extract the id and the score of a player list element.

    The id is the part of the first child's 'href' that follows 'player/';
    the score is the text of the third child. Both are returned as ints.
    """

    link = player.contents[0]
    player_id = int(link['href'].split('player/')[1])
    score_text = player.contents[2].text
    return player_id, int(score_text)


def stats_scraper(stats, item_type='player'):
    """Gets a list of web elements, returns players (or teams) stats.

    :param stats: List of web elements of stats tables
    :param item_type: str. Available inputs: 'player' or 'team'.
    Default: 'player'.
    :returns nested dict of the form {stat: {item_id: score}}. item_id and
    score are whatever get_player_id_score / get_team_score returns for
    each 'li' of the table. When several tables share a label, only the
    first one is scraped.
    :raises ValueError: if item_type is neither 'player' nor 'team'.
    """

    if item_type not in ('player', 'team'):
        # An unknown type used to fall through both branches and silently
        # produce an empty score dict for every stat.
        raise ValueError(
            f"item_type must be 'player' or 'team', got {item_type!r}")

    items_stats = dict()
    for stat in stats:
        label = get_stat_label(stat)
        if label in items_stats:
            continue
        # Both parsers return an (item_id, score) pair for one 'li' element.
        if item_type == 'player':
            parse_item = get_player_id_score
        else:
            parse_item = get_team_score
        items_stats[label] = dict(
            parse_item(item) for item in stat.find_all('li'))

    return items_stats


def create_players_stats_df(players_stats, season, gw):
    """Build a one-row-per-player DataFrame from scraped stats.

    :param players_stats: nested dict of the form {stat: {player_id: score}}.
    Must contain a 'Minutes' entry; its player ids define the rows.
    :param season: value written to the 'Season' column of every row.
    :param gw: value written to the 'Gameweek' column of every row.
    :returns DataFrame with columns 'pid', one column per stat, 'Season' and
    'Gameweek'. A stat that has no score for a player is filled with 0.
    """

    stat_names = list(players_stats.keys())
    records = [
        {'pid': pid,
         **{name: players_stats[name].get(pid, 0) for name in stat_names}}
        for pid in players_stats['Minutes']
    ]

    stats_df = pd.DataFrame(data=records, columns=['pid'] + stat_names)
    stats_df['Season'] = season
    stats_df['Gameweek'] = gw
    return stats_df



def select_position(driver, pos='all'):
    """Filter stats on web page by position.

    :param driver: selenium webdriver with the stats page loaded.
    :param pos: str. With 'all' (default) the span inside the
    'select2-selectPosition-container' element is clicked (presumably the
    control that clears the current selection - confirm) and then the
    position dropdown. Any other value is typed into the dropdown's search
    input and confirmed with ENTER.
    """

    position_dropdown = ('//*[@id="stats-page-widget-react"]/div/ul/li[2]'
                         '/span/span[1]/span')

    # find_element(by, value) replaces the find_element_by_xpath shortcut,
    # which no longer exists in current Selenium releases.
    if pos == 'all':
        driver.find_element(
            'xpath', '//*[@id="select2-selectPosition-container"]/span').click()
        driver.find_element('xpath', position_dropdown).click()
    else:
        driver.find_element('xpath', position_dropdown).click()
        search_input = driver.find_element(
            'xpath', '/html/body/span/span/span[1]/input')
        search_input.send_keys(pos)
        search_input.send_keys(Keys.ENTER)

In [3]:
# Scrape the stats page as it is right after loading; nothing is unfolded
# here (the unfold_players call below is commented out).
url = 'https://www.football.co.il/en/stats'
driver = webdriver.Chrome()
driver.get(url)
# unfold_players(driver)
html = driver.page_source
# NOTE(review): no parser is named here, unlike the later cells that pass
# features='lxml'.
soup = BeautifulSoup(html)
# Elements matched under the stats widget; passed on as the stat tables.
poi = soup.select('#stats-page-widget-react > div > div', recursive=False)

# stats_scraper returns {stat: {player_id: score}} (item_type defaults to 'player').
player_stats_per_gw = stats_scraper(poi)

In [72]:
# Collect the stat tables twice: once for the page as loaded, once after
# filtering by 'goalie', and concatenate both lists in poi.
url = 'https://www.football.co.il/en/stats'
driver = webdriver.Chrome()
driver.get(url)
# Reveal every stat table and every player before reading the page source.
unfold_stats(driver)
unfold_players(driver)
html = driver.page_source
# NOTE(review): no parser is named here, unlike the later cells that pass
# features='lxml'.
soup = BeautifulSoup(html)
poi = soup.select('#stats-page-widget-react > div > div', recursive=False)
print(len(poi))
# Switch the position filter, unfold again and append the new tables.
select_position(driver, 'goalie')
unfold_stats(driver)
unfold_players(driver)
html = driver.page_source
soup = BeautifulSoup(html)
# stats_scraper keeps only the first table per label, so tables whose label
# was already collected above are ignored later on.
poi = poi + soup.select('#stats-page-widget-react > div > div', recursive=False)
print(len(poi))

37
52


In [74]:
# Build the per-player stats frame from the collected tables.
# NOTE(review): '1999' and 55 look like placeholder season/gameweek values - confirm.
df222 = create_players_stats_df(stats_scraper(poi), '1999', 55)

In [86]:
# df222.head()
# Called with the default pos='all'.
select_position(driver)

In [324]:
# Load previously saved stats from stats.csv (path is relative to the
# notebook's working directory).
df = pd.read_csv('stats.csv')

In [326]:
# TODO: remove duplicate rows (gk)!


# df[df['pid'] == 340791].sum()

# aaa = df[['pid', 'Season', 'Gameweek']]
# aaa.duplicated()
# Number of distinct player ids in df.
len(df['pid'].unique())

263

### Get players info

In [13]:
# Example of a player page url:
# https://www.football.co.il/en/player/128010
url = 'https://www.football.co.il/en/stats'
# Site root; the scores cell builds its url from it.
base_url = 'https://www.football.co.il'
# A browser is opened here, but every call that would use it in this cell
# is commented out.
driver = webdriver.Chrome()
# driver.get(url)
# html = driver.page_source
# soup = BeautifulSoup(html, features='lxml')
# driver.close()


# pids = [195168]
# player_info_df = create_players_info_df(driver, df['pid'].unique())

In [6]:
# NOTE(review): the assignment of player_info_df in the previous cell is
# commented out, so this display relies on the name already being in the kernel.
player_info_df

Unnamed: 0,Date of birth,Name,Position,Shirt number,Team,pid
0,1998-11-21,Amit Cohen,Defender,24,Hapoel Raanana,430404
1,1998-09-11,Ofri Arad,Defender,15,Maccabi Haifa,403068
2,1985-12-17,Ben Benjamin,Midfielder,19,Hapoel Raanana,402798
3,1998-08-28,Assaf Tzur,GK,22,Hapoel Raanana,378384
4,1996-08-25,Neta Lavi,Midfielder,6,Maccabi Haifa,340791
5,1995-07-05,Dolev Haziza,Forward,8,Maccabi Haifa,336147
6,1990-07-31,Ido Levy,Defender,21,Hapoel Raanana,327390
7,1992-02-13,Yuval Ashkenazi,Midfielder,18,Maccabi Haifa,296664


In [3]:
def get_player_info(driver, pid):
    """Load a player's page and return the player's details.

    :param driver: selenium webdriver used to open the player page.
    :param pid: player id; inserted into the player page url.
    :returns dict with the keys 'pid', 'Name', 'Shirt number', 'Team',
    'Position' and 'Date of birth' (a datetime parsed from dd.mm.yy text),
    or None when the page does not have the expected layout. In that case
    an error line with the player id is printed.
    """

    try:
        driver.get(f'https://www.football.co.il/en/player/{pid}')
        player_html = driver.page_source
        player_soup = BeautifulSoup(player_html, features='lxml')
        p_info = player_soup.select('body > div.player-page > div >'
                                    'div.col-md-8.col-xs-12.player-right-side > '
                                    'div.player-details.col-xs-12')[0].text

        name_number = player_soup.select('.player-page > div > div > div > div', recursive=False)[0].text

        # Both texts are ' | '-separated lists of fields; split each once.
        name_parts = name_number.split(' | ')
        info_parts = p_info.split(' | ')

        p_row = {}
        p_row['pid'] = pid
        p_row['Name'] = name_parts[0]
        p_row['Shirt number'] = name_parts[1]
        p_row['Team'] = info_parts[0].split('Team: ')[1]
        p_row['Position'] = info_parts[1].split('Position: ')[1]
        dob = info_parts[2].split('Date of birth: ')[1]
        p_row['Date of birth'] = datetime.strptime(dob, '%d.%m.%y')

        return p_row
    except (IndexError, ValueError):
        # IndexError: a selector matched nothing or a field is missing.
        # ValueError: the date text is not in dd.mm.yy form. Treat both as
        # "bad page" so one odd player does not abort a whole crawl.
        print(f'error: player id: {pid}')
        return None


def create_players_info_df(driver, p_ids):
    """Collect the details of the given player ids into a DataFrame.

    Ids for which get_player_info returns nothing are left out. A running
    counter is printed after every id.
    """

    rows = []
    for count, pid in enumerate(p_ids, start=1):
        row = get_player_info(driver, pid)
        if row:
            rows.append(row)
        print(f'counter: {count}')    # progress output, marked for removal

    # Map the site's position names to the names used in this project.
    positions = {'defenseman': 'Defender', 'mid-fielder': 'Midfielder',
                 'goalie': 'GK', 'forward': 'Forward'}
    return pd.DataFrame(data=rows).replace(positions)

# pids = [430404, 403068, 402798, 378384, 340791, 336147, 327390, 296664]
# rows = []
# for pid in pids:
#     rows.append(get_player_info(driver, pid))
# p_info_df = pd.DataFrame(data=rows)
# p_info_df

### Get matches info

In [184]:
# Open the scores page and parse its current source.
# Relies on base_url from the players-info cell.
scores_url = f'{base_url}/en/scores'
driver = webdriver.Chrome()
driver.get(scores_url)


# get scores by gameweek
# select_gw = Select(driver.find_element_by_id('selectRound'))
html = driver.page_source
soup = BeautifulSoup(html, features='lxml')

In [112]:
# select_gw = Select(driver.find_element_by_id('selectRound'))
# for gw in select_gw.options:
#     print (gw.text)
# select_gw.select_by_index(3)
# show_all_btn = driver.find_element_by_class_name('scores-see-more-btn-label')
# show_all_btn.click()
# html = driver.page_source
# soup = BeautifulSoup(html, features='lxml')
# # results_table = soup.select('body > div.scores-page > div > div.col-xs-12.games-round-container.league-902.stage-RegularSeason.games-round-4 > div.current-round-details-view.col-xs-12')
# # len(results_table[0].find_all('div'))

# round_elems = soup.select('body > div.scores-page > div > div[class*="col-xs-12 games-round-container league-902"]')
# matches = round_elems[8].select('div.current-round-details-view.col-xs-12 > div[class*="current-round-details-row"]')
# match = matches[2].find_all('label')

# # for detail in match:
#     print(detail.text)
# match[5].text

def get_winner(score, teams):
    """Return the name of the winning team, or 'Draw' for level scores.

    :param score: pair (home, away) of goal counts, given as ints or as
    numeric strings.
    :param teams: pair (home, away) of team names.
    :returns teams[0], teams[1] or the string 'Draw'.
    """

    # Compare as numbers: the scraper hands over strings, and as text
    # '10' > '9' is False.
    home_goals, away_goals = int(score[0]), int(score[1])

    if home_goals > away_goals:
        return teams[0]
    elif away_goals > home_goals:
        return teams[1]
    else:
        return 'Draw'


def get_results(driver):
    """Parse the scores page loaded in the driver into a results DataFrame.

    Every match row whose score text splits into two parts on ' - ' becomes
    one DataFrame row; rows without such a score are skipped.

    :param driver: selenium webdriver with the scores page loaded.
    :returns DataFrame with the columns 'Season', 'Gameweek', 'Date', 'Day',
    'Game time', 'Home team', 'Away team', 'Home team score',
    'Away team score', 'Winner' and 'Stadium'. It has these columns and no
    rows when no scored match is found.
    """

    columns = ['Season', 'Gameweek', 'Date', 'Day', 'Game time', 'Home team',
               'Away team', 'Home team score', 'Away team score', 'Winner',
               'Stadium']

    html = driver.page_source
    soup = BeautifulSoup(html, features='lxml')
    gameweeks_elems = soup.select('body > div.scores-page > div > div[class*="col-xs-12 games-round-container league-902"]')
    rows = []
    season = '19/20'  # currently const. Not available for other seasons.

    for gw_elem in gameweeks_elems:
        matches_elems = gw_elem.select('div.current-round-details-view.col-xs-12 > div[class*="current-round-details-row"]')
        for match_elem in matches_elems:
            row = dict()
            row['Season'] = season
            # The gameweek number is the third '-'-separated part of the
            # match row's 7th class token.
            row['Gameweek'] = int(match_elem['class'][6].split('-')[2])
            match_info = match_elem.find_all('label')
            row['Date'] = datetime.strptime(match_info[0].text, '%d.%m.%y')
            row['Day'] = match_info[1].text
            row['Game time'] = match_info[2].text
            teams = match_info[3].text.split(' - ')
            row['Home team'] = teams[0]
            row['Away team'] = teams[1]
            score = match_info[4].text.strip().split(' - ')
            if len(score) < 2:
                # No 'x - y' score in this row: leave the match out.
                continue
            home_goals, away_goals = int(score[0]), int(score[1])
            row['Home team score'] = home_goals
            row['Away team score'] = away_goals
            # Hand over ints so the winner is decided numerically; as text
            # '10' would compare lower than '9'.
            row['Winner'] = get_winner((home_goals, away_goals), teams)
            row['Stadium'] = match_info[5].text
            rows.append(row)

    # A fixed column list (same order as the row keys) keeps this working
    # when rows is empty; rows[0] used to raise IndexError in that case.
    results_df = pd.DataFrame(data=rows, columns=columns)

    return results_df


### Get teams data

In [8]:
def get_team_score(team):
    """Extract the name and the score of a team list element.

    The name is the text of the second child; the score is the text of the
    third child converted to float.
    """

    children = team.contents
    name = children[1].text
    return name, float(children[2].text)

In [23]:
# Open the stats page in a new browser window; the following cells read
# this driver's page source.
url = 'https://www.football.co.il/en/stats'
driver = webdriver.Chrome()
driver.get(url)

# def select_position(driver, pos):
#     """Filter stats on web page by position."""

def select_item_type(driver, item_type):
    """Switch the stats page between item types via its first dropdown.

    Opens the first dropdown of the stats widget, types item_type into the
    search input and confirms with ENTER.

    :param driver: webdriver object
    :param item_type: str. 'player' or 'team'
    :return: None
    """
    # find_element(by, value) replaces the find_element_by_xpath shortcut,
    # which no longer exists in current Selenium releases.
    driver.find_element(
        'xpath',
        '//*[@id="stats-page-widget-react"]/div/ul/li[1]/span').click()
    type_selector = driver.find_element(
        'xpath', '/html/body/span/span/span[1]/input')
    type_selector.send_keys(item_type)
    type_selector.send_keys(Keys.ENTER)

In [28]:
# Parse the page currently shown by the driver and print the scraped stats.
html = driver.page_source
soup = BeautifulSoup(html, features='lxml')
# Same poi as of players stats
poi = soup.select('#stats-page-widget-react > div > div', recursive=False)
# The tables are scraped with the player parser here (item_type='player').
print(stats_scraper(poi, item_type='player'))
# poi

{'Minutes': {334794: 498, 18846: 498, 14433: 498, 12912: 498, 394101: 497}, 'Goals': {11253: 4, 387378: 3, 131676: 3, 67815: 3, 46305: 3}, 'Assists': {411501: 3, 331158: 3, 59385: 3, 331212: 2, 197070: 2}, 'Attempt on Goal': {217587: 18, 296664: 16, 46305: 16, 67815: 15, 45903: 15}, 'Key Pass': {336147: 12, 67815: 12, 331158: 11, 411501: 8, 178263: 8}, 'Accurate Key Passes': {331158: 8, 67815: 8, 336147: 6, 444615: 4, 411501: 4}, 'On Target': {178263: 10, 11253: 9, 67815: 8, 296664: 7, 217587: 7}, 'Attempts inside the Box': {67815: 14, 11253: 13, 45903: 12, 455373: 11, 18846: 11}}


In [7]:
# get_team_score

# First 'li' of the first stat table in poi.
team = poi[0].find_all('li')[0]
# teams = poi.find_all('li')
# Same expression get_team_score uses for the team name.
team.contents[1].text

### Check saved CSVs

In [105]:
# Show every column when displaying frames (global pandas option).
pd.options.display.max_columns = None
# Load the saved per-gameweek stats (paths are relative to the notebook's
# working directory).
p_df = pd.read_csv('players_stats_by_gw.csv')
t_df = pd.read_csv('teams_stats_by_gw.csv')
# p_df.groupby(by='pid').sum().sort_values('Goals', ascending=False)[['Minutes', 'Goals']]

In [110]:
# First rows of the teams stats frame.
t_df.head()
# p_df.groupby(by='pid').sum().sort_values(by='Sub Out', ascending=False).head(8)
# p_df.info()

Unnamed: 0,Team,Goal,Ball Possession,Attempt on Goal,On Target,Attempts inside the Box,Attempts outside the Box,Penalty Goal,Corner,Cross,Passes,Key Pass,Accurate Key Passes,Accurate Passes,Attacking Passes,Air Challenge,Won Air Challenge,Ground Challenges,Won Ground Challenges,Dribbles,Successful Dribbles,Successful Tackles,Steals,Ball Recoveries,Ball Recoveries in Opponents Half,Ball Recoveries in Own Half,Blocked Attempts on Goal,Lost Ball,Lost Ball own half,Penalty Miss,Yellow Card,Red Card,Foul,Offside,Opponent Fouls,Assists,Left Flank Attacks,Right Flank Attacks,Center Flank Attacks,Left Flank Attacks With Shot,Right Flank Attacks With Shot,Center Flank Attacks With Shot,Season,Gameweek
0,Maccabi Tel Aviv,2.0,68.21,17.0,6.0,6.0,11.0,1.0,5.0,10.0,729.0,7.0,1.0,668.0,405.0,23.0,13.0,65.0,35.0,21.0,11.0,15.0,0,48.0,12.0,36.0,4.0,55.0,7.0,0.0,2.0,0.0,14.0,1.0,9.0,0.0,25.0,33.0,23.0,7.0,2.0,4.0,19/20,1
1,Bnei Yehuda,2.0,65.76,23.0,10.0,15.0,8.0,0.0,16.0,44.0,594.0,11.0,6.0,501.0,419.0,35.0,16.0,91.0,46.0,26.0,12.0,25.0,0,52.0,10.0,42.0,1.0,62.0,10.0,0.0,2.0,0.0,7.0,0.0,4.0,1.0,38.0,45.0,9.0,9.0,6.0,3.0,19/20,1
2,Maccabi Haifa,4.0,65.63,27.0,12.0,18.0,9.0,0.0,12.0,29.0,629.0,15.0,9.0,564.0,453.0,26.0,14.0,83.0,41.0,22.0,13.0,25.0,0,46.0,12.0,34.0,2.0,50.0,2.0,0.0,5.0,0.0,11.0,3.0,10.0,4.0,28.0,37.0,25.0,5.0,10.0,3.0,19/20,1
3,Maccabi Netanya,0.0,57.32,9.0,3.0,4.0,5.0,0.0,4.0,7.0,462.0,4.0,0.0,395.0,339.0,47.0,25.0,132.0,66.0,25.0,10.0,38.0,0,47.0,6.0,41.0,2.0,73.0,14.0,0.0,2.0,1.0,15.0,3.0,12.0,0.0,32.0,28.0,19.0,4.0,1.0,0.0,19/20,1
4,Hapoel Kfar Saba,0.0,56.64,10.0,3.0,2.0,8.0,0.0,3.0,12.0,447.0,4.0,2.0,372.0,337.0,40.0,19.0,115.0,57.0,30.0,16.0,37.0,0,57.0,10.0,47.0,4.0,75.0,15.0,0.0,4.0,0.0,12.0,1.0,13.0,0.0,34.0,28.0,20.0,4.0,2.0,1.0,19/20,1


In [10]:
# Scratch check: a map object is lazy and is consumed by iterating over it.
m = map(lambda n: n + 2, range(3))
list(m)

[2, 3, 4]