In [None]:
## footystats

In [1]:
import requests

import brotli
from bs4 import BeautifulSoup

from datetime import datetime, timedelta

In [194]:
import random

def generate_random_headers():
    browsers = [
        "Chrome",
        "Firefox",
        "Safari",
        "Edge",
        "Opera"
    ]

    os_systems = [
        "Windows NT 10.0; Win64; x64",
        "Macintosh; Intel Mac OS X 10_15_7",
        "X11; Linux x86_64"
    ]

    def random_version():
        return f"{random.randint(60, 99)}.0.{random.randint(3000, 4999)}.{random.randint(50, 150)}"

    user_agents = [
        f"Mozilla/5.0 ({os_choice}) AppleWebKit/537.36 (KHTML, like Gecko) {browser_choice}/{random_version()} Safari/537.36"
        if browser_choice != "Safari" else
        f"Mozilla/5.0 ({os_choice}) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
        for os_choice in os_systems
        for browser_choice in browsers
    ]

    accept_languages = [
        'en-US,en;q=0.4',
        'en-US,en;q=0.6',
        'en-US,en;q=0.8',
        'en-GB,en;q=0.5'
    ]

    headers = {
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': random.choice(accept_languages),
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }

    return headers

In [226]:
def get_url(url):
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    #     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    #     'Accept-Language': 'en-US,en;q=0.5',
    #     'Accept-Encoding': 'gzip, deflate, br',
    #     'Connection': 'keep-alive',
    #     'Upgrade-Insecure-Requests': '1'
    # }
    headers = generate_random_headers()
    return requests.get(url, headers=headers)

def get_soup(response):
    try:
        soup = BeautifulSoup(response.content)
    except:
        decompressed_data = brotli.decompress(response.content)
        html_content = decompressed_data.decode('utf-8')    
        soup = BeautifulSoup(html_content)
    return soup

def get_page_soup(url):
    response = get_url(url)
    return get_soup(response)

In [146]:
fixtures_soup = get_page_soup("https://footystats.org/poland/1-liga/fixtures")

In [152]:
match_tables = fixtures_soup.find_all('div', {'class': 'league-matches'})

In [156]:
match_links = []
upcoming = []
for match_table in match_tables:
    h2h_links = match_table.find_all('a', {'class': 'h2h-link'})
    for h2h_link in h2h_links:
        match_link = h2h_link.get('href')
        if len(match_link.split('#')) == 1:
            upcoming.append(match_link)
        else:
            match_links.append(match_link)

In [49]:
def transform_to_number(string, return_type=int):
    try:
        return return_type(string.replace('%', '').strip())
    except ValueError:
        return string

In [161]:
prefix = "https://footystats.org"

In [104]:
url = "https://footystats.org/poland/gks-gornik-leczna-vs-zks-stal-rzeszow-h2h-stats#7457242"

In [186]:
soup = get_page_soup(url)

In [224]:
def scrape_match_details(soup, match_id):

    stats_home = {}
    stats_away = {}

    str_datetime = soup.find('time').get('datetime')
    match_date = datetime.strptime(str_datetime, '%Y-%m-%dT%H:%M')
    match_date = match_date + timedelta(hours=1)

    table = soup.find('table', {'class': 'comparison-table-table'})
    header = table.find('tr', {'class': 'row header'})
    teams = header.find_all('th', {'class': 'item stat'})
    home_team_id = teams[0].find('a').get('href').split('-')[-1]
    away_team_id = teams[1].find('a').get('href').split('-')[-1]

    stats_home['id'] = f"{match_id}-{home_team_id}"
    stats_home['match'] = match_id
    stats_home['team'] = home_team_id
    stats_home['opponent'] = away_team_id
    stats_home['home'] = True
    stats_home['date'] = match_date

    stats_away['id'] = f"{match_id}-{away_team_id}"
    stats_away['match'] = match_id
    stats_away['team'] = away_team_id
    stats_away['opponent'] = home_team_id
    stats_away['home'] = False
    stats_away['date'] = match_date

    for row in table.find_all('tr', {'class': 'row'}, recursive=False):
        key_item = row.find('td', {'class': 'item key'})
        key_stat = key_item.get_text().replace(' ', '')
        
        stats = row.select('td.item.stat')

        return_type = int
        if key_stat.strip() == 'xG':
            return_type = float

        stat_home = transform_to_number(stats[0].get_text(), return_type=return_type)
        stat_away = transform_to_number(stats[1].get_text(), return_type=return_type)
        stat = stat_home + stat_away

        if key_stat != 'Possession':
            stats_home[key_stat] = stat
            stats_away[key_stat] = stat

        stats_home[f"{str(key_stat)}_for"] = stat_home
        stats_home[f"{str(key_stat)}_against"] = stat_away

        stats_away[f"{str(key_stat)}_for"] = stat_away
        stats_away[f"{str(key_stat)}_against"] = stat_home

    score_div = soup.find('div', {'class': 'pa1e w50 rw100 fl'})
    score_rows = score_div.find_all('p', recursive=False)
    score_ft = score_rows[0].get_text()
    score_ht = score_rows[2].get_text()

    goals_home = int(score_ft.split()[0])
    goals_away = int(score_ft.split()[2])
    goals = goals_home + goals_away

    goals_home_1h = int(score_ht.split()[0][1:])
    goals_away_1h = int(score_ht.split()[2][:-1])
    goals_1h = goals_home_1h + goals_away_1h

    goals_home_2h = goals_home - goals_home_1h
    goals_away_2h = goals_away - goals_away_1h
    goals_2h = goals_home_2h + goals_away_2h

    # HOME

    stats_home[f"Goals"] = goals
    stats_home[f"Goals_for"] = goals_home
    stats_home[f"Goals_against"] = goals_away

    stats_home[f"Goals1H"] = goals_1h
    stats_home[f"Goals1H_for"] = goals_home_1h
    stats_home[f"Goals1H_against"] = goals_away_1h

    stats_home[f"Goals2H"] = goals_2h
    stats_home[f"Goals2H_for"] = goals_home_2h
    stats_home[f"Goals2H_against"] = goals_away_2h

    # AWAY

    stats_away[f"Goals"] = goals
    stats_away[f"Goals_for"] = goals_away
    stats_away[f"Goals_against"] = goals_home

    stats_away[f"Goals1H"] = goals_1h
    stats_away[f"Goals1H_for"] = goals_away_1h
    stats_away[f"Goals1H_against"] = goals_home_1h

    stats_away[f"Goals2H"] = goals_away
    stats_away[f"Goals2H_for"] = goals_away_2h
    stats_away[f"Goals2H_against"] = goals_home_2h

    return stats_home, stats_away

In [166]:
from tqdm import tqdm

In [171]:
link_soup.find('div', {'class': 'pa1e w50 rw100 fl'})

In [181]:
import chromedriver_binary

In [218]:
def is_loading(soup):
    title = soup.find('title').get_text().strip()
    if title == "Just a moment...":
        return True
    return False

In [None]:
scraped_ids = []

In [227]:
import time
from tqdm import tqdm

data = []
for link in tqdm(match_links):
    # get link
    link = prefix + link
    if link == "https://footystats.org/poland/mks-miedz-legnica-vs-wisla-krakow-h2h-stats#7457137":
        continue
    # check if already scraped
    match_id = link.split('#')[-1]
    if match_id in set(scraped_ids):
        continue
    
    tries = 0
    while tries < 3:
        tries += 1
        try:    
            response = get_url(link)
            link_soup = get_soup(response)
            if is_loading(link_soup):
                print('loading page, continuing')
                time.sleep(2) 
                continue
        except Exception as e:
            print(link)
            print('could not get valid url', e)
            continue
        # try:
        #     home, away = scrape_match_details(link_soup, link)
        #     data.append(home)
        #     data.append(away)
        # except Exception as e:
        #     print(link)
        #     print('could not get soup', e)
        # break
        home, away = scrape_match_details(link_soup, link)
        data.append(home)
        data.append(away)
        break

 11%|█         | 19/171 [00:00<00:05, 27.91it/s]


AttributeError: 'NoneType' object has no attribute 'find_all'

In [265]:
from requests_html import AsyncHTMLSession
import asyncio

async def fetch_page(url):
    session = AsyncHTMLSession()
    r = await session.get(url, headers=generate_random_headers())
    await r.html.arender(sleep=2, wait=2)  # Asynchronous rendering
    html_content = r.html.html  # Extract the rendered HTML
    await session.close()
    return html_content

# Wrapper function for processing
def get_html(url):
    return asyncio.run(fetch_page(url))

# Example usage
url = "https://footystats.org/poland/mks-miedz-legnica-vs-wisla-krakow-h2h-stats#7457137"
html = get_html(url)

In [267]:
BeautifulSoup(html).find('title-bar')

In [261]:
scrape_match_details(BeautifulSoup(html), 123)

AttributeError: 'NoneType' object has no attribute 'get'

In [249]:
import nest_asyncio

nest_asyncio.apply()

session = HTMLSession()
r = session.get("https://footystats.org/poland/mks-miedz-legnica-vs-wisla-krakow-h2h-stats#7457137")

html_str = r.html.render(wait=2, sleep=3)

RuntimeError: Cannot use HTMLSession within an existing event loop. Use AsyncHTMLSession instead.

In [246]:
soup = BeautifulSoup(html_str)

In [229]:
with open('file.html', 'wb') as f:
    f.write(response.content)

In [228]:
link_soup

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><link href="https://footystats.org/poland/mks-miedz-legnica-vs-wisla-plock-sa-h2h-stats" rel="canonical"/><link href="https://footystats.org/poland/mks-miedz-legnica-vs-wisla-plock-sa-h2h-stats" hreflang="x-default" rel="alternate"/>
<link href="https://footystats.org/jp/poland/mks-miedz-legnica-vs-wisla-plock-sa-h2h-stats" hreflang="ja-jp" rel="alternate"/>
<link href="https://footystats.org/tr/poland/mks-miedz-legnica-vs-wisla-plock-sa-h2h-stats" hreflang="tr" rel="alternate"/>
<link href="https://footystats.org/pt/poland/mks-miedz-legnica-vs-wisla-plock-sa-h2h-stats" hreflang="pt" rel="alternate"/>
<link href="https://footystats.org/kr/poland/mks-miedz-legnica-vs-wisla-plock-sa-h2h-stats" hreflang="ko" rel="alternate"/>
<link href="https://footystats.org/ru/poland/mks-miedz-legnica-vs-wisla-plock-sa-h2h-stats" hreflang="ru" rel="alternate"/>
<link href="https://footystats.org/es/poland/mks-miedz-legnica-vs-wisla-plock-sa-h

In [212]:
data = []
for link in tqdm(match_links):
    link = prefix + link
    if link == "https://footystats.org/poland/mks-miedz-legnica-vs-wisla-krakow-h2h-stats#7457137":
        continue
    try:
        response = get_url(link)
        link_soup = get_soup(response)
        if is_loading(link_soup):
            time.sleep(2)
            # rerun the iteration
    except Exception as e:
        print(link)
        print('could not get soup', e)
        continue
    
    home, away = scrape_match_details(link_soup, link)
    data.append(home)
    data.append(away)

 11%|█         | 18/171 [00:06<00:58,  2.63it/s]


AttributeError: 'NoneType' object has no attribute 'get'

In [217]:
link_soup

<!DOCTYPE html>
<html lang="en-US"><head><title>Just a moment...</title><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><meta content="IE=Edge" http-equiv="X-UA-Compatible"/><meta content="noindex,nofollow" name="robots"/><meta content="width=device-width,initial-scale=1" name="viewport"/><style>*{box-sizing:border-box;margin:0;padding:0}html{line-height:1.15;-webkit-text-size-adjust:100%;color:#313131;font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,Noto Sans,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol,Noto Color Emoji}body{display:flex;flex-direction:column;height:100vh;min-height:100vh}.main-content{margin:8rem auto;max-width:60rem;padding-left:1.5rem}@media (width <= 720px){.main-content{margin-top:4rem}}.h2{font-size:1.5rem;font-weight:500;line-height:2.25rem}@media (width <= 720px){.h2{font-size:1.25rem;line-height:1.5rem}}#challenge-error-text{background-image:url(data:image/svg+xml;base64,PHN2ZyB4bWx

In [215]:
scraped_ids = [statss['match'] for statss in data]

In [223]:
set(scraped_ids)

{'7457230',
 '7457231',
 '7457232',
 '7457233',
 '7457234',
 '7457235',
 '7457236',
 '7457237',
 '7457239',
 '7457240',
 '7457241',
 '7457242',
 '7457243',
 '7457244',
 '7457245',
 '7457246',
 '7457247'}

Possession, Shots, ShotsOnTarget, ShotsOffTarget, Cards, Corners, Fouls, Offsides, xG, Goals, Goals1H, Goals2H

In [126]:
stats_away

{'id': '7457242-796',
 'match': '7457242',
 'team': '796',
 'opponent': '8244',
 'home': False,
 'date': datetime.datetime(2024, 12, 7, 19, 35),
 'Possession_for': 42,
 'Possession_against': 58,
 'Shots': 29,
 'Shots_for': 12,
 'Shots_against': 17,
 'ShotsOnTarget': 13,
 'ShotsOnTarget_for': 6,
 'ShotsOnTarget_against': 7,
 'ShotsOffTarget': 8,
 'ShotsOffTarget_for': 4,
 'ShotsOffTarget_against': 4,
 'Cards': 0,
 'Cards_for': 0,
 'Cards_against': 0,
 'Corners': 15,
 'Corners_for': 2,
 'Corners_against': 13,
 'Fouls': 25,
 'Fouls_for': 12,
 'Fouls_against': 13,
 'Offsides': 3,
 'Offsides_for': 2,
 'Offsides_against': 1,
 'xG': 2.55,
 'xG_for': 1.11,
 'xG_against': 1.44,
 'Goals': 3,
 'Goals_for': 3,
 'Goals_against': 0,
 'Goals1H': 1,
 'Goals1H_for': 1,
 'Goals1H_against': 0,
 'Goals2H': 3,
 'Goals2H_for': 2,
 'Goals2H_against': 0}

In [93]:
stats_home

{'team': '8244',
 'opponent': '796',
 'home': True,
 'date': datetime.datetime(2024, 12, 7, 19, 35),
 'Possession_for': 58,
 'Possession_against': 42,
 'Shots': 29,
 'Shots_for': 17,
 'Shots_against': 12,
 'ShotsOnTarget': 13,
 'ShotsOnTarget_for': 7,
 'ShotsOnTarget_against': 6,
 'ShotsOffTarget': 8,
 'ShotsOffTarget_for': 4,
 'ShotsOffTarget_against': 4,
 'Cards': 0,
 'Cards_for': 0,
 'Cards_against': 0,
 'Corners': 15,
 'Corners_for': 13,
 'Corners_against': 2,
 'Fouls': 25,
 'Fouls_for': 13,
 'Fouls_against': 12,
 'Offsides': 3,
 'Offsides_for': 1,
 'Offsides_against': 2,
 'xG': 2.55,
 'xG_for': 1.44,
 'xG_against': 1.11,
 'goals': 3,
 'goals_for': 0,
 'goals_against': 3,
 'goals_1h': 1,
 'goals_1h_for': 0,
 'goals_1h_against': 1,
 'goals_2h': 0,
 'goals_2h_for': 0,
 'goals_2h_against': 2}

In [134]:
import pandas as pd

In [140]:
rows = [stats_home, stats_away]

In [141]:
pd.DataFrame(rows)

Unnamed: 0,id,match,team,opponent,home,date,Possession_for,Possession_against,Shots,Shots_for,...,xG_against,Goals,Goals_for,Goals_against,Goals1H,Goals1H_for,Goals1H_against,Goals2H,Goals2H_for,Goals2H_against
0,7457242-8244,7457242,8244,796,True,2024-12-07 19:35:00,58,42,29,17,...,1.11,3,0,3,1,0,1,0,0,2
1,7457242-796,7457242,796,8244,False,2024-12-07 19:35:00,42,58,29,12,...,1.44,3,3,0,1,1,0,3,2,0


In [None]:
# db

In [127]:
import sqlite3

conn = sqlite3.connect("example.db")
cursor = conn.cursor()

In [128]:
columns = ", ".join([f"{key} TEXT" for key in stats_home.keys()])
create_table_query = f"CREATE TABLE IF NOT EXISTS my_table ({columns})"
cursor.execute(create_table_query)

<sqlite3.Cursor at 0x7f1a58216140>

In [129]:
placeholders = ", ".join(["?" for _ in stats_home.values()])
insert_query = f"INSERT INTO my_table ({', '.join(stats_home.keys())}) VALUES ({placeholders})"
cursor.execute(insert_query, tuple(stats_home.values()))

<sqlite3.Cursor at 0x7f1a58216140>

In [130]:
conn.commit()

In [131]:
conn.close()