In [1]:
from selenium.webdriver.common.by import By
from selenium import webdriver as wb
from tqdm import tqdm
import pandas as pd
import unicodedata

import time
import os
import re

In [2]:
def normalize_text(text, lower=True, kebab_case=True, snake_case=False):
    """Turn arbitrary text into an ASCII slug.

    Args:
        text: String to normalize.
        lower: When truthy, lowercase the result.
        kebab_case: When truthy (and ``snake_case`` is falsy), replace every
            '/' and ' ' with '-'.
        snake_case: When truthy, replace every '/' and ' ' with '_'; this
            wins over ``kebab_case`` if both are set.

    Returns:
        The normalized string, containing only ASCII letters, digits,
        '_' and '-'.
    """
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then drops everything that is not plain ASCII.
    decomposed = unicodedata.normalize('NFKD', text)
    slug = decomposed.encode('ascii', 'ignore').decode('utf-8')

    if lower:
        slug = slug.lower()

    # Pick the separator once; None means "leave '/' and ' ' for the regex".
    separator = '_' if snake_case else ('-' if kebab_case else None)
    if separator is not None:
        slug = slug.translate(str.maketrans('/ ', separator * 2))

    # Strip whatever is still outside the allowed character set.
    return re.sub(r'[^a-zA-Z0-9_-]', '', slug)

In [3]:
# Landing page of the sportsbook; sport and region paths are appended to it.
BASE_URL = 'https://rojabet.com/es-es/deportes'

# Chrome command-line switches (without the leading "--").
# The candidate list starts with 'headless'; slicing it off leaves only the
# remaining three switches in WB_ARGUMENTS.
WB_ARGUMENTS = ['headless', 'disable-gpu', 'no-sandbox', 'disable-dev-shm-usage'][1:]

# The project root is the parent of the current working directory, and the
# chromedriver binary is expected at <root>/chromedriver/chromedriver.
ROOT_PATH = os.path.normpath(os.path.join(os.getcwd(), os.pardir))
CHROME_PATH = os.path.join(ROOT_PATH, 'chromedriver', 'chromedriver')

In [None]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

# Start Chrome with the configured switches, each prefixed with "--".
options = wb.ChromeOptions()
for arg in WB_ARGUMENTS:
    options.add_argument(f'--{arg}')

# NOTE(review): nothing visible in this part of the notebook calls
# browser.quit(); close the browser manually once scraping is finished.
browser = wb.Chrome(service=wb.chrome.service.Service(CHROME_PATH), options=options)
browser.set_window_size(1920, 1080)
browser.get(BASE_URL)

# Every explicit wait below gives up after 10 seconds.
wait = WebDriverWait(browser, 10)

# One dict per scraped event; a later cell builds a DataFrame from this list.
events_data = list()


def _find_region_menu(sport):
    """Open the page of `sport` and return the third sidebar list found.

    Propagates whatever the Selenium wait raises on timeout, and IndexError
    when fewer than three matching lists are present.
    """
    browser.get(f'{BASE_URL}/{sport}')
    return wait.until(
        ec.presence_of_all_elements_located((
            By.XPATH, "//aside[@id='sports-menu']//ul[1]"
        ))
    )[2]


# The second <ul> of the sidebar is treated as the sport list; its first two
# links are skipped and each href is reduced to its path below BASE_URL.
sports = wait.until(
    ec.presence_of_all_elements_located((By.XPATH, '//aside[@id="sports-menu"]//ul'))
)[1]
sports = sports.find_elements(By.XPATH, './/li/a')[2:]
sports = [sport.get_attribute('href').removeprefix(BASE_URL).strip('/') for sport in sports]

print(f'Found {len(sports)} sports')
print(sports)

# The first remaining sport is skipped as well (sports[1:]).
for sport in sports[1:]:

    try:
        try:
            regions = _find_region_menu(sport)
        except Exception:
            # Single retry: go back to the landing page, pause, then reload
            # the sport page. A second failure is handled by the outer except.
            browser.get(BASE_URL)
            time.sleep(10)
            regions = _find_region_menu(sport)

        regions = regions.find_elements(By.XPATH, './/li/a')
        regions = [
            region.get_attribute('href').removeprefix(BASE_URL).strip('/') for region in regions
        ]
        # Keep only the part of each path that follows the sport segment.
        regions = [url.removeprefix(sport).strip('/') for url in regions]

        print(f'# Sport: {sport} - Found {len(regions)} regions')
        print(regions)

        for region in tqdm(regions, desc='Regions', total=len(regions)):

            browser.get(f'{BASE_URL}/{sport}/{region}')

            try:
                events = wait.until(
                    ec.presence_of_element_located((
                        By.XPATH, "//*[contains(@class, 'Sportsbook_cuoponList__dataLoader')]"
                    ))
                )
                events = events.find_elements(
                    By.XPATH, ".//div[contains(@class, 'Sportsbook_sportsbook__event__')]"
                )
            except Exception:
                print(f'No events found for {sport}/{region}')
                continue

            for event in events:
                try:
                    # Must not be named `time`: that would rebind the `time`
                    # module and break the time.sleep() call in the retry above.
                    event_time = event.find_element(
                        By.XPATH,
                        ".//span[contains(@class, 'Sportsbook_sportsbook__eventDescriptionTime__')]"
                    ).text
                    teams = event.find_element(
                        By.XPATH,
                        ".//span[contains(@class, 'Sportsbook_sportsbook__eventDescriptionTeams_')]"
                    ).text
                    # Only the first market element of the event is used.
                    market_id = event.find_elements(
                        By.XPATH, ".//div[contains(@class, 'Market_market')]"
                    )[0].get_attribute('id')
                    clean_market_id = market_id.removeprefix('root_market__').split('_')[0]

                    events_data.append({
                        'sport': sport,
                        'region': region,
                        'time': event_time,
                        'teams': teams,
                        'market_id': market_id,
                        'clean_market_id': clean_market_id
                    })
                except Exception:
                    # Best effort: an event with a missing field is skipped.
                    print('Error extracting event data')
                    continue
    except Exception:
        # Anything that escapes the region loop abandons the current sport.
        print(f'# Error extracting region data for {sport}')
        continue

In [18]:
# Build the events table from the scraped records.
events_df = pd.DataFrame(events_data)

# The numeric event id is the last '_'-separated piece of the market id.
# Bracket assignment is required here: `events_df.new_name = ...` only sets a
# Python attribute (pandas warns about it) and never adds a column.
# The column name 'clean_market_idea' (sic) is kept because a later cell reads
# `events_df.clean_market_idea`.
events_df['clean_market_idea'] = (
    events_df['market_id'].str.split('_').str[-1].astype(int)
)
events_df

Unnamed: 0,sport,region,time,teams,market_id,clean_market_idea
0,tenis,australia,02:40,Charles Broom/Anirudh Chandrasekar - Christian...,root_market__3332458,3332458
1,tenis,australia,03:00,T Andrianjafitrimo / M Helgo - Kaylah Mcphee /...,root_market__3332433,3332433
2,tenis,australia,21:00,James McCabe - Omar Jasika,root_market__3332024,3332024
3,tenis,australia,21:00,Bernard Tomic - Adam Walton,root_market__3332130,3332130
4,tenis,estados-unidos,12:00,Matias Franco Descotte - Louis Wessels,root_market__3333589,3333589
...,...,...,...,...,...,...
218,baloncesto,lituania,14:00,Vilniaus Kibirkstis-MRU Women - BC Siauliai Women,root_market__3332244,3332244
219,baloncesto,lituania,12:00,BC Siauliai - Rytas Vilnius,root_market__3333035,3333035
220,baloncesto,serbia,14:30,ZKK Novosadska ZKA Women - ZKK Proleter 023 Women,root_market__3332633,3332633
221,baloncesto,serbia,16:00,ZKK Sloga Women - Spartak Subotica Women,root_market__3333303,3333303


In [33]:
# Browser-like request headers sent with every API call.
# NOTE(review): "authority", "method" and "scheme" look like HTTP/2
# pseudo-headers copied from browser dev tools; here they are sent as ordinary
# headers - confirm they are needed.
# NOTE(review): decoding "br"/"zstd" responses depends on optional packages
# being installed alongside requests - confirm for this environment.
headers = {
    "authority": "rojabet.com",
    "method": "GET",
    "scheme": "https",
    "accept": "application/json, text/plain, */*",
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "en-US,en;q=0.9,es;q=0.8",
    # No cookie is sent. A pasted browser cookie containing an auth token used
    # to sit here as a comment and was removed; treat that token as exposed.
    # If a cookie is ever required, read it from an environment variable.
    # No "if-none-match" either: a conditional GET can be answered with
    # "304 Not Modified" and an empty body, which cannot be parsed as JSON.
    "priority": "u=1, i",
    # Template; the real event id is substituted for every request.
    "referer": "https://rojabet.com/es-es/deportes/futbol/chile/chile-copa/{event_id}",
    "sec-ch-ua": '"Not A(Brand";v="8", "Chromium";v="132", "Google Chrome";v="132"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Linux"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
    "x-osg-language": "es-ES"
}

import requests
import time
from tqdm import tqdm

api_event_url = "https://rojabet.com/api/coupons?eventId={event_id}"

REQUEST_TIMEOUT = 30  # seconds before a single request is abandoned
RETRY_DELAY = 10      # seconds to pause before the second attempt
MAX_ATTEMPTS = 2      # first try plus one retry

# event id -> decoded JSON payload
json_data = dict()
# ids for which every attempt failed
failed_event_ids = []


def _fetch_event(session, event_id):
    """Request the coupon payload of one event and return the decoded JSON.

    Raises requests.RequestException on network or HTTP errors and ValueError
    when the body is not valid JSON.
    """
    response = session.get(
        api_event_url.format(event_id=event_id),
        # Overrides the session-level template with the concrete referer.
        headers={"referer": headers["referer"].format(event_id=event_id)},
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.json()


# One session for all requests so the connection is reused.
with requests.Session() as session:
    session.headers.update(headers)

    for event_id in tqdm(events_df.clean_market_idea, desc="Events", total=len(events_df)):
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                json_data[event_id] = _fetch_event(session, event_id)
                break
            except (requests.RequestException, ValueError):
                if attempt < MAX_ATTEMPTS:
                    time.sleep(RETRY_DELAY)
                else:
                    # Record the failure and move on instead of aborting the
                    # whole download because of one bad event.
                    failed_event_ids.append(event_id)

if failed_event_ids:
    print(f'Failed to fetch {len(failed_event_ids)} events: {failed_event_ids}')

Events: 100%|██████████| 223/223 [02:38<00:00,  1.41it/s]


In [36]:
# Spot-check the stored API payload for event id 3332458.
json_data[3332458]

{'events': [], 'markets': [], 'errors': []}

In [37]:
# Spot-check the stored API payload for event id 3332433.
json_data[3332433]

{'events': [], 'markets': [], 'errors': []}

In [46]:
# Count the stored payloads whose 'events' list has no entries.
empty = sum(1 for payload in json_data.values() if len(payload['events']) == 0)
print(f'Empty events: {empty}')

Empty events: 2


In [34]:
# List every event id that has a stored payload.
json_data.keys()

dict_keys([3332458, 3332433, 3332024, 3332130, 3333589, 3333635, 3333637, 3331972, 3333654, 3333665, 3333707, 3333708, 3333590, 3333639, 3333650, 3331981, 3333655, 3333667, 3333710, 3333643, 3333648, 3331975, 3333641, 3331979, 3333134, 3333660, 3286034, 3286045, 3331342, 3333022, 3333027, 3333596, 3333044, 3333048, 3333040, 3333604, 3331558, 3331568, 3331765, 3331570, 3333644, 3333645, 3333649, 3332158, 3326392, 3331589, 3332486, 3326391, 3332692, 3331401, 3330974, 3333101, 3331361, 3330649, 3331528, 3330973, 3333099, 3333132, 3333143, 3333289, 3331346, 3332686, 3333058, 3332687, 3331050, 3331056, 3332338, 3332540, 3333059, 3333153, 3331343, 3332869, 3332885, 3333050, 3333119, 3332862, 3332863, 3333116, 3333542, 3333611, 3331546, 3331547, 3333063, 3333150, 3333130, 3073395, 3093159, 3332544, 3332646, 3332547, 3332561, 3332602, 3332603, 3332606, 3332601, 3332604, 3333030, 3331515, 3332627, 3332631, 3331540, 3331539, 3332937, 3332938, 3332939, 3332940, 3333140, 3333148, 3333091, 3332923,

In [32]:
# Display the events table again for reference.
events_df

Unnamed: 0,sport,region,time,teams,market_id,clean_market_idea
0,tenis,australia,02:40,Charles Broom/Anirudh Chandrasekar - Christian...,root_market__3332458,3332458
1,tenis,australia,03:00,T Andrianjafitrimo / M Helgo - Kaylah Mcphee /...,root_market__3332433,3332433
2,tenis,australia,21:00,James McCabe - Omar Jasika,root_market__3332024,3332024
3,tenis,australia,21:00,Bernard Tomic - Adam Walton,root_market__3332130,3332130
4,tenis,estados-unidos,12:00,Matias Franco Descotte - Louis Wessels,root_market__3333589,3333589
...,...,...,...,...,...,...
218,baloncesto,lituania,14:00,Vilniaus Kibirkstis-MRU Women - BC Siauliai Women,root_market__3332244,3332244
219,baloncesto,lituania,12:00,BC Siauliai - Rytas Vilnius,root_market__3333035,3333035
220,baloncesto,serbia,14:30,ZKK Novosadska ZKA Women - ZKK Proleter 023 Women,root_market__3332633,3332633
221,baloncesto,serbia,16:00,ZKK Sloga Women - Spartak Subotica Women,root_market__3333303,3333303
