# Retrieve the Stats of all the players
using the `player_urls.csv` file

In [1]:
import requests
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup

In [2]:
# Request headers for the plain `requests` calls in this notebook: an Accept
# list for HTML/XML content and a desktop-Chrome User-Agent string.
headers = {}
headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"

In [3]:
# Load the list of players to scrape; a later cell reads the 'name' entry of
# each row to build the player's page URL.
player_urls = pd.read_csv('../data/player_urls.csv')
# NOTE(review): this comment used to say "drop the first column", but no column
# is dropped in this cell — confirm whether a drop is still wanted.
# Bare expression: shows the frame as the cell output when run in a notebook.
player_urls

#### Add overall stats and image to the player_urls dataframe

In [4]:
# Regex for the inline JavaScript assignment `var hand = 'X';` in a page's
# source; group 1 captures the single word character X.
hand_pattern =  r"var hand = \'(\w)\';"

### Get the hand of the player

In [5]:
# Smoke test: download one player page and pull the hand code out of its
# inline JavaScript using `hand_pattern`.
url = "https://www.tennisabstract.com/cgi-bin/player.cgi?p=StefanosTsitsipas"
# requests has no default timeout; without one this cell can block forever if
# the server accepts the connection but never answers.
response = requests.get(url, headers=headers, timeout=30)
match = re.search(hand_pattern, response.text)
if match:
    hand_value = match.group(1)
    print("Hand:", hand_value)
else:
    print("Hand value not found.")

### Get the image of the player

In [6]:
# Smoke test: check that a player's photo URL is reachable.
img = "https://www.tennisabstract.com/photos/stefanos_tsitsipas-sirobi.jpg"
# `headers` must be passed by keyword: the second positional parameter of
# requests.get is `params`, so a positional dict would be sent as a query
# string instead of as HTTP headers. The timeout prevents an indefinite hang.
response = requests.get(img, headers=headers, timeout=30)
# Membership test instead of `== 200 or 304`: the bare `304` operand is always
# truthy, which made the "not found" branch unreachable.
if response.status_code in (200, 304):
    print("Image found")
else:
    print("Image not found")

### Get the stats for all the players

In [7]:
def clean_player_name(name):
    """Return *name* without bracketed parts and without any whitespace.

    Every ``[...]`` group, every ``(...)`` group and every whitespace
    character is deleted, e.g. ``"(1)Novak Djokovic [SRB]"`` becomes
    ``"NovakDjokovic"``.
    """
    noise = r'\[.*?\]|\(.*?\)|\s'
    without_noise = re.sub(noise, '', name)
    return without_noise.strip()

In [8]:

def get_html_of_page1(url):
    """Return the rendered HTML of *url* once its ``matches`` element exists.

    Starts a headless Chrome through the local chromedriver binary, loads the
    page and waits up to 20 seconds for an element with id ``matches`` to be
    present in the DOM before reading the page source.

    Args:
        url: Address of the page to render.

    Returns:
        The page source as a string.

    Raises:
        selenium.common.exceptions.TimeoutException: If the ``matches``
            element does not appear within 20 seconds.
    """
    options = Options()
    # The `options.headless = True` setter is deprecated and was removed in
    # newer Selenium 4 releases, where the assignment silently does nothing.
    # Passing the Chrome flag works on every version.
    options.add_argument('--headless')
    service = ChromeService(executable_path='../../config/chromedriver-win64/chromedriver-win64/chromedriver.exe')
    browser = webdriver.Chrome(service=service, options=options)
    try:
        browser.get(url)
        WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.ID, 'matches'))
        )
        return browser.page_source
    finally:
        # Always shut the browser down, including when the wait times out;
        # otherwise every failed page leaves a Chrome process running.
        browser.quit()

In [9]:
def add_space_before_uppercase(s):
    """Put a space in front of each ASCII capital letter, then trim the ends.

    Turns a squashed name such as ``"StefanosTsitsipas"`` back into
    ``"Stefanos Tsitsipas"``.
    """
    spaced = re.sub(r'([A-Z])', r' \1', s)
    return spaced.strip()

In [10]:
def get_html_of_page(url):
    """Return the rendered HTML of *url* once its ``matches`` element exists.

    Runs a headless Chrome through the local chromedriver binary. The driver
    is used as a context manager, so the browser is shut down when the block
    exits, whether normally or through an exception.

    Args:
        url: Address of the page to render.

    Returns:
        The page source as a string.

    Raises:
        selenium.common.exceptions.TimeoutException: If no element with id
            ``matches`` is present within 20 seconds.
    """
    options = Options()
    # The `options.headless = True` setter is deprecated and was removed in
    # newer Selenium 4 releases, where the assignment silently does nothing.
    # Passing the Chrome flag works on every version.
    options.add_argument('--headless')
    service = ChromeService(executable_path='../../config/chromedriver-win64/chromedriver-win64/chromedriver.exe')
    with webdriver.Chrome(service=service, options=options) as browser:
        browser.get(url)
        # The table is filled in by JavaScript, so wait for it before reading.
        WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, 'matches')))
        return browser.page_source

In [11]:
def parse_match_row(row):
    """Map the ``<td>`` cells of one match-table row to named fields.

    Args:
        row: An element exposing ``find_all('td')``; each returned cell must
            have a ``text`` attribute.

    Returns:
        A dict of stripped cell texts keyed by column label, in table order.
        The cell at index 8 is not used.

    Raises:
        IndexError: If the row has fewer than 17 cells.
    """
    # (label, cell index) pairs; index 8 is intentionally absent.
    column_positions = (
        ('Date', 0),
        ('Tournament', 1),
        ('Surface', 2),
        ('Round in Tournament', 3),
        ('Ranking at that time', 4),
        ('Opponent Ranking at that time', 5),
        ('Result', 6),
        ('Set Scores', 7),
        ('Dominance Ratio', 9),
        ('Ace Ratio', 10),
        ('Double Fault Ratio', 11),
        ('First Serve Percentage', 12),
        ('First Serve Points Won', 13),
        ('Second Serve Points Won', 14),
        ('Break Points Saved', 15),
        ('Time', 16),
    )
    tds = row.find_all('td')
    return {label: tds[position].text.strip() for label, position in column_positions}

In [12]:
def process_player(player):
    """Scrape one player's match table into played and upcoming matches.

    Args:
        player: Row-like object with a ``'name'`` entry (here a row of
            ``player_urls``). Whitespace is removed from the name to build
            the page URL.

    Returns:
        A ``(player_matches, upcoming_matches)`` tuple of lists of dicts.
        Played matches carry every field from ``parse_match_row`` plus
        ``'Winner'``, ``'Loser'`` and ``'player_name'``. Upcoming matches
        carry ``'Date'``, ``'Tournament'``, ``'Surface'``, ``'Results'`` and
        ``'player'``. Both lists are empty when the page has no match table.
    """
    player_name = re.sub(r'\s', '', player['name'])
    print("Player Name:", player_name)
    # Same for every row of this player, so compute it once outside the loop.
    full_name = add_space_before_uppercase(player_name)
    player_matches = []
    upcoming_matches = []

    url = MATCHES_URL.format(player_name)
    player_html = get_html_of_page(url)
    soup = BeautifulSoup(player_html, 'html.parser')
    table = soup.find('table', {'id': 'matches'})
    tbody = table.find('tbody') if table else None
    if tbody is None:
        # No table, or a table without a <tbody>: nothing to parse.
        return player_matches, upcoming_matches

    for row in tbody.find_all('tr'):
        match_data = parse_match_row(row)
        result = match_data['Result']
        set_scores = match_data['Set Scores']
        # Decided match without a score (presumably a walkover): skip it.
        if 'd.' in result and not set_scores:
            continue
        if (set_scores == "" or set_scores == 'Live Scores') and 'vs' in result:
            print(f'Upcoming match: {result}')
            upcoming_matches.append({
                'Date': match_data['Date'],
                'Tournament': match_data['Tournament'],
                'Surface': match_data['Surface'],
                'Results': result,
                'player': full_name
            })
            continue
        sides = result.split("d.")
        if len(sides) != 2:
            # Not of the form "<winner> d. <loser>"; unpacking would raise
            # and throw away every other row of this player.
            print(f'Skipping unparseable result: {result!r}')
            continue
        winner, loser = map(clean_player_name, sides)
        match_data.update({'Winner': winner, 'Loser': loser, 'player_name': full_name})
        player_matches.append(match_data)
    return player_matches, upcoming_matches

In [13]:
# URL template for a player's match page; `{}` takes the player name with
# whitespace removed.
MATCHES_URL = "https://www.tennisabstract.com/cgi-bin/player-classic.cgi?p={}&f=ACareerqq"
all_matches = []
upcoming_matches = []

# Scrape players concurrently; each task drives its own headless browser.
with ThreadPoolExecutor(max_workers=15) as executor:
    future_to_player = {executor.submit(process_player, player): player for _, player in player_urls.iterrows()}
    for future in as_completed(future_to_player):
        player = future_to_player[future]
        try:
            player_matches, player_upcoming_matches = future.result()
        except Exception as exc:
            # Top-level boundary: one failing page (e.g. a Selenium timeout)
            # must not discard the results of every other player. Report the
            # failure and keep going.
            print(f"Failed to process {player['name']}: {exc!r}")
            continue
        all_matches.extend(player_matches)
        upcoming_matches.extend(player_upcoming_matches)

all_matches_df = pd.DataFrame(all_matches)
upcoming_df = pd.DataFrame(upcoming_matches)

In [14]:
# Build the two pandas frames from the scraped row lists: one for played
# matches and one for upcoming matches.
all_matches_df, upcoming_df = (
    pd.DataFrame(records) for records in (all_matches, upcoming_matches)
)

In [15]:
# Bare expression: shows the played-matches frame as the cell output in a notebook.
all_matches_df

In [16]:
# Report how many match rows were collected across all players.
print(f'Size of the df: {len(all_matches_df)}')

In [17]:
# Project-local helper module for the Spark connection (defined outside this notebook).
import config.ConnectionConfig as cc


# NOTE(review): presumably prepares the environment Spark needs before start-up —
# confirm in ConnectionConfig.
cc.setupEnvironment()
# The returned object is used as the Spark session in the next cell (createDataFrame);
# the string is presumably the application name — confirm in ConnectionConfig.
spark = cc.startLocalCluster("Tennis Predictions")
# Bare expression: shows the active session as the cell output in a notebook.
spark.getActiveSession()

In [18]:
# Convert the pandas frame of played matches into a Spark DataFrame.
spark_df = spark.createDataFrame(all_matches_df)
# Write it as CSV with a header row; mode='overwrite' replaces existing output at that path.
# NOTE(review): Spark normally writes a directory of part files at this path, not a single
# .csv file — confirm downstream readers expect that.
spark_df.write.csv("../data/all_matches_spark.csv", header=True, mode='overwrite')

In [None]:
# all_matches_df.to_csv('../data/full_matches_data_raw.csv', index=False)

In [None]:
# Bare expression: shows the upcoming-matches frame as the cell output in a notebook.
upcoming_df

In [None]:
#upcoming_df.to_csv("../data/upcoming_matches_info.csv")