In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime
import random
import unicodedata
import warnings
warnings.filterwarnings("ignore")

In [8]:
def remove_diacritics(text):
    """
    Strip diacritical marks from a string.

    The text is decomposed (Unicode NFD) so that every accented letter
    becomes a base letter followed by its combining marks; the combining
    marks (category 'Mn') are then dropped.
    Example: ă, â, î, ș, ț become a, a, i, s, t.
    Characters that have no canonical decomposition are left untouched.
    """
    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [
        ch for ch in decomposed
        if unicodedata.category(ch) != 'Mn'  # 'Mn' = nonspacing (combining) mark
    ]
    return ''.join(base_chars)

# Matches "02.08.1990 (34)" and captures the age inside the parentheses.
_DOB_AGE_PATTERN = re.compile(r'\d{2}\.\d{2}\.\d{4}\s*\((\d+)\)')
# Looser fallback: any number wrapped in parentheses.
_PAREN_NUMBER_PATTERN = re.compile(r'\((\d+)\)')
# Cell values (lower-cased) that identify the "Foot" column.
_FOOT_VALUES = ('right', 'left', 'both')


def _extract_player_name(row):
    """Return the diacritic-free player name of a squad row, or None if the row has none."""
    player_cell = row.find('td', class_='posrela')
    if not player_cell:
        return None
    player_link = player_cell.find('a')
    if not player_link:
        return None
    return remove_diacritics(player_link.get_text(strip=True))


def _extract_age(cell_texts):
    """Return the age found in the row's cell texts as a string, or "N/A"."""
    # Preferred: the "date of birth (age)" format, which is unambiguous.
    for text in cell_texts:
        match = _DOB_AGE_PATTERN.search(text)
        if match:
            return match.group(1)

    # Fallback: the first parenthesised number of a cell, accepted only when
    # it has at most two digits (a plausible age).
    for text in cell_texts:
        match = _PAREN_NUMBER_PATTERN.search(text)
        if match and len(match.group(1)) <= 2:
            return match.group(1)

    return "N/A"


def _extract_foot(cell_texts):
    """Return "Right", "Left" or "Both" from the row's cell texts, or "N/A"."""
    # Every cell is inspected, so no separate fixed-index lookup is needed.
    for text in cell_texts:
        lowered = text.lower()
        if lowered in _FOOT_VALUES:
            return lowered.capitalize()
    return "N/A"


def scrape_transfermarkt_squad(url, team_name="Unknown"):
    """
    Scrape player data from a Transfermarkt squad page.

    Args:
        url: Address of the squad page to download.
        team_name: Label used in log messages and stored in each record.

    Returns:
        A list of dictionaries with the keys 'Player', 'Age', 'Foot' and
        'Team'. 'Age' and 'Foot' are strings and fall back to "N/A" when the
        value cannot be located in the row. An empty list is returned when
        the request fails, the squad table is missing, or any unexpected
        error occurs; the function reports problems via print() and does not
        raise.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }

    try:
        print(f"Scraping {team_name}...")
        response = requests.get(url, headers=headers, timeout=20)

        # A 503 gets exactly one retry after a long pause.
        if response.status_code == 503:
            print(f"503 error for {team_name}. Waiting longer before retry...")
            time.sleep(30)  # Wait 30 seconds
            response = requests.get(url, headers=headers, timeout=20)

        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the squad table - looking for the detailed view table
        squad_table = soup.find('table', class_='items')
        if not squad_table:
            print(f"Could not find squad table for {team_name}")
            return []

        # html.parser does not insert a missing <tbody>; search the table
        # itself in that case instead of failing on None.
        table_body = squad_table.find('tbody')
        if table_body is None:
            table_body = squad_table

        # Player rows carry the 'odd'/'even' classes (this skips header rows).
        player_rows = table_body.find_all('tr', class_=['odd', 'even'])

        players_data = []
        for row in player_rows:
            # Best effort: a malformed row is reported and skipped so that it
            # cannot discard the rest of the squad.
            try:
                player_name = _extract_player_name(row)
                if player_name is None:
                    continue

                cell_texts = [cell.get_text(strip=True) for cell in row.find_all('td')]

                players_data.append({
                    'Player': player_name,
                    'Age': _extract_age(cell_texts),
                    'Foot': _extract_foot(cell_texts),
                    'Team': team_name
                })

            except Exception as e:
                print(f"Error processing player row in {team_name}: {e}")
                continue

        print(f"Successfully scraped {len(players_data)} players from {team_name}")
        return players_data

    except requests.exceptions.RequestException as e:
        print(f"Request error for {team_name}: {e}")
        return []
    except Exception as e:
        print(f"Unexpected error for {team_name}: {e}")
        return []

def main():
    """
    Scrape every configured squad page and export the combined result.

    Each team is fetched in turn with a random 8-15 second pause between
    requests. When at least one player was collected, the Player/Age/Foot
    columns are written to "transfermarkt_players.csv" and a short report
    (per-team counts, first rows, foot distribution) is printed.
    """
    # (squad page URL, team label) pairs, processed in this order
    urls_and_teams = [
        ("https://www.transfermarkt.com/fcsb/kader/verein/301/saison_id/2024/plus/1", "FCSB"),
        ("https://www.transfermarkt.com/cfr-cluj/kader/verein/7769/saison_id/2024/plus/1", "CFR Cluj"),
        ("https://www.transfermarkt.com/fc-rapid-1923/kader/verein/455/saison_id/2024/plus/1", "FC Rapid 1923"),
        ("https://www.transfermarkt.com/universitatea-craiova/kader/verein/40812/saison_id/2024/plus/1", "Universitatea Craiova"),
        ("https://www.transfermarkt.com/fc-universitatea-cluj/kader/verein/6429/saison_id/2024/plus/1", "FC Universitatea Cluj"),
        ("https://www.transfermarkt.com/fc-dinamo/kader/verein/312/saison_id/2024/plus/1", "FC Dinamo"),
        ("https://www.transfermarkt.com/sepsi-osk-sf-gheorghe/kader/verein/54585/saison_id/2024/plus/1", "Sepsi OSK"),
        ("https://www.transfermarkt.com/fcv-farul-constanta/kader/verein/29831/saison_id/2024/plus/1", "FCV Farul Constanta"),
        ("https://www.transfermarkt.com/petrolul-ploiesti/kader/verein/9465/saison_id/2024/plus/1", "Petrolul Ploiesti"),
        ("https://www.transfermarkt.com/sc-otelul-galati/kader/verein/4959/saison_id/2024/plus/1", "SC Otelul Galati"),
        ("https://www.transfermarkt.com/fc-hermannstadt/kader/verein/58049/saison_id/2024/plus/1", "FC Hermannstadt"),
        ("https://www.transfermarkt.com/uta-arad/kader/verein/952/saison_id/2024/plus/1", "UTA Arad"),
        ("https://www.transfermarkt.com/fc-botosani/kader/verein/8818/saison_id/2024/plus/1", "FC Botosani"),
        ("https://www.transfermarkt.com/acsm-politehnica-iasi/kader/verein/33966/saison_id/2024/plus/1", "Politehnica Iasi"),
        ("https://www.transfermarkt.com/fc-buzau/kader/verein/11380/saison_id/2024/plus/1", "FC Buzau"),
        ("https://www.transfermarkt.com/afc-unirea-04-slobozia/kader/verein/29700/saison_id/2024/plus/1", "AFC Unirea Slobozia")
    ]

    print("Starting Transfermarkt scraping...")
    print("=" * 50)

    all_players = []
    for position, (squad_url, club) in enumerate(urls_and_teams):
        # Pause between requests (never before the first) to avoid rate limiting
        if position != 0:
            pause = random.uniform(8, 15)
            print(f"Waiting {pause:.1f} seconds before next request...")
            time.sleep(pause)

        all_players += scrape_transfermarkt_squad(squad_url, club)

        print(f"Total players scraped so far: {len(all_players)}")
        print("-" * 30)

    # Nothing collected: report it and stop
    if not all_players:
        print("No data was scraped. Please check the URLs and try again.")
        return

    df = pd.DataFrame(all_players)

    # The exported table keeps only Player, Age and Foot (Team is left out)
    output_columns = ['Player', 'Age', 'Foot']
    df_final = df[output_columns].copy()

    filename = "transfermarkt_players.csv"
    df_final.to_csv(filename, index=False)

    print("\n" + "=" * 50)
    print("SCRAPING COMPLETED!")
    print(f"Total players scraped: {len(df_final)}")
    print(f"Data saved to: {filename}")

    # Per-team player counts, largest squad first
    print("\nSummary by team:")
    players_per_team = df.groupby('Team').size().sort_values(ascending=False)
    for team, count in players_per_team.items():
        print(f"{team}: {count} players")

    print("\nFirst 10 rows of data:")
    print(df_final.head(10).to_string(index=False))

    # How often each foot value occurs
    print("\nFoot distribution:")
    print(df_final['Foot'].value_counts())

# Run the scraper when this code is executed directly (as a script or a
# notebook cell); importing it as a module does not start a scrape.
if __name__ == "__main__":
    main()

Starting Transfermarkt scraping...
Scraping FCSB...
Successfully scraped 47 players from FCSB
Total players scraped so far: 47
------------------------------
Waiting 11.0 seconds before next request...
Scraping CFR Cluj...
Successfully scraped 50 players from CFR Cluj
Total players scraped so far: 97
------------------------------
Waiting 13.7 seconds before next request...
Scraping FC Rapid 1923...
Successfully scraped 48 players from FC Rapid 1923
Total players scraped so far: 145
------------------------------
Waiting 9.6 seconds before next request...
Scraping Universitatea Craiova...
Successfully scraped 38 players from Universitatea Craiova
Total players scraped so far: 183
------------------------------
Waiting 9.1 seconds before next request...
Scraping FC Universitatea Cluj...
Successfully scraped 37 players from FC Universitatea Cluj
Total players scraped so far: 220
------------------------------
Waiting 10.4 seconds before next request...
Scraping FC Dinamo...
Successfully 