In [17]:
from bs4 import BeautifulSoup
import re
import csv
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [20]:
class Player:
    """A named player with a country, position, stat counters and club details.

    Only the name is supplied at construction time. Text attributes start as
    the placeholder "Unknown" and the numeric counters start at 0 until the
    matching setter replaces them.
    """

    # (attribute, initial value) pairs for everything except the name, listed
    # in the same order that get_attributes() reports them.
    _FIELD_DEFAULTS = (
        ("country", "Unknown"),
        ("position", "Unknown"),
        ("goals_scored", 0),
        ("goals_assisted", 0),
        ("tackles", 0),
        ("saves", 0),
        ("club_name", "Unknown"),
        ("league", "Unknown"),
        ("league_website", "Unknown"),
    )

    def __init__(self, name):
        """Create a player called ``name`` with every other attribute at its default."""
        self.name = name
        for attribute, initial in self._FIELD_DEFAULTS:
            setattr(self, attribute, initial)

    # --- identity ---------------------------------------------------------

    def get_name(self):
        """Return the player's name."""
        return self.name

    def set_name(self, name):
        """Replace the player's name."""
        self.name = name

    def get_country(self):
        """Return the country stored for the player."""
        return self.country

    def set_country(self, country):
        """Store the country for the player."""
        self.country = country

    def get_position(self):
        """Return the position stored for the player."""
        return self.position

    def set_position(self, position):
        """Store the position for the player."""
        self.position = position

    # --- statistics -------------------------------------------------------

    def get_goals_scored(self):
        """Return the stored goals-scored value."""
        return self.goals_scored

    def set_goals_scored(self, goals):
        """Store ``goals`` as the goals-scored value."""
        self.goals_scored = goals

    def get_goals_assisted(self):
        """Return the stored assists value."""
        return self.goals_assisted

    def set_goals_assisted(self, assists):
        """Store ``assists`` as the assists value."""
        self.goals_assisted = assists

    def get_tackles_won(self):
        """Return the stored tackles value."""
        return self.tackles

    def set_tackles_won(self, tackles):
        """Store ``tackles`` as the tackles value."""
        self.tackles = tackles

    def get_saves(self):
        """Return the stored saves value."""
        return self.saves

    def set_saves(self, saves):
        """Store ``saves`` as the saves value."""
        self.saves = saves

    # --- club / league ----------------------------------------------------

    def get_club_name(self):
        """Return the player's club name."""
        return self.club_name

    def set_club_name(self, club_name):
        """Store the player's club name."""
        self.club_name = club_name

    def get_club_league(self):
        """Return the league stored for the player's club."""
        return self.league

    def set_club_league(self, league):
        """Store the league for the player's club."""
        self.league = league

    def get_league_website(self):
        """Return the stored league website."""
        return self.league_website

    def set_league_website(self, league_website):
        """Store the league website."""
        self.league_website = league_website

    def get_attributes(self):
        """Return all attributes as a list: name first, then the fields in
        ``_FIELD_DEFAULTS`` order (country, position, goals_scored,
        goals_assisted, tackles, saves, club_name, league, league_website)."""
        row = [self.name]
        row.extend(getattr(self, attribute) for attribute, _ in self._FIELD_DEFAULTS)
        return row

In [3]:
class Country:
    """A national team: a name plus the list of players attached to it.

    The ``count_*`` methods are placeholders that currently return ``None``.
    """

    def __init__(self, name):
        """Create a country called ``name`` with an empty player list."""
        self.name = name
        self.players = []

    def get_name(self):
        """Return the country's name."""
        return self.name

    def get_players(self):
        """Return the list of players (the live list, not a copy)."""
        return self.players

    def add_player(self, player):
        """Append a single player to this country's list."""
        self.players.append(player)

    def add_players(self, players):
        """Append every player from the iterable ``players``."""
        self.players.extend(players)

    def remove_player(self, player):
        """Remove the first occurrence of ``player`` from this country's list.

        Raises:
            ValueError: if ``player`` is not in the list (``list.remove`` semantics).
        """
        # Operate on this instance's list rather than a module-level `players`.
        self.players.remove(player)

    # The statistics below are not implemented yet. They take `self` so that
    # they can be called on an instance like every other method.

    def count_win_rate(self):
        """Placeholder; not implemented, returns ``None``."""
        return None

    def count_clean_sheets(self):
        """Placeholder; not implemented, returns ``None``."""
        return None

    def count_total_goals_scored(self):
        """Placeholder; not implemented, returns ``None``."""
        return None

    def count_total_goals_conceeded(self):
        """Placeholder; not implemented, returns ``None``.

        The method name keeps its original spelling so existing callers work.
        """
        return None

    def count_passes_completed(self):
        """Placeholder; not implemented, returns ``None``."""
        return None

In [4]:
def _split_name_and_club(entry):
    """Split one ``"Name (Club)"`` squad entry into ``(name, club)``.

    The club is the text between the first and second "(" with ")" and "."
    removed and surrounding whitespace stripped. An entry with no "(" yields
    the club ``"Unknown"``.
    """
    parts = entry.split("(")
    player_name = parts[0].strip()
    if len(parts) > 1:
        club_name = parts[1].replace(')', '').replace('.', '').strip()
    else:
        # No club given: use the placeholder instead of leaking the previous
        # entry's club (or failing on the very first entry).
        club_name = "Unknown"
    return player_name, club_name


def find_players(squad_list, timeout=30):
    """Scrape a squad-list page and return a ``Player`` for every listed name.

    The page at ``squad_list`` is expected to be laid out as follows:

    * every ``<h2>`` whose text matches "Group" starts a group;
    * the next (up to) four ``<h3>`` siblings after it are the teams, and the
      heading text becomes the player's country;
    * the first (up to) four ``<p>`` siblings after a team heading hold, in
      order, the GK, DF, MF and FW lines, each shaped like
      ``"Label: Name (Club), Name (Club), ..."``.

    Paragraphs without a ":" are skipped.

    Args:
        squad_list: URL of the page to download.
        timeout: seconds to wait for the HTTP response before requests raises.

    Returns:
        A list of ``Player`` objects with name, country, position and club set.
    """
    position_codes = ["GK", "DF", "MF", "FW"]
    players_list = []

    response = requests.get(squad_list, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    for group in soup.find_all('h2', string=re.compile('Group')):
        # Look the siblings up once per heading instead of once per index.
        for team in group.find_next_siblings("h3")[:4]:
            country = team.get_text()
            paragraphs = team.find_next_siblings("p")[:4]
            for position_code, paragraph in zip(position_codes, paragraphs):
                segments = paragraph.get_text().split(":")
                if len(segments) < 2:
                    # Not a "Label: names" line, so there is nothing to parse.
                    continue
                for entry in segments[1].split(", "):
                    player_name, club_name = _split_name_and_club(entry)
                    player_obj = Player(player_name)
                    player_obj.set_country(country)
                    player_obj.set_position(position_code)
                    player_obj.set_club_name(club_name)
                    players_list.append(player_obj)

    return players_list

In [13]:
def create_players_file(players_list):
    """Write ``players_list`` to ``players.csv`` in the current working directory.

    The file starts with a header row, followed by one row per player taken
    from ``player.get_attributes()``. An existing ``players.csv`` is overwritten.

    Args:
        players_list: iterable of objects exposing ``get_attributes()``, which
            must return the row values in header order.
    """
    player_header = ["Name", "Country", "Position", "Goals", "Assists", "Tackles", "Saves", "Club", "League", "League_Website"]
    # newline='' lets the csv module control line endings (otherwise Windows
    # gets a blank line after every row). UTF-8 keeps accented names
    # independent of the platform's default encoding.
    with open('players.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(player_header)
        writer.writerows(player.get_attributes() for player in players_list)

In [6]:
# Page that find_players() scrapes; adjacent literals are joined into one URL.
WC_SQUAD_LIST_URL = (
    "https://www.skysports.com/football/news/12098/12741629/"
    "world-cup-2022-squad-lists-england-brazil-argentina-france-germany-spain-and-more"
)

In [23]:
# Scrape the squad-list page into Player objects, then write them to players.csv.
players = find_players(WC_SQUAD_LIST_URL)
create_players_file(players)

## Assessing the data

In [24]:
# Load the player data from players.csv (the file written by create_players_file)
df = pd.read_csv('players.csv')

In [30]:
# Display the dataframe (the rendered output elides the middle rows)
df

Unnamed: 0,Name,Country,Position,Goals,Assists,Tackles,Saves,Club,League,League_Website
0,Saad Al Sheeb,Qatar,GK,0,0,0,0,Al-Sadd,Unknown,Unknown
1,Meshaal Barsham,Qatar,GK,0,0,0,0,Al-Sadd,Unknown,Unknown
2,Yousuf Hassan,Qatar,GK,0,0,0,0,Al-Gharafa,Unknown,Unknown
3,Pedro Miguel,Qatar,DF,0,0,0,0,Al-Sadd,Unknown,Unknown
4,Musaab Khidir,Qatar,DF,0,0,0,0,Al-Sadd,Unknown,Unknown
...,...,...,...,...,...,...,...,...,...,...
812,Jeong Woo-yeong,South Korea,MF,0,0,0,0,Freiburg,Unknown,Unknown
813,Lee Kang-in,South Korea,MF,0,0,0,0,Real Mallorca,Unknown,Unknown
814,Hwang Ui-jo,South Korea,FW,0,0,0,0,Olympiacos,Unknown,Unknown
815,Cho Gue-sung,South Korea,FW,0,0,0,0,Jeonbuk Motors,Unknown,Unknown


In [None]:
def find_league(club_name, timeout=10):
    """Look up a club's league from its English Wikipedia page.

    Downloads ``https://en.wikipedia.org/wiki/<club_name>``, finds the first
    ``<th>`` whose text matches "League" and returns the text of the first
    ``<td>`` sibling that follows it, with surrounding whitespace stripped.

    Args:
        club_name: page title appended to the Wikipedia URL.
        timeout: seconds to wait for the HTTP response before requests raises.

    Returns:
        The league text, or ``"Unknown"`` (the same placeholder ``Player``
        starts with) when the page has no matching header or value cell.
    """
    res = requests.get("https://en.wikipedia.org/wiki/" + club_name, timeout=timeout)
    soup = BeautifulSoup(res.content, "html.parser")

    header = soup.find('th', string=re.compile('League'))
    if header is None:
        # find() returns None when nothing matches, e.g. a missing or
        # differently structured page.
        return "Unknown"

    cells = header.find_next_siblings("td")
    if not cells:
        return "Unknown"
    return cells[0].get_text().strip()

In [None]:
def find_league_website(league, timeout=10):
    """Look up a league's website from its English Wikipedia page.

    Downloads ``https://en.wikipedia.org/wiki/<league>``, finds the first
    ``<th>`` whose text matches "Website" and returns the text of the first
    ``<td>`` sibling that follows it, with surrounding whitespace stripped.
    The return value is the cell's visible text, not the link's ``href``.

    Args:
        league: page title appended to the Wikipedia URL.
        timeout: seconds to wait for the HTTP response before requests raises.

    Returns:
        The website text, or ``"Unknown"`` (the same placeholder ``Player``
        starts with) when the page has no matching header or value cell.
    """
    res = requests.get("https://en.wikipedia.org/wiki/" + league, timeout=timeout)
    soup = BeautifulSoup(res.content, "html.parser")

    website = soup.find('th', string=re.compile('Website'))
    if website is None:
        # find() returns None when nothing matches, e.g. a missing or
        # differently structured page.
        return "Unknown"

    cells = website.find_next_siblings("td")
    if not cells:
        return "Unknown"
    website_link = cells[0].get_text().strip()
    return website_link