# DSC350 - Term Project - Milestone 4

**Cleaning/Formatting Website Data**<br>
Perform at least 5 data transformation and/or cleansing steps to your website data.

We begin by importing the libraries needed to fetch, parse, and clean the website data.

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

## Fetch Website Data

In [2]:
# Define URL for website
url = "https://fbref.com/en/comps/22/stats/Major-League-Soccer-Stats"

# Fetch the page. TLS certificate verification is left at the requests default (on);
# the timeout stops a stalled connection from hanging the notebook, and
# raise_for_status() fails loudly on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Define the table that will be used
table = soup.find('table', {'id': 'stats_squads_standard_for'})
if table is None:
    raise ValueError("Table 'stats_squads_standard_for' was not found at " + url)

# Collect the table rows once; index 1 is used as the header row and
# everything from index 2 onward is treated as data
table_rows = table.find_all('tr')
if len(table_rows) < 2:
    raise ValueError("Table 'stats_squads_standard_for' has no header row to parse")

# Extract table headers
headers = [th.text.strip() for th in table_rows[1].find_all('th')]

# Extract data rows, keeping only rows whose cell count matches the header count
rows = []
for tr in table_rows[2:]:
    row_data = [cell.text.strip() for cell in tr.find_all(['th', 'td'])]
    if len(row_data) == len(headers):
        rows.append(row_data)

# Convert to dataframe (every value is still a string at this point)
df = pd.DataFrame(rows, columns=headers)

## Remove Nested Headers for Parsing

In [3]:
# Remove specified headers
headers_to_remove = ['', 'Playing Time', 'Performance', 'Expected', 'Progression', 'Per 90 Minutes']

# Filter with a positional boolean mask rather than a list of labels: selecting by
# label returns every column that shares a repeated label once per occurrence,
# which multiplies columns whenever the scraped header row contains duplicates.
keep_mask = ~df.columns.isin(headers_to_remove)
df = df.loc[:, keep_mask].copy()

## Add Data

In [4]:
# Add column for "Squad ID"
# Drop any existing "Squad ID" first so that re-running this cell does not raise
# "ValueError: cannot insert Squad ID, already exists".
df = df.drop(columns='Squad ID', errors='ignore')

# Sequential 1-based identifier placed as the first column
df.insert(0, 'Squad ID', range(1, len(df) + 1))

## Remove Duplicates

In [5]:
# Keep only the first row seen for each value in the "Squad" column
is_repeat_squad = df.duplicated(subset=['Squad'], keep='first')
df = df[~is_repeat_squad]

## Change Headers

In [6]:
# Change headers to more readable format.
#
# A dict literal keeps only the LAST value for a repeated key, so a single mapping
# cannot give different names to columns that share an abbreviation. Two mappings
# are used instead and applied by position: the first column carrying a label gets
# its name from `first_occurrence_names`, any later column carrying the same label
# gets its name from `repeat_occurrence_names`.
# NOTE(review): this assumes a repeated abbreviation (e.g. 'Gls') is the season total
# on first appearance and the per-90-minutes value afterwards - confirm against the
# column order of the source table.
first_occurrence_names = {
    'Squad': 'Team',
    '# Pl': 'Num_Pl_Used',
    'Age': 'Avg_Team_Age',
    # NOTE(review): 'Poss' values look like percentages; name kept for compatibility
    'Poss': 'Possession(Min)',
    'MP': 'Matches_Played',
    'Starts': 'Starts_by_Player',
    'Min': 'Mins',
    '90s': '90s_Played',
    'Gls': 'Goals',
    'Ast': 'Assists',
    'G+A': 'Gls+Assts',
    'G-PK': 'Gls-PKs',
    'PK': 'PKs',
    'PKatt': 'PK_Attempts',
    'CrdY': 'Yellows',
    'CrdR': 'Reds',
    'xG': 'Expected_Gls',
    'npxG': 'Non_PK_Expected_Gls',
    'xAG': 'Expected_Asst_Gls',
    'npxG+xAG': 'Non_PK_Exp_Gls+Exp_Asst_Gls',
    'PrgC': 'Prog_Carries',
    'PrgP': 'Prog_Passes',
    'G+A-PK': 'Gls+Asst-PKs_Per_90',
    'xG+xAG': 'Exp_Gls+Exp_Asst_Gls_Per_90',
}
repeat_occurrence_names = {
    'Gls': 'Goals_Per_90',
    'Ast': 'Asst_Per_90',
    'G+A': 'Gls+Asst_Per_90',
    'G-PK': 'Gls-PKs_Per_90',
    'xG': 'Exp_Gls_Per_90',
    'xAG': 'Exp_Asst_Gls_Per_90',
    'npxG': 'Non_PK_Exp_Gls_Per_90',
    'npxG+xAG': 'Non_PK_Exp_Gls+Exp_Asst_Gls_Per_90',
}

# Walk the columns left to right, choosing the mapping by whether the label has
# already been seen; labels missing from the chosen mapping are left unchanged.
seen_labels = set()
readable_names = []
for label in df.columns:
    lookup = repeat_occurrence_names if label in seen_labels else first_occurrence_names
    readable_names.append(lookup.get(label, label))
    seen_labels.add(label)

df.columns = readable_names

## Handling of Missing Values

In [7]:
# The scraped cells are strings, so a missing value appears as an empty string rather
# than NaN and fillna() alone never matches it. Mark blank or whitespace-only strings
# as 'N/A' first, then fill any real NaN values the same way.
df = df.replace(r'^\s*$', 'N/A', regex=True).fillna('N/A')

## Save to File and Load File to DataFrame

In [8]:
# Save dataframe to Excel file.
# A path relative to the notebook's working directory keeps the notebook runnable on
# any machine; the previous value was an absolute path into one user's Downloads folder.
file_path = 'mls_fbref_stats.xlsx'
df.to_excel(file_path, index=False)

# Load Excel file to dataframe to confirm the round trip.
# NOTE: with its default NA handling, read_excel parses the literal string 'N/A'
# as a missing value (NaN) when the file is read back.
df_from_excel = pd.read_excel(file_path)
print(df_from_excel)

    Squad ID              Team  Num_Pl_Used  Avg_Team_Age  Possession(Min)  \
0          1       Atlanta Utd           26          27.6             50.7   
1          2            Austin           24          29.2             50.4   
2          3       CF Montréal           26          25.5             49.7   
3          4         Charlotte           25          27.6             44.5   
4          5              Crew           25          26.9             60.3   
5          6       D.C. United           24          27.9             47.4   
6          7         Dynamo FC           24          28.2             58.7   
7          8     FC Cincinnati           25          27.0             53.0   
8          9         FC Dallas           26          27.3             47.5   
9         10              Fire           26          27.4             48.2   
10        11       Inter Miami           29          28.2             54.6   
11        12         LA Galaxy           25          27.9       