
<h1 style="color:#ffc0cb;font-size:70px;font-family:Georgia;text-align:center;"><strong>National Leagues</strong></h1>

In [1]:
# Install the missingno package with pip into the environment of the current Jupyter kernel
import sys
!{sys.executable} -m pip install missingno


# work with data in tabular representation
from datetime import time
import pandas as pd
# round the data in the correlation matrix
import numpy as np
import os
import glob


# Modules for data visualization
import seaborn as sns
import missingno as msno

# run a python function file 
%run ../../src/cleaning/function.py

# Suppress all warning messages (not only DeprecationWarning) to keep cell output clean
import warnings
warnings.filterwarnings('ignore')



In [13]:
def concat_all(pattern: str = "../../data/external/National Leagues/*/*.csv",
               dayfirst: bool = True) -> pd.DataFrame:
    """Load every National Leagues CSV, merge them and return one cleaned frame.

    Processing steps, in order: read each matching CSV and tag its rows with
    the source file path in a ``season`` column, concatenate, keep only the
    needed match columns, rename them, lower-case all string values, convert
    dtypes, parse the match date, drop duplicate / incomplete primary keys and
    finally strip whitespace. The resulting shape and column names are printed.

    Relies on ``rename_cols`` and ``whitespace_remover`` being available in the
    notebook namespace (presumably provided by the helper script loaded with
    ``%run`` — confirm). Both are called for their in-place effect on ``df``.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the CSV files; the default matches ``*.csv``
        one folder below ``National Leagues``.
    dayfirst : bool
        Passed to ``pd.to_datetime``. Defaults to True because the column
        layout (Div, FTHG, FTAG, ...) looks like football-data.co.uk exports,
        whose dates are written dd/mm/yy — TODO confirm against the raw files.
        Without it, ambiguous dates such as 05/08/2016 are read month-first.

    Returns
    -------
    pd.DataFrame
        The cleaned, concatenated matches. The original per-file row index is
        kept, so index labels repeat across files.

    Raises
    ------
    ValueError
        If no file matches ``pattern``.
    KeyError
        If a required column is missing from every input file.
    """
    # Sorted so the row order of the result does not depend on the order in
    # which the operating system happens to list the files.
    files = sorted(glob.glob(pattern))
    if not files:
        raise ValueError(f"No CSV files found matching pattern: {pattern!r}")

    frames = []
    for filename in files:
        frame = pd.read_csv(filename, index_col=None)
        # NOTE(review): 'season' holds the full source file path rather than a
        # parsed season label; kept as-is because downstream consumers of the
        # saved data are not visible from here.
        frame['season'] = filename
        frames.append(frame)

    df = pd.concat(frames)

    # Only keep the needed columns. The explicit copy makes sure the in-place
    # rename below works on an independent frame rather than on a slice.
    needed_cols = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR',
                   'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR',
                   'season']
    df = df[needed_cols].copy()

    # Rename columns to follow SQL conventions (mutates df in place).
    rename_cols(df)

    # Lower-case every string value (column names are untouched).
    # DataFrame.applymap is deprecated in newer pandas in favour of
    # DataFrame.map, so use whichever this pandas version provides.
    apply_elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    df = apply_elementwise(lambda s: s.lower() if isinstance(s, str) else s)

    # Convert columns to the best possible dtypes, object -> string.
    df = df.convert_dtypes()
    # Cast the date column to datetime, reading day before month by default.
    df['match_date'] = pd.to_datetime(df['match_date'], dayfirst=dayfirst)

    primary_key = ['league_division', 'match_date', 'home_team', 'away_team', 'season']
    # Drop rows that duplicate a primary key, keeping the first occurrence.
    df = df.drop_duplicates(subset=primary_key, keep="first")
    # Drop rows with a missing value in any primary-key column.
    df = df.dropna(subset=primary_key)

    # Remove extra leading and trailing whitespace (called for its in-place
    # effect; the return value, if any, is ignored as in the original code).
    whitespace_remover(df)

    # Report the final shape and columns.
    print(f'The shape of the df is (row, column): {df.shape}\n')
    print(f'The list of the National Leagues final columns\' names is: {df.columns.to_list()}\n\n\n')
    return df

In [14]:
# Build the combined, cleaned National Leagues frame; concat_all() also prints
# the resulting shape and the final column names.
df = concat_all()

The shape of the df is (row, column): (16229, 24)

The list of the National Leagues final columns' names is: ['league_division', 'match_date', 'home_team', 'away_team', 'full_time_home_team_goals', 'full_time_away_team_goals', 'full_time_result', 'half_time_home_team_goals', 'half_time_away_team_goals', 'half_time_result', 'referee', 'home_shots', 'away_shots', 'home_shots_on_target', 'away_shots_on_target', 'home_fouls_committed', 'away_fouls_committed', 'home_corners', 'away_corners', 'home_yellow_cards', 'away_yellow_cards', 'home_red_cards', 'away_red_cards', 'season']





In [15]:
# Display the combined frame (the notebook renders a truncated preview of the
# first and last rows).
df

Unnamed: 0,league_division,match_date,home_team,away_team,full_time_home_team_goals,full_time_away_team_goals,full_time_result,half_time_home_team_goals,half_time_away_team_goals,half_time_result,referee,home_shots,away_shots,home_shots_on_target,away_shots_on_target,home_fouls_committed,away_fouls_committed,home_corners,away_corners,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards,season
0,e0,2016-08-13,burnley,swansea,0,1,a,0,0,d,j moss,10,17,3,9,10,14,7,4,3,2,0,0,../../data/external/national leagues\season 20...
1,e0,2016-08-13,crystal palace,west brom,0,1,a,0,0,d,c pawson,14,13,4,3,12,15,3,6,2,2,0,0,../../data/external/national leagues\season 20...
2,e0,2016-08-13,everton,tottenham,1,1,d,1,0,h,m atkinson,12,13,6,4,10,14,5,6,0,0,0,0,../../data/external/national leagues\season 20...
3,e0,2016-08-13,hull,leicester,2,1,h,1,0,h,m dean,14,18,5,5,8,17,5,3,2,2,0,0,../../data/external/national leagues\season 20...
4,e0,2016-08-13,man city,sunderland,2,1,h,1,0,h,r madley,16,7,4,3,11,14,9,6,1,2,0,0,../../data/external/national leagues\season 20...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302,sp2,2022-02-20,fuenlabrada,alcorcon,2,2,d,1,0,h,,16,11,10,4,21,12,4,6,2,3,0,0,../../data/external/national leagues\season 20...
303,sp2,2022-02-20,burgos,oviedo,0,1,a,0,0,d,,15,9,5,2,7,10,6,4,1,4,0,0,../../data/external/national leagues\season 20...
304,sp2,2022-02-20,huesca,lugo,1,0,h,0,0,d,,25,9,8,1,12,15,11,5,3,2,0,1,../../data/external/national leagues\season 20...
305,sp2,2022-02-20,sp gijon,ponferradina,2,3,a,0,2,a,,21,11,11,4,11,17,9,4,3,3,0,0,../../data/external/national leagues\season 20...


<a id="3"></a>
<h1 style="color:#ffc0cb;font-size:40px;font-family:Georgia;text-align:center;"><strong>Save the Intermediate data that has been transformed</strong></h1>

In [5]:
# Persist the cleaned frame through the project's write_interim_path helper
# (defined outside this notebook; presumably writes the CSV under the interim
# data folder named by the last argument — confirm against the helper).
write_interim_path(df, 'national_leagues_2016_2022.csv', 'national_leagues')

cleaned national_leagues_2016_2022.csv data was successfully saved!



