In [57]:
import kagglehub

# Kaggle handle ("owner/dataset-slug") of the dataset used in this notebook
DATASET_HANDLE = "martj42/womens-international-football-results"

# dataset_download gives back the local directory that holds the dataset files
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

Path to dataset files: /Users/veronica/.cache/kagglehub/datasets/martj42/womens-international-football-results/versions/33


In [58]:
import os
import pandas as pd

# Names of the CSV files found in the downloaded dataset directory
csv_files = [name for name in os.listdir(path) if name.endswith('.csv')]

# Map each CSV file name to the DataFrame parsed from that file
dataframes = {
    csv_name: pd.read_csv(os.path.join(path, csv_name))
    for csv_name in csv_files
}

# Show which files were loaded (the dict keys are the file names)
print("Loaded DataFrames:", list(dataframes))

Loaded DataFrames: ['goalscorers.csv', 'shootouts.csv', 'results.csv']


In [59]:
# Pull the three tables out of the loaded mapping. Index with [] rather than
# .get() so a missing file raises a KeyError in this cell instead of silently
# binding None and failing in a later cell with an unrelated error.
goalscorers_df = dataframes['goalscorers.csv']
shootouts_df = dataframes['shootouts.csv']
results_df = dataframes['results.csv']

# Preview each table
display(goalscorers_df)
display(shootouts_df)
display(results_df)

Unnamed: 0,date,home_team,away_team,team,scorer,minute,own_goal,penalty
0,1984-04-08,England,Denmark,England,Linda Curl,31,False,False
1,1984-04-08,England,Denmark,Denmark,Inge Hindkjær,49,False,True
2,1984-04-08,England,Denmark,England,Elisabeth Deighan,51,False,False
3,1984-04-08,Italy,Sweden,Italy,Carolina Morace,18,False,False
4,1984-04-08,Italy,Sweden,Sweden,Helen Johansson,23,False,False
...,...,...,...,...,...,...,...,...
2790,2025-07-22,England,Italy,England,Michelle Agyemang,90,False,False
2791,2025-07-22,England,Italy,England,Chloe Kelly,119,False,False
2792,2025-07-23,Germany,Spain,Spain,Aitana Bonmatí,113,False,False
2793,2025-07-27,England,Spain,Spain,Mariona Caldentey,25,False,False


Unnamed: 0,date,home_team,away_team,winner,first_shooter
0,1984-05-27,England,Sweden,Sweden,
1,1988-06-12,China PR,Brazil,Brazil,Brazil
2,1995-06-13,Sweden,China PR,China PR,China PR
3,1996-05-18,Japan,Canada,Japan,
4,1998-07-25,China PR,Norway,China PR,
...,...,...,...,...,...
122,2025-07-25,Ghana,South Africa,Ghana,South Africa
123,2025-07-27,England,Spain,England,England
124,2025-07-28,Argentina,Colombia,Colombia,Colombia
125,2025-08-01,Argentina,Uruguay,Argentina,Uruguay


Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1956-09-23,Germany,Netherlands,2,1,Friendly,Essen,Germany,False
1,1957-07-28,Germany,England,1,1,Friendly,Stuttgart,Germany,False
2,1957-10-13,Germany,Netherlands,2,0,Friendly,Berlin,Germany,False
3,1957-11-03,Netherlands,Austria,8,1,European Championship,Berlin,Germany,True
4,1957-11-03,Germany,England,0,4,European Championship,Berlin,Germany,False
...,...,...,...,...,...,...,...,...,...
10570,2025-08-12,Vietnam,Thailand,1,0,ASEAN Championship,Haiphong,Vietnam,False
10571,2025-08-12,Indonesia,Cambodia,1,1,ASEAN Championship,Phú Thọ,Vietnam,True
10572,2025-08-13,Philippines,Myanmar,1,1,ASEAN Championship,Haiphong,Vietnam,True
10573,2025-08-16,Myanmar,Thailand,2,1,ASEAN Championship,Haiphong,Vietnam,True


In [60]:
# Parse the 'date' column of every table into datetime values.
# errors='coerce' turns entries that cannot be parsed into NaT instead of raising.
for _frame in (results_df, shootouts_df, goalscorers_df):
    _frame['date'] = pd.to_datetime(_frame['date'], errors='coerce')

In [61]:
# Print the number of missing dates per table, one count per line,
# in the order: results, shootouts, goalscorers
for _frame in (results_df, shootouts_df, goalscorers_df):
    print(_frame['date'].isna().sum())

0
0
0


In [72]:
# Clean team names and strings
def clean_team_names(df, columns):
    """Normalize whitespace in the given text columns of ``df``.

    For every column in ``columns``, non-missing values are converted to
    ``str``, stripped of leading/trailing whitespace, and internal runs of
    whitespace are collapsed to a single space. Missing values (NaN/None)
    are kept as missing instead of being turned into literal strings such
    as ``'nan'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    columns : iterable of str
        Names of the columns to clean; each must exist in ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, so the call can be used
        as ``df = clean_team_names(df, [...])``.
    """
    for col in columns:
        original = df[col]
        cleaned = (
            original
            .astype(str)
            .str.strip()
            .str.replace(r'\s+', ' ', regex=True)
        )
        # astype(str) stringifies missing values as well; put the original
        # missing markers back so isna()/dropna() keep working downstream.
        df[col] = cleaned.where(original.notna(), original)
    return df

# Tidy the text columns of the goalscorers table
goalscorers_df = clean_team_names(
    df=goalscorers_df,
    columns=['home_team', 'away_team', 'team', 'scorer'],
)

# Tidy the text columns of the shootouts table
shootouts_df = clean_team_names(
    df=shootouts_df,
    columns=['home_team', 'away_team', 'winner', 'first_shooter'],
)

# Tidy the text columns of the results table
results_df = clean_team_names(
    df=results_df,
    columns=['home_team', 'away_team', 'tournament', 'city', 'country'],
)

In [74]:
# Collect the distinct team names used in each dataset so they can be compared
# (this cell only gathers the names; it does not rewrite any of them)

teams_results = {*results_df['home_team'], *results_df['away_team']}
teams_shootouts = {*shootouts_df['home_team'], *shootouts_df['away_team']}
teams_goalscorers = {*goalscorers_df['team']}

# Union of the three sets: every team name that appears anywhere
all_teams = teams_results | teams_shootouts | teams_goalscorers
print(f"Total unique teams across datasets: {len(all_teams)}")

Total unique teams across datasets: 246


In [76]:
# Save the sorted unique team names to CSV for manual review.
# to_csv does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
unique_teams_path = 'data/unique_team_names.csv'
os.makedirs(os.path.dirname(unique_teams_path), exist_ok=True)
pd.Series(sorted(all_teams)).to_csv(unique_teams_path, index=False)