# DSC350 - Term Project - Milestone 2

**Cleaning/Formatting a Flat File Source** <br>
Perform at least 5 data transformation and/or cleansing steps on your flat file data.

We begin by importing the libraries needed to complete the assignment.

In [1]:
import pandas as pd

## Read the original CSV file

In [2]:
# Location of the raw MLS flat file (raw string keeps the backslashes literal)
file = r'C:\Users\thefli0\Downloads\mls.csv'

# Load the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file)

# Show what was loaded
print(df)

                      team             city             state   latitude  \
0        Atlanta United FC          Atlanta           Georgia  33.755556   
1                Austin FC           Austin             Texas  30.387700   
2              CF Montréal         Montreal            Quebec  45.563056   
3             Charlotte FC        Charlotte    North Carolina  35.225833   
4          Chicago Fire FC          Chicago          Illinois  41.862300   
5          Colorado Rapids    Commerce City          Colorado  39.805556   
6            Columbus Crew         Columbus              Ohio  39.968461   
7              D.C. United       Washington              D.C.  38.868411   
8            FC Cincinnati       Cincinnati              Ohio  39.111389   
9                FC Dallas           Frisco             Texas  33.154444   
10       Houston Dynamo FC          Houston             Texas  29.752200   
11          Inter Miami CF  Fort Lauderdale           Florida  26.193056   
12          

## Adding a New Column

In [3]:
# Location of the raw MLS flat file (raw string keeps the backslashes literal)
file = r'C:\Users\thefli0\Downloads\mls.csv'

# Start from a freshly loaded copy of the raw data
df = pd.read_csv(filepath_or_buffer=file)

# Number the rows 1..N in a new 'ID' column
df = df.assign(ID=range(1, len(df) + 1))
# Put 'ID' first and keep every other column in its existing order
df = df[['ID', *df.columns.drop('ID')]]

# Show the result
print(df)

    ID                    team             city             state   latitude  \
0    1       Atlanta United FC          Atlanta           Georgia  33.755556   
1    2               Austin FC           Austin             Texas  30.387700   
2    3             CF Montréal         Montreal            Quebec  45.563056   
3    4            Charlotte FC        Charlotte    North Carolina  35.225833   
4    5         Chicago Fire FC          Chicago          Illinois  41.862300   
5    6         Colorado Rapids    Commerce City          Colorado  39.805556   
6    7           Columbus Crew         Columbus              Ohio  39.968461   
7    8             D.C. United       Washington              D.C.  38.868411   
8    9           FC Cincinnati       Cincinnati              Ohio  39.111389   
9   10               FC Dallas           Frisco             Texas  33.154444   
10  11       Houston Dynamo FC          Houston             Texas  29.752200   
11  12          Inter Miami CF  Fort Lau

## Replacing Headers

In [4]:
# Swap the raw lower-case headers for descriptive, consistently cased names.
# Keyword form: each argument name is the old header, its value the new one.
df = df.rename(columns=dict(
    team='Official_Team_Name',
    city='Team_City',
    state='Team_State',
    latitude='Latitude',
    longitude='Longitude',
    stadium='Stadium_Name',
    stadium_capacity='Stadium_Capacity',
    joined='Year_Joined',
    head_coach='Head_Coach',
    url='Website_URL',
    wikipedia_url='Wikipedia_URL',
    logo_url='Logo_URL',
))

# Show the renamed frame
print(df)

    ID      Official_Team_Name        Team_City        Team_State   Latitude  \
0    1       Atlanta United FC          Atlanta           Georgia  33.755556   
1    2               Austin FC           Austin             Texas  30.387700   
2    3             CF Montréal         Montreal            Quebec  45.563056   
3    4            Charlotte FC        Charlotte    North Carolina  35.225833   
4    5         Chicago Fire FC          Chicago          Illinois  41.862300   
5    6         Colorado Rapids    Commerce City          Colorado  39.805556   
6    7           Columbus Crew         Columbus              Ohio  39.968461   
7    8             D.C. United       Washington              D.C.  38.868411   
8    9           FC Cincinnati       Cincinnati              Ohio  39.111389   
9   10               FC Dallas           Frisco             Texas  33.154444   
10  11       Houston Dynamo FC          Houston             Texas  29.752200   
11  12          Inter Miami CF  Fort Lau

## Identifying and Handling Missing Values

In [5]:
# Replace missing values column by column:
#   - text columns get the placeholder "Unknown"
#   - numeric columns get 0 (0.0 for the float coordinates)
df = df.fillna({
    **dict.fromkeys(
        [
            'Official_Team_Name',
            'Team_City',
            'Team_State',
            'Stadium_Name',
            'Head_Coach',
            'Website_URL',
            'Wikipedia_URL',
            'Logo_URL',
        ],
        'Unknown',
    ),
    'Latitude': 0.0,
    'Longitude': 0.0,
    'Stadium_Capacity': 0,
    'Year_Joined': 0,
})

# Show the filled frame
print(df)

    ID      Official_Team_Name        Team_City        Team_State   Latitude  \
0    1       Atlanta United FC          Atlanta           Georgia  33.755556   
1    2               Austin FC           Austin             Texas  30.387700   
2    3             CF Montréal         Montreal            Quebec  45.563056   
3    4            Charlotte FC        Charlotte    North Carolina  35.225833   
4    5         Chicago Fire FC          Chicago          Illinois  41.862300   
5    6         Colorado Rapids    Commerce City          Colorado  39.805556   
6    7           Columbus Crew         Columbus              Ohio  39.968461   
7    8             D.C. United       Washington              D.C.  38.868411   
8    9           FC Cincinnati       Cincinnati              Ohio  39.111389   
9   10               FC Dallas           Frisco             Texas  33.154444   
10  11       Houston Dynamo FC          Houston             Texas  29.752200   
11  12          Inter Miami CF  Fort Lau

## Identifying and Handling Duplicates

In [6]:
# Drop repeated teams, judged only by 'Official_Team_Name' (first occurrence wins).
# Rows whose name is the 'Unknown' placeholder are exempt: they stand for
# different teams with a missing name, so they must not be collapsed into one row.
is_repeat = df.duplicated(subset='Official_Team_Name', keep='first')
is_placeholder = df['Official_Team_Name'].eq('Unknown')
# .copy() gives later cells an independent frame to assign columns on
df = df.loc[~(is_repeat & ~is_placeholder)].copy()

# Display results
print(df)

    ID      Official_Team_Name        Team_City        Team_State   Latitude  \
0    1       Atlanta United FC          Atlanta           Georgia  33.755556   
1    2               Austin FC           Austin             Texas  30.387700   
2    3             CF Montréal         Montreal            Quebec  45.563056   
3    4            Charlotte FC        Charlotte    North Carolina  35.225833   
4    5         Chicago Fire FC          Chicago          Illinois  41.862300   
5    6         Colorado Rapids    Commerce City          Colorado  39.805556   
6    7           Columbus Crew         Columbus              Ohio  39.968461   
7    8             D.C. United       Washington              D.C.  38.868411   
8    9           FC Cincinnati       Cincinnati              Ohio  39.111389   
9   10               FC Dallas           Frisco             Texas  33.154444   
10  11       Houston Dynamo FC          Houston             Texas  29.752200   
11  12          Inter Miami CF  Fort Lau

## Formatting Dates

In [7]:
# Normalise 'Year_Joined' to whole-number years.
# Unparseable entries are coerced to NaN and then to 0, so they fall into the
# same "invalid" bucket as any other non-positive value.
years = pd.to_numeric(df['Year_Joined'], errors='coerce').fillna(0).astype(int)

# Mark invalid (non-positive) years as missing. The nullable 'Int64' dtype keeps
# the valid years as integers; putting None into a plain int column would upcast
# it, and years would then be printed and saved as e.g. 1996.0.
df['Year_Joined'] = years.where(years > 0).astype('Int64')

# Display results
print(df)

    ID      Official_Team_Name        Team_City        Team_State   Latitude  \
0    1       Atlanta United FC          Atlanta           Georgia  33.755556   
1    2               Austin FC           Austin             Texas  30.387700   
2    3             CF Montréal         Montreal            Quebec  45.563056   
3    4            Charlotte FC        Charlotte    North Carolina  35.225833   
4    5         Chicago Fire FC          Chicago          Illinois  41.862300   
5    6         Colorado Rapids    Commerce City          Colorado  39.805556   
6    7           Columbus Crew         Columbus              Ohio  39.968461   
7    8             D.C. United       Washington              D.C.  38.868411   
8    9           FC Cincinnati       Cincinnati              Ohio  39.111389   
9   10               FC Dallas           Frisco             Texas  33.154444   
10  11       Houston Dynamo FC          Houston             Texas  29.752200   
11  12          Inter Miami CF  Fort Lau

## Combine Latitude and Longitude Columns

In [8]:
# Build one "lat, lon" text value per row from the two numeric columns
coordinates = df['Latitude'].astype(str).str.cat(df['Longitude'].astype(str), sep=', ')

# The separate columns are no longer needed once combined
df = df.drop(columns=['Latitude', 'Longitude'])

# Rearrange for readability: place the combined column at position 4
# (expected to sit right after 'Team_State' given the column order so far)
df.insert(4, 'Stadium_Coordinates', coordinates)

# Show the result
print(df)

    ID      Official_Team_Name        Team_City        Team_State  \
0    1       Atlanta United FC          Atlanta           Georgia   
1    2               Austin FC           Austin             Texas   
2    3             CF Montréal         Montreal            Quebec   
3    4            Charlotte FC        Charlotte    North Carolina   
4    5         Chicago Fire FC          Chicago          Illinois   
5    6         Colorado Rapids    Commerce City          Colorado   
6    7           Columbus Crew         Columbus              Ohio   
7    8             D.C. United       Washington              D.C.   
8    9           FC Cincinnati       Cincinnati              Ohio   
9   10               FC Dallas           Frisco             Texas   
10  11       Houston Dynamo FC          Houston             Texas   
11  12          Inter Miami CF  Fort Lauderdale           Florida   
12  13               LA Galaxy           Carson        California   
13  14          Los Angeles FC    

## Save Updated DataFrame to New File

In [9]:
# Destination for the cleaned and transformed data
output_file = r'C:\Users\thefli0\Downloads\mls_cleaned.csv'

# Write the DataFrame without the pandas row index
df.to_csv(path_or_buf=output_file, index=False)

# Confirm where the file was written
print(f"New cleaned and transformed DataFrame saved as {output_file}")

New cleaned and transformed DataFrame saved as C:\Users\thefli0\Downloads\mls_cleaned.csv
