# Project 2 Extract Transform and Load

Process CSV File (Player data, Team data, Match Data)
Process URL Scrapping (Stadium data from wikipedia)
Process API Data (Statdium Coordinates and Weather on the day)

In [1]:
import pandas as pd
import numpy as np
import requests
import pymongo
from datetime import timezone
from sqlalchemy import create_engine

from my_etl_utilities import convert_to_json
from api_keys import weather_api_key

# Extract and Transform CSV into DataFrame

## CSV Data Scrapping

### Team Data (An example of simple transformation)
* Read CSV
* Create a sub-frame
* Rename and transform
* Apply proper datatypes

In [2]:
#Extract
team_file = "D:/UPENN/Project2/Project2-ETL/Resources/TeamData.csv"
team_data_df = pd.read_csv(team_file)

#Transform
team_req_cols = ["team", "players_used", "avg_age","possession","games","minutes","cards_yellow","cards_red","goals_assists_pens_per90"]
team_df = team_data_df[team_req_cols].copy()

#Rename columns and organize data
team_df = team_df.rename(columns={"goals_assists_pens_per90":"performance"})
team_df.sort_values(by=['performance'],ascending=False,inplace=True)

# #Set proper data types
data_types_dict = {'team': str,'players_used': str,'avg_age':float,'possession':int,'minutes':int}
team_df = team_df.astype(data_types_dict)

#Set index
team_df = team_df.set_index("team")
team_df.head()

Unnamed: 0_level_0,players_used,avg_age,possession,games,minutes,cards_yellow,cards_red,performance
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
England,20,27.0,62,5,450,1,0,4.6
Portugal,24,27.8,60,5,450,6,0,4.0
France,24,27.3,52,6,540,5,0,4.0
Netherlands,21,27.6,53,5,480,12,1,3.38
Germany,20,28.1,59,3,270,3,0,3.33


### Player Data (CSV) (An example of string processing on data frame columns )
* Read CSV
* Create a sub-frame
* Rename and transform age by splitting string of a dataframe column
* Filter records by performance number
* Apply proper datatypes

In [3]:
#Extract
players_file = "D:/UPENN/Project2/Project2-ETL/Resources/PlayerStats.csv"
players_data_df = pd.read_csv(players_file)

#Transform
#"goals_assists_pens_per90": "Goals plus Assists minus Penalty Kicks made per 90 minutes. Minimum 30 minutes played per squad game to qualify as a leader"
players_req_cols = ["player", "position", "team","age","goals_assists_pens_per90"]
players_df = players_data_df[players_req_cols].copy()

#Rename columns and organize data
players_df = players_df.rename(columns={"goals_assists_pens_per90":"performance"})
players_df.sort_values(by=['performance'],ascending=False,inplace=True)

#Transform the age column
players_df['age'] = players_df.age.str.split("-").str[0]

#Set proper data types
data_types_dict = {'player': str,'position': str,'team':str,'age':int,'performance':float}
players_df = players_df.astype(data_types_dict)
players_df.dtypes

#Apply criteria to filter data 
players_df = players_df[players_df.performance < 4.0]

#Set index
players_df = players_df.set_index("player")
players_df.head()


Unnamed: 0_level_0,position,team,age,performance
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Niclas Füllkrug,FW,Germany,29,3.91
Paik Seung-ho,MF,Korea Republic,25,3.46
Wout Weghorst,FW,Netherlands,30,2.81
Mislav Oršić,FW,Croatia,29,2.47
Gonçalo Ramos,FW,Portugal,21,2.37


### Match Data (An example of transformation of data type)
* Read CSV
* Create a sub-frame
* Rename and transform - convert from string data type to datetime 
* Apply proper datatypes

In [4]:
#Extract
match_file = "D:/UPENN/Project2/Project2-ETL/Resources/MatchData.csv"
match_data_df = pd.read_csv(match_file)

#Transform
match_req_cols = ["match","dayofweek","match_time","home_team","away_team","score","attendance","venue"]
match_df = match_data_df[match_req_cols].copy()

#Set proper data types
data_types_dict = {'match': str,'dayofweek': str,'match_time':str,'home_team':str,'away_team':str,'score':str,'attendance':int,'venue':str}
match_df = match_df.astype(data_types_dict)

#convert to date and time
match_df['match_time'] = pd.to_datetime(match_df['match_time'])
match_df['utc_time'] = match_df['match_time']
match_df['utc_time'] = match_df['utc_time'].astype(np.int64)
match_df.dtypes

# #Set index
match_df = match_df.set_index("match")
match_df.head()

Unnamed: 0_level_0,dayofweek,match_time,home_team,away_team,score,attendance,venue,utc_time
match,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Sun,2022-11-20 19:00:00,Qatar,Ecuador,0–2,67372,Al Bayt Stadium,1668970800000000000
2,Mon,2022-11-21 16:00:00,England,IR Iran,6–2,45334,Khalifa International Stadium,1669046400000000000
3,Mon,2022-11-21 19:00:00,Senegal,Netherlands,0–2,41721,Al Thumama Stadium,1669057200000000000
4,Mon,2022-11-21 22:00:00,United States,Wales,1–1,43418,Ahmed bin Ali Stadium,1669068000000000000
5,Tue,2022-11-22 13:00:00,Argentina,Saudi Arabia,1–2,88012,Lusail Iconic Stadium,1669122000000000000


## Panda Web Scraping

In [5]:
url = 'https://en.wikipedia.org/wiki/2022_FIFA_World_Cup'
stadium_data = pd.read_html(url)
stadium_v1_df = stadium_data[5]

#Clean up the capacity column
#Step 1: 88,966[82][83] -> 88,966
#Step 2: replace "," with null 
#Step 3: convert the value into numeric
stadium_v1_df['Capacity'] = pd.to_numeric((stadium_v1_df['Capacity'].str.split("[").str[0]).str.replace(",",""))

#Set proper data types
data_types_dict = {'Stadium': str,'City': str,'Capacity':int}
stadium_v1_df = stadium_v1_df.astype(data_types_dict)

# stadium_v1_df.set_index("Stadium",inplace=True)
stadium_v1_df

Unnamed: 0,City,Stadium,Capacity
0,Lusail,Lusail Stadium,88966
1,Al Khor,Al Bayt Stadium,68895
2,Al Rayyan,Khalifa International Stadium,45857
3,Al Rayyan,Ahmad bin Ali Stadium,45032
4,Al Rayyan,Education City Stadium,44667
5,Doha,Al Thumama Stadium,44400
6,Doha,Stadium 974,44089
7,Al Wakrah,Al Janoub Stadium,44325


## API Data Extraction

In [6]:
#Get Stadium Data with GPS info
stadium_file = "D:/UPENN/Project2/Project2-ETL/Resources/Stadium.csv"
stadium_v2_df = pd.read_csv(stadium_file)
stadium_v2_df.drop(columns='City',inplace=True)
stadium_df = pd.merge(stadium_v1_df,stadium_v2_df,how="inner",on="Stadium")
stadium_df

Unnamed: 0,City,Stadium,Capacity,Lat,Lon
0,Lusail,Lusail Stadium,88966,25.422152,51.490266
1,Al Khor,Al Bayt Stadium,68895,25.653061,51.487936
2,Al Rayyan,Khalifa International Stadium,45857,25.264134,51.448493
3,Al Rayyan,Ahmad bin Ali Stadium,45032,25.329797,51.342447
4,Al Rayyan,Education City Stadium,44667,25.311527,51.424203
5,Doha,Al Thumama Stadium,44400,25.235689,51.532532
6,Doha,Stadium 974,44089,25.289761,51.566629
7,Al Wakrah,Al Janoub Stadium,44325,25.160004,51.574365


In [None]:

# url = f'https://history.openweathermap.org/data/3.0/history/timemachine?appid={weather_api_key}'
# print(weather_api_key)
#For each stadium get the weather info the Match data
# for index, stadium in stadium_df.iterrows():
    # match_sub_df = match_df.loc[match_df['venue'] == stadium['Stadium']]
    # for index, match in match_sub_df.iterrows():
    #     timestamp = match['utc_time']
    #     lon = stadium['Lon']
    #     lat = stadium['Lat']
        #try:
        # weather_url = f'{url}&lat={lat}&lon={lon}&dt={timestamp}'
        # print(weather_url)
        # stadium_weather = requests.get(weather_url).json()

# Mongo DB - Persist Stadium data in MongoDB with stadium info and unstructured data

In [None]:
# The default port used by MongoDB is 27017
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define the 'classDB' database in Mongo
db = client.FIFA_2022_DB

### Clean DataFrame

In [8]:
new_customer_location_df = customer_location_df[["id", "address", "us_state"]].copy()
new_customer_location_df.head()

NameError: name 'customer_location_df' is not defined

### Connect to local database

In [None]:
protocol = 'postgresql'
username = 'postgres'
password = 'admin'
host = 'localhost'
port = 5432
database_name = 'customer_db'
rds_connection_string = f'{protocol}://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

In [None]:
engine

Engine(postgresql://postgres:***@localhost:5432/customer_db)

### Check for tables

In [None]:
engine.table_names()

  engine.table_names()


[]

### Use pandas to load csv converted DataFrame into database

In [None]:
new_customer_data_df.to_sql(name='customer_name', con=engine, if_exists='append', index=False)

1000

### Use pandas to load json converted DataFrame into database

In [None]:
new_customer_location_df.to_sql(name='customer_location', con=engine, if_exists='append', index=False)

1000

### Confirm data has been added by querying the customer_name table
* NOTE: can also check using pgAdmin

In [None]:
pd.read_sql_query('select * from customer_name', con=engine).head()

Unnamed: 0,id,first_name,last_name
0,1,Benetta,Cancott
1,2,Lilyan,Cherry
2,3,Ezekiel,Benasik
3,4,Kennedy,Atlay
4,5,Sanford,Salmen


### Confirm data has been added by querying the customer_location table

In [None]:
pd.read_sql_query('select * from customer_location', con=engine).head()

Unnamed: 0,id,address,us_state
0,1,043 Mockingbird Place,Indiana
1,2,4 Prentice Point,Indiana
2,3,46 Derek Junction,Texas
3,4,11966 Old Shore Place,Missouri
4,5,5 Evergreen Circle,New York
