In [1]:
# Import dependencies
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

### Store CSV into DataFrame

In [2]:
# Load the athlete events CSV into a DataFrame and preview its first rows
csv_events = "Resources/athlete_events.csv"
events_df = pd.read_csv(filepath_or_buffer=csv_events)
events_df.head(n=5)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [3]:
# Drop the athlete- and event-level columns that later cells do not use.
# The result is reassigned (instead of inplace=True) and errors='ignore' is
# passed so that re-running this cell after the columns are already gone does
# not raise KeyError.
events_drop_cols = [
    'ID', 'Name', 'Sex', 'Age', 'Height', 'Weight',
    'Team', 'Games', 'Sport', 'Event',
]
events_df = events_df.drop(columns=events_drop_cols, errors='ignore')
events_df.head()

Unnamed: 0,NOC,Year,Season,City,Medal
0,CHN,1992,Summer,Barcelona,
1,CHN,2012,Summer,London,
2,DEN,1920,Summer,Antwerpen,
3,DEN,1900,Summer,Paris,Gold
4,NED,1988,Winter,Calgary,


In [4]:
# Replace missing medals with the string 'None'.
# The filled column is assigned back to the frame: calling
# fillna(..., inplace=True) on the events_df['Medal'] selection is deprecated
# in pandas 2.x and leaves events_df unchanged under Copy-on-Write.
events_df['Medal'] = events_df['Medal'].fillna('None')

# Report the row count, then preview the frame
print(len(events_df))
events_df.head()

271116


Unnamed: 0,NOC,Year,Season,City,Medal
0,CHN,1992,Summer,Barcelona,
1,CHN,2012,Summer,London,
2,DEN,1920,Summer,Antwerpen,
3,DEN,1900,Summer,Paris,Gold
4,NED,1988,Winter,Calgary,


### Create new DataFrames with selected columns

In [12]:
# Columns of events_df to carry over into the transformed frame
events_cols = ["NOC", "Year", "Season", "City", "Medal"]

# Lower-case replacements for the original column headers
events_column_names = {
    "NOC": "noc",
    "Year": "year",
    "Season": "season",
    "City": "city",
    "Medal": "medal",
}

# Build the transformed frame in one chain so this cell can be re-run safely.
# reset_index moves the row index into a regular column named "index".
# The former unassigned `events_transformed[(events_transformed!=0).any(axis=1)]`
# expression was removed: its result was discarded, so it filtered nothing.
events_transformed = (
    events_df[events_cols]
    .rename(columns=events_column_names)
    .reset_index(level=0)
)
events_transformed.head()

Unnamed: 0,index,noc,year,season,city,medal
0,0,CHN,1992,Summer,Barcelona,
1,1,CHN,2012,Summer,London,
2,2,DEN,1920,Summer,Antwerpen,
3,3,DEN,1900,Summer,Paris,Gold
4,4,NED,1988,Winter,Calgary,


In [13]:
# Load the GDP CSV into a DataFrame and preview its first rows
csv_gdp = "Resources/gdp_csv.csv"
gdp_df = pd.read_csv(filepath_or_buffer=csv_gdp)
gdp_df.head(n=5)

Unnamed: 0,Country Name,Country Code,Year,Value
0,Arab World,ARB,1968,25760680000.0
1,Arab World,ARB,1969,28434200000.0
2,Arab World,ARB,1970,31385500000.0
3,Arab World,ARB,1971,36426910000.0
4,Arab World,ARB,1972,43316060000.0


In [14]:
# Drop the country name column; the country code column is kept.
# The result is reassigned (instead of inplace=True) and errors='ignore' is
# passed so that re-running this cell after the column is already gone does
# not raise KeyError.
gdp_drop_cols = ['Country Name']
gdp_df = gdp_df.drop(columns=gdp_drop_cols, errors='ignore')
gdp_df.head()

Unnamed: 0,Country Code,Year,Value
0,ARB,1968,25760680000.0
1,ARB,1969,28434200000.0
2,ARB,1970,31385500000.0
3,ARB,1971,36426910000.0
4,ARB,1972,43316060000.0


In [15]:
# Columns of gdp_df to carry over into the transformed frame
gdp_cols = ["Country Code", "Year", "Value"]

# Select the columns, rename the headers, and move the row index into a
# regular column named "index"
gdp_transformed = (
    gdp_df[gdp_cols]
    .copy()
    .rename(columns={"Country Code": "noc", "Year": "year", "Value": "gdp"})
    .reset_index(level=0)
)

gdp_transformed.head()

Unnamed: 0,index,noc,year,gdp
0,0,ARB,1968,25760680000.0
1,1,ARB,1969,28434200000.0
2,2,ARB,1970,31385500000.0
3,3,ARB,1971,36426910000.0
4,4,ARB,1972,43316060000.0


### Connect to local database

In [16]:
# Read the PostgreSQL connection settings from the environment so credentials
# do not have to be stored in the notebook. The fallback values reproduce the
# previous local-development settings; set the variables to override them.
db_user = os.environ.get("PGUSER", "postgres")
db_password = os.environ.get("PGPASSWORD", "postgres")
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "olympics_db")

# quote_plus escapes characters such as '@' or '/' in the credentials that
# would otherwise break URL parsing
connection_string = (
    f"{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{db_name}"
)
engine = create_engine(f'postgresql://{connection_string}')

### Check for tables

In [17]:
# Ask the database, through an inspector bound to the engine, which tables it
# contains and print their names
insp = inspect(engine)
table_names = insp.get_table_names()
print(table_names)

['events', 'gdp']


### Use pandas to load the CSV-derived DataFrames into the database

In [19]:
# Write the transformed events rows into the 'events' table.
# NOTE(review): if_exists='append' adds the rows to whatever is already in the
# table, so running this cell more than once inserts the data again.
events_transformed.to_sql(
    name='events',
    con=engine,
    if_exists='append',
    index=False,
)

In [21]:
# Write the transformed GDP rows into the 'gdp' table.
# NOTE(review): if_exists='append' adds the rows to whatever is already in the
# table, so running this cell more than once inserts the data again.
gdp_transformed.to_sql(
    name='gdp',
    con=engine,
    if_exists='append',
    index=False,
)