In [2]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

### Extract CSVs into DataFrames

In [3]:
roadworks_details_file = "Resources\Roadworks.csv"
roadworks_details_df = pd.read_csv(roadworks_details_file)
# roadworks_details_df.head(3)
roadworks_details_df.info()  #check datatype and columns info


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   X                        136 non-null    float64
 1   Y                        136 non-null    float64
 2   OBJECTID                 136 non-null    int64  
 3   Id                       136 non-null    int64  
 4   DateStarted              136 non-null    object 
 5   EstimatedCompletionDate  136 non-null    object 
 6   WorkType                 136 non-null    object 
 7   Description              136 non-null    object 
 8   Suburb                   92 non-null     object 
 9   Road                     136 non-null    object 
 10  TrafficImpact            136 non-null    object 
 11  Region                   136 non-null    object 
 12  EntryDate                136 non-null    object 
dtypes: float64(2), int64(2), object(9)
memory usage: 13.9+ KB


### Transform roadworks_details DataFrame

In [4]:
# Create a filtered dataframe from specific columns
roadworks_cols = ["Id", "DateStarted", "EstimatedCompletionDate","WorkType","Suburb","Road","Region","TrafficImpact"]
roadworks_transformed= roadworks_details_df[roadworks_cols].copy()


# Rename the column headers
roadworks_transformed = roadworks_transformed.rename(columns={"Id":"id","DateStarted": "startdate",
                                                              "EstimatedCompletionDate": "finishdate",
                                                             "WorkType":"worktype","Suburb":"suburb",
                                                             "Road":"road","Region":"region","TrafficImpact":"trafficimpact"})
roadworks_transformed.head()
roadworks_transformed.set_index('id', inplace=True)
roadworks_transformed


#remove NAN values
filtered_df = roadworks_transformed.dropna(how='any')
filtered_df
filtered_df.isnull().values.any()


False

In [5]:
# check for duplicate rows except first occurrence based on all columns

duplicateRowsDF = filtered_df[roadworks_transformed.duplicated()]
print(duplicateRowsDF) #  duplicates no found 

Empty DataFrame
Columns: [startdate, finishdate, worktype, suburb, road, region, trafficimpact]
Index: []


  duplicateRowsDF = filtered_df[roadworks_transformed.duplicated()]


In [6]:
#if duplicates >> clean the data by dropping duplicates and setting the index
# roadworks_transformed.drop_duplicates("Id, inplace=True")
# roadworks_transformed.set_index("Id", inplace=True)

roadworks_final = filtered_df[~filtered_df.index.duplicated(keep='last')]
roadworks_final.reset_index(inplace=True)
# roadworks_final = roadworks_final.rename(columns={"Id": "roadworks_id"})
roadworks_final_df= pd.DataFrame(roadworks_final)
# roadworks_final_df.set_index("Id", inplace=True)
roadworks_final_filtered_df= roadworks_final_df.loc[roadworks_final_df['region'] == 'Metro', :]
roadworks_final_filtered_df.head()
roadworks_final_filtered_df.head()
roadworks_final_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17 entries, 0 to 71
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             17 non-null     int64 
 1   startdate      17 non-null     object
 2   finishdate     17 non-null     object
 3   worktype       17 non-null     object
 4   suburb         17 non-null     object
 5   road           17 non-null     object
 6   region         17 non-null     object
 7   trafficimpact  17 non-null     object
dtypes: int64(1), object(7)
memory usage: 1.2+ KB


### Create database connection

In [10]:
# connection_string = "postgres:postgres@localhost:5432/Roadworks_db"
# engine = create_engine(f'postgresql://{connection_string}')

# Note above generate the below error1- 
# OperationalError: (psycopg2.OperationalError) FATAL:  password authentication failed for user "postgres"

#tried the below instead
engine = create_engine('postgresql+psycopg2://postgres:<Password>@localhost/roadworks_db?port=5432') 



In [11]:
# inspector = inspect(engine)
# inspector.get_table_names()

# columns = inspector.get_columns('roadworks_details')
# for c in columns:
#     print(c['name'], c["type"])

### Load DataFrames into database

In [13]:
roadworks_final_filtered_df.to_sql(name='roadworks_details', con=engine, if_exists='append', index=False)

# Note - 
# error2 below > fixed by seting Id as index
# ProgrammingError: (psycopg2.errors.UndefinedColumn) column "index" of relation "roadworks_details" does not exist
# LINE 1: INSERT INTO roadworks_details (index, "Id", "Started Date", ...

#  error3 below > fixed by drop colunm in PostgreSQLdb and create new column for Roadworks_id and rename df id to 'Roadworks_id'
# ProgrammingError: (psycopg2.errors.UndefinedColumn) column "Id" of relation "roadworks_details" does not exist
# LINE 1: INSERT INTO roadworks_details ("Id", "Started Date", "Comple

#  error4 below > fixed by drop table and remove id from table
# NotNullViolation: null value in column "id" violates not-null constraint
# DETAIL:  Failing row contains (null, 45919, 2019-12-31, 2021-12-31, Maintenance, Perth City , Riverside Dr, Lane closures and speed reductions).

#  error5 below > fixed by remove varchar number from worktype and traffic in postgres table/column description
# StringDataRightTruncation: value too long for type character varying(250)

In [None]:
# 