In [None]:
import pyodbc
from sqlalchemy import create_engine
import pandas as pd
import psycopg2
import os
import numpy as np

# Connect to the US_Traffic_Accidents_ETL database on SQL Server. The ODBC
# connection string is assembled from its individual "key=value;" settings.
sql_conn = pyodbc.connect(
    "".join(
        [
            r"DRIVER={ODBC Driver 18 for SQL Server};",
            r"SERVER=DESKTOP-MS4DILS\THUS;",
            r"DATABASE=US_Traffic_Accidents_ETL;",
            r"Trusted_Connection=yes;",
            r"Encrypt=yes;",
            r"TrustServerCertificate=yes;",
        ]
    ),
    autocommit=True,
)

# Cursor object used to run statements against the database.
cursor = sql_conn.cursor()

# Quick connection check: print the connection object.
print(sql_conn)

In [None]:
# Map pandas dtypes to SQL Server types
def map_dtype(dtype, max_val=None, min_val=None):
    """Return the SQL Server column type name for a pandas/NumPy dtype.

    Parameters
    ----------
    dtype : numpy.dtype or pandas extension dtype
        The dtype of the column being mapped.
    max_val : int, optional
        Largest value in the column. Only consulted for integer dtypes.
    min_val : int, optional
        Smallest value in the column. Only consulted for integer dtypes.

    Returns
    -------
    str
        ``'INT'`` or ``'BIGINT'`` for integers, ``'FLOAT'`` for floats,
        ``'BIT'`` for booleans, ``'DATETIME'`` for datetimes and
        ``'NVARCHAR(MAX)'`` for everything else.

    Notes
    -----
    For integer dtypes ``'INT'`` is returned only when every supplied bound
    fits in a signed 32-bit integer. With no bounds supplied the wider
    ``'BIGINT'`` is used.
    """
    # Inclusive range of a signed 32-bit integer (SQL Server INT).
    int_min, int_max = -2_147_483_648, 2_147_483_647

    if pd.api.types.is_integer_dtype(dtype):
        bounds = [bound for bound in (min_val, max_val) if bound is not None]
        if not bounds:
            return 'BIGINT'
        # Check both ends: a very negative value overflows INT just like a
        # very large positive one.
        if all(int_min <= bound <= int_max for bound in bounds):
            return 'INT'
        return 'BIGINT'
    elif pd.api.types.is_float_dtype(dtype):
        return "FLOAT"
    elif pd.api.types.is_bool_dtype(dtype):
        return "BIT"
    elif pd.api.types.is_datetime64_any_dtype(dtype):
        return "DATETIME"
    else:
        return "NVARCHAR(MAX)"

In [None]:
# use this section if grabbing data from sql server into postgresql
# tables = ['DimProduct', 'DimProductSubcategory', 'DimProductCategory', 'DimSalesTerritory', 'FactInternetSales']

# #Loop through wanted tables from SQL Server and load into PostgreSQL extract location
# for table in tables:
#     query = f"SELECT * FROM {table}"
#     df = pd.read_sql(query, sql_conn)
    
#     # Load into PostgreSQL
#     df.to_sql(table.lower(), pg_engine, index=False, if_exists='replace')  # Use lower case for PostgreSQL
#     print(f"Loaded {table} into PostgreSQL")

In [None]:
# Lift pandas' display limits so frames render without truncation:
# no cap on rows, no cap on columns, and no cap on the width of a cell.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

#removes white space function
def remove_whitespace(df):
    """Strip leading and trailing whitespace from the text columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    for column in df.columns:
        values = df[column]
        if pd.api.types.is_object_dtype(values.dtype):
            # Object columns can mix strings with numbers; ``.str.strip()``
            # would replace every non-string entry with NaN, so only actual
            # ``str`` values are stripped and everything else is kept as-is.
            df[column] = values.map(
                lambda value: value.strip() if isinstance(value, str) else value
            )
        elif pd.api.types.is_string_dtype(values.dtype):
            # Dedicated string dtypes are not ``object`` but still hold text.
            df[column] = values.str.strip()
    return df

#turns csv into sql server extract table
def csv_to_extract(df, table_name):
    """(Re)create a SQL Server table from a DataFrame and load its rows.

    Drops ``table_name`` if it exists, creates it with one column per
    DataFrame column (types chosen by ``map_dtype``) and inserts every row
    through the module-level ``sql_conn`` connection.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to load. Missing values (NaN/NaT/None) are inserted as NULL.
    table_name : str
        Name of the target table. It is interpolated into the SQL text
        unchanged, so it must come from trusted code, never from user input.

    Returns
    -------
    None
    """
    def _bracket(name):
        # Bracket-quote an identifier so headers containing spaces,
        # punctuation or reserved words are valid; ']' is escaped by doubling.
        return "[" + str(name).replace("]", "]]") + "]"

    columns_with_types = ', '.join(
        f"{_bracket(col)} {map_dtype(dtype)}" for col, dtype in df.dtypes.items()
    )
    placeholders = ', '.join('?' for _ in df.columns)
    insert_stmt = f"INSERT INTO {table_name} VALUES ({placeholders})"

    # itertuples keeps each column's own dtype (iterrows does not preserve
    # dtypes across a row). NaN/NaT are not SQL NULLs, so they are swapped
    # for None, which is bound as NULL.
    rows = [
        tuple(None if pd.isna(value) else value for value in record)
        for record in df.itertuples(index=False, name=None)
    ]

    cursor = sql_conn.cursor()
    try:
        drop_stmt = f"DROP TABLE IF EXISTS {table_name}"
        cursor.execute(drop_stmt)
        create_stmt = f"IF OBJECT_ID('{table_name}', 'U') IS NULL CREATE TABLE {table_name} ({columns_with_types})"
        cursor.execute(create_stmt)
        sql_conn.commit()
        # Insert data into table; executemany rejects an empty parameter
        # list, so skip it when the frame has no rows.
        if rows:
            cursor.executemany(insert_stmt, rows)
        sql_conn.commit()
    finally:
        # Always release the cursor, even if a statement above failed.
        cursor.close()

Accidents 2020 -> Transformation Table

In [None]:
#accidents 2020
filepath = r'acc_20.csv'
data = pd.read_csv(filepath,encoding='cp1252')
acc_20 = data.copy()

#check nulls
    #Only work zone has nulls but we will be leaving it as-is
#remove whitespace
acc_20 = remove_whitespace(acc_20)
#standardize data (if any)
    #no changes
#remove duplicates (if any)
    #drop_duplicates returns a new frame, so the result has to be assigned back
acc_20 = acc_20.drop_duplicates()
#data type conversions (if any)
    #no changes
#create a new csv file (this is our transformation table)




Accidents: 2016, 2017, 2018, 2019 --> Transformation Table

In [None]:
# #accidents 2016
# filepath = r'acc_16.csv'
# data = pd.read_csv(filepath,encoding='cp1252')
# acc_16 = data.copy()
# #accidents 2017
# filepath = r'acc_17.csv'
# data = pd.read_csv(filepath,encoding='cp1252')
# acc_17= data.copy()
# #accidents 2018
# filepath = r'acc_18.csv'
# data = pd.read_csv(filepath,encoding='cp1252')
# acc_18 = data.copy()
# #accidents 2019
# filepath = r'acc_19.csv'
# data = pd.read_csv(filepath,encoding='cp1252')
# acc_19 = data.copy()
# #drop columns
# columns_to_drop = ['WEATHER2','CF1', 'CF2', 'CF3']
# acc_16 = acc_16.drop(columns=columns_to_drop)
# acc_17 = acc_17.drop(columns=columns_to_drop)
# acc_18 = acc_18.drop(columns=columns_to_drop)
# columns_to_drop = ['WEATHER1', 'WEATHER2', 'CF1', 'CF1NAME', 'CF2', 'CF2NAME', 'CF3', 'CF3NAME']
# acc_19 = acc_19.drop(columns=columns_to_drop)
