In [18]:
# Importing the required modules and packages
import io
import os
from urllib.parse import quote

import boto3
import numpy as np
import pandas as pd
import requests
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy import inspect
from sqlalchemy import text
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

from config import aws_access_key_id, aws_secret_access_key, bucket_name
from config import password

In [19]:
# Importing the required modules and packages
import boto3
from config import aws_access_key_id, aws_secret_access_key, bucket_name
import pandas as pd
import os

# Connect to S3 using the credentials imported from config
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)

# Read every supported file in the bucket into a DataFrame, keyed by its S3 object key
SUPPORTED_EXTENSIONS = ('.xlsx', '.csv')
dataframe_dict = {}

# A single list call returns at most 1000 keys and has no 'Contents' entry for an
# empty bucket, so page through the listing and tolerate pages without 'Contents'.
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name):
    for obj in page.get('Contents', []):
        file_name = obj['Key']

        # "Folder" placeholder objects (keys ending in '/') hold no data
        if file_name.endswith('/'):
            continue

        # Validate the type before downloading anything
        file_extension = os.path.splitext(file_name)[1]
        if file_extension.lower() not in SUPPORTED_EXTENSIONS:
            raise ValueError(f"Unsupported file type: {file_extension}")

        # The S3 body is a non-seekable stream; buffer it in memory so that pandas
        # (read_excel in particular) can seek within it.
        s3_object = s3.get_object(Bucket=bucket_name, Key=file_name)
        buffer = io.BytesIO(s3_object['Body'].read())

        if file_extension.lower() == '.xlsx':
            # sheet_name=None returns {sheet name: DataFrame}; stack all sheets into one frame
            data_dict = pd.read_excel(buffer, sheet_name=None)
            data = pd.concat(list(data_dict.values()), ignore_index=True)
        else:
            data = pd.read_csv(buffer)

        dataframe_dict[file_name] = data

In [20]:
# Create the SQLAlchemy engine for the local PostgreSQL database `swatdatabase`.
# The password is percent-encoded so that characters such as '@', ':', '/' or '%'
# cannot corrupt the connection URL.
engine = create_engine(f'postgresql://postgres:{quote(password, safe="")}@localhost:5432/swatdatabase')

In [21]:
# Clean every DataFrame loaded from S3 and append it to a PostgreSQL table named
# after the part of its S3 key that precedes the first '.'.


def _sql_column_type(series):
    """Return the SQL type used for a pandas column in the CREATE TABLE statement.

    Integer dtypes map to INTEGER, floating dtypes to FLOAT and everything else
    to TEXT. The pandas type predicates are used instead of comparing against
    the strings "int"/"float", whose meaning depends on the platform and which
    miss nullable and 32-bit dtypes.
    """
    if pd.api.types.is_integer_dtype(series):
        return "INTEGER"
    if pd.api.types.is_float_dtype(series):
        return "FLOAT"
    return "TEXT"


# Quote identifiers with the engine's own rules (the same ones DataFrame.to_sql goes
# through), so CREATE TABLE and the insert below always refer to the same table/columns.
quote_identifier = engine.dialect.identifier_preparer.quote

for file_name, df in dataframe_dict.items():
    # Nothing to load for an empty frame
    if df.empty:
        continue

    table_name = file_name.split('.')[0]

    # CLEANING PART 1
    # Columns whose name contains both 'geo' and 'id' carry repeated identifiers.
    geo_id_cols = [col for col in df.columns if 'geo' in str(col).lower() and 'id' in str(col).lower()]

    # Drop the geo-id column whose first value is the longest (the first one wins ties).
    # NOTE(review): when only one such column exists it is dropped as well; this keeps
    # the original behaviour - confirm that it is intended.
    drop_col = None
    if geo_id_cols:
        drop_col = max(geo_id_cols, key=lambda col: len(str(df[col].iloc[0])))
        df.drop(columns=[drop_col], inplace=True)

    # CLEANING PART 2
    # Normalise the column names: trim, lower-case, fix the 'ovay' typo, rename
    # fips/geo-id variants to 'fips', and collapse '-', whitespace and ':' runs into '_'.
    cleaned_columns = df.columns.astype(str).str.strip().str.lower()
    cleaned_columns = cleaned_columns.str.replace('ovay', 'ovary', regex=True)
    cleaned_columns = cleaned_columns.str.replace("(?i)fips|geo[_ ]?id", "fips", regex=True)
    cleaned_columns = cleaned_columns.str.replace(r"[-\s:]+", "_", regex=True)
    df.columns = cleaned_columns

    # CREATE DATABASE TABLE
    # Determine the SQL type of each column
    dtype_dict = {col: _sql_column_type(df[col]) for col in df.columns}

    # Build the CREATE TABLE statement
    column_definitions = ", ".join(
        f"{quote_identifier(col)} {dtype_dict[col]}" for col in df.columns
    )
    query = f"CREATE TABLE IF NOT EXISTS {quote_identifier(table_name)} ({column_definitions});"

    # engine.begin() commits the DDL on exit, and text() is required for raw SQL
    # strings on SQLAlchemy 1.4+/2.x.
    with engine.begin() as conn:
        conn.execute(text(query))

    # INSERT DATA INTO DATABASE TABLE
    # NOTE(review): if_exists="append" adds the rows again every time this cell is
    # re-run - truncate the table first if the load has to be repeatable.
    try:
        df.to_sql(table_name, engine, if_exists="append", index=False)
    except IntegrityError as e:
        print(f"Error inserting data into table {table_name}: {e}")
    

In [22]:
# Read every table of the PostgreSQL database back into pandas for ML modeling
import pandas as pd
from sqlalchemy import create_engine, Table, MetaData, inspect
# This cell reuses the `engine` created earlier in the notebook. To read from another
# database, build a new engine from a URL of the form
# 'postgresql+psycopg2://<username>:<password>@<host>:<port>/<database_name>'.

# Metadata object that collects the reflected table definitions
metadata = MetaData()
# Names of all tables in the database
inspector = inspect(engine)
table_names = inspector.get_table_names()
# One DataFrame per table, keyed by table name
dfs_SQL_to_python = {}
# A single connection serves all the SELECTs
with engine.connect() as conn:
    for table_name in table_names:
        # Reflect the table definition from the database; autoload_with alone triggers
        # reflection (the separate autoload flag no longer exists in SQLAlchemy 2.0)
        table = Table(table_name, metadata, autoload_with=engine)
        # Fetch all rows and build a DataFrame from them
        result = conn.execute(table.select())
        df = pd.DataFrame(result.fetchall(), columns=list(result.keys()))
        dfs_SQL_to_python[table_name] = df

In [23]:
# Show the names of the tables that were read back from the database
dfs_SQL_to_python.keys()

dict_keys(['closest_plant_and_cancer', 'cancer_per_county', 'closest_plant_in_each_county', 'sample_global_data'])

In [24]:
# Display the `closest_plant_and_cancer` table that was read back from the database
dfs_SQL_to_python["closest_plant_and_cancer"]

Unnamed: 0,latitude,longitude,county,state_abbrev,closest_plant,distance,plant_capacity,fips,state,rate,...,lung_range,melanoma_range,non_hudgkin_lymphoma_range,oral_cavity_range,ovary_range,pancreas_range,prostate_range,stomach_range,thyroid_range,uteras_range
0,32.534920,-86.642749,Autauga County,AL,Joseph M. Farley Nuclear Plant,128.0,1776.4,1001,"Autauga County, Alabama",506.4,...,23.2,17.7,10.6,0.0,12.1,9.9,40.7,0.0,0.0,15.8
1,30.660970,-87.749840,Baldwin County,AL,Joseph M. Farley Nuclear Plant,161.0,1776.4,1003,"Baldwin County, Alabama",455.7,...,10.6,8.6,5.9,4.4,5.3,4.7,13.8,4.6,3.4,6.5
2,31.869603,-85.393197,Barbour County,AL,Joseph M. Farley Nuclear Plant,48.0,1776.4,1005,"Barbour County, Alabama",447.2,...,27.9,23.7,0.0,0.0,0.0,0.0,57.0,0.0,0.0,25.4
3,32.998644,-87.126439,Bibb County,AL,Browns Ferry Nuclear Plant,118.0,3567.5,1007,"Bibb County, Alabama",466.1,...,38.2,0.0,0.0,0.0,0.0,0.0,52.0,0.0,0.0,28.1
4,33.980867,-86.567371,Blount County,AL,Browns Ferry Nuclear Plant,59.0,3567.5,1009,"Blount County, Alabama",438.7,...,19.7,12.9,11.8,0.0,12.1,9.8,28.9,0.0,10.5,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3129,41.659496,-108.879431,Sweetwater County,WY,Palo Verde Nuclear Generating Station,612.0,4209.6,56037,"Sweetwater County, Wyoming",427.8,...,27.2,18.6,17.4,0.0,0.0,0.0,49.1,0.0,0.0,16.2
3130,43.934637,-110.589810,Teton County,WY,Columbia Generating Station,460.0,1200.0,56039,"Teton County, Wyoming",410.8,...,0.0,34.4,0.0,0.0,0.0,0.0,54.5,0.0,0.0,0.0
3131,41.287492,-110.547562,Uinta County,WY,Palo Verde Nuclear Generating Station,560.0,4209.6,56041,"Uinta County, Wyoming",353.7,...,34.3,0.0,0.0,0.0,0.0,0.0,43.5,0.0,0.0,0.0
3132,43.904997,-107.682861,Washakie County,WY,Columbia Generating Station,594.0,1200.0,56043,"Washakie County, Wyoming",351.0,...,0.0,0.0,0.0,0.0,0.0,0.0,65.8,0.0,0.0,0.0


In [None]:
"""
Keep this statement for joining the tables in pgAdmin.

SELECT cpiec.latitude, cpiec.longitude, cpiec.county, cpiec.state_abbrev,
	cpiec.closest_plant, cpiec.distance, cpiec.plant_capacity,
	cpc.*
--INTO closest_plant_and_cancer
FROM closest_plant_in_each_county as cpiec
JOIN cancer_per_county as cpc
ON cpiec.fips = cpc.fips;

"""