In [None]:
import pandas as pd
import numpy as np
import json
import requests
from io import StringIO

In [None]:
# Load the U.S. Chronic Disease Indicators dataset straight from the CDC portal.
url = "https://data.cdc.gov/api/views/hksd-2xuw/rows.csv?accessType=DOWNLOAD"

# `df` is the raw, untouched frame; every cleaning step below derives from it.
df = pd.read_csv(url)

print(df.head())


# Initial data overview
print("Initial shape of the dataset:", df.shape)

# Step 1: drop columns in which every value is missing.
df_cleaned = df.dropna(axis=1, how='all')
print("Shape after dropping completely empty columns:", df_cleaned.shape)

# Columns considered critical for this analysis
critical_columns = ['YearStart', 'YearEnd', 'LocationAbbr', 'StratificationCategory1', 'Stratification1', 'DataValue']

# Step 2: drop rows missing any critical value.
# This must start from `df_cleaned`, not `df`; starting from `df` would silently
# undo step 1 and bring the all-empty columns back into the "cleaned" frame.
df_cleaned = df_cleaned.dropna(subset=critical_columns)

# Provide a simple statistical summary for numerical columns
print("Statistical summary of numerical columns:")
print(df_cleaned.describe())

# Optional: Save the cleaned dataset to a new CSV file
cleaned_file_path = r"/content/U.S._Chronic_Disease_Indicators.csv"
df_cleaned.to_csv(cleaned_file_path, index=False)
print(f"Cleaned dataset saved to: {cleaned_file_path}")


   YearStart  YearEnd LocationAbbr LocationDesc DataSource          Topic  \
0       2019     2019           GA      Georgia      BRFSS     Disability   
1       2019     2019           GU         Guam      BRFSS      Arthritis   
2       2019     2019           GU         Guam      BRFSS   Immunization   
3       2019     2019           ME        Maine      BRFSS       Diabetes   
4       2019     2019           NV       Nevada       NVSS  Health Status   

                             Question  Response DataValueUnit  \
0          Adults with any disability       NaN             %   
1              Arthritis among adults       NaN             %   
2  Influenza vaccination among adults       NaN             %   
3               Diabetes among adults       NaN             %   
4            Life expectancy at birth       NaN         Years   

      DataValueType  ...  TopicID  QuestionID ResponseID DataValueTypeID  \
0  Crude Prevalence  ...      DIS       DIS01        NaN         CRDPR

In [None]:
!pip install azure-storage-blob

Collecting azure-storage-blob
  Downloading azure_storage_blob-12.19.1-py3-none-any.whl (394 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m394.5/394.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-core<2.0.0,>=1.28.0 (from azure-storage-blob)
  Downloading azure_core-1.30.1-py3-none-any.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.4/193.4 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting isodate>=0.6.1 (from azure-storage-blob)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, azure-core, azure-storage-blob
Successfully installed azure-core-1.30.1 azure-storage-blob-12.19.1 isodate-0.6.1


In [None]:
# Import the required class
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

In [None]:
# Path to the JSON configuration file that holds the storage connection string
config_file_path = 'config.json'

# Load the JSON configuration file
with open(config_file_path, 'r') as config_file:
    config = json.load(config_file)

# The connection string comes from the config; container and blob names are fixed here.
CONNECTION_STRING_AZURE_STORAGE = config["connectionString"]
CONTAINER_AZURE = 'cis4400hw1'
blob_name = "cis4400hw1_20240407.csv"

# Serialize the frame to CSV text in memory.
# The original referenced `df_raw`, which is never defined in this notebook and
# raised NameError on a fresh kernel; the frame loaded from the CDC URL is `df`.
# NOTE(review): confirm the raw frame (not `df_cleaned`) is what should be uploaded.
with StringIO() as output:
    df.to_csv(output, index=False)
    data = output.getvalue()

# Upload the data to Azure Blob Storage
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING_AZURE_STORAGE)

# Get a blob client
blob_client = blob_service_client.get_blob_client(container=CONTAINER_AZURE, blob=blob_name)

# Upload the CSV data, replacing any existing blob of the same name
blob_client.upload_blob(data, overwrite=True)

print(f"Uploaded {blob_name} to Azure Blob Storage in container {CONTAINER_AZURE}.")

Uploaded cis4400hw1_20240407.csv to Azure Blob Storage in container cis4400hw1.


In [None]:
import os

import pyodbc  # used below; it was never imported anywhere in the notebook

# Azure SQL Server connection details.
# All secrets are read from environment variables so nothing sensitive is saved
# in the notebook. A missing variable raises KeyError immediately.
# NOTE(review): `server` previously held a hardcoded storage-account connection
# string (including its AccountKey). That value is not a SQL Server host name,
# and the key must be treated as leaked and rotated.
server = os.environ["AZURE_SQL_SERVER"]  # SQL Server host name
database = 'cis4400hw1'
username = os.environ["AZURE_SQL_USERNAME"]
password = os.environ["AZURE_SQL_PASSWORD"]
driver= '{ODBC Driver 17 for SQL Server}'

# T-SQL has no `CREATE TABLE IF NOT EXISTS`; guard with OBJECT_ID instead.
# The `... other columns ...` placeholder was removed from the statement because
# it is not valid SQL. Add the remaining fact columns here.
create_fact_table_sql = """
IF OBJECT_ID(N'DiseaseIndicatorsFact', N'U') IS NULL
CREATE TABLE DiseaseIndicatorsFact (
    IndicatorID INT PRIMARY KEY,
    YearStart INT,
    YearEnd INT,
    DataValue FLOAT
);
"""

# Connect to your database
conn = pyodbc.connect(
    f'DRIVER={driver};SERVER={server};PORT=1433;DATABASE={database};UID={username};PWD={password}'
)
try:
    cursor = conn.cursor()

    # Create the fact table; repeat for each dimension table as needed.
    cursor.execute(create_fact_table_sql)

    # Commit the changes
    conn.commit()
finally:
    # Always release the connection, even if a statement above failed.
    conn.close()


In [14]:
from sqlalchemy import create_engine, Column, Integer, String, Float, ForeignKey
# declarative_base is imported from sqlalchemy.orm (available since SQLAlchemy 1.4);
# the sqlalchemy.ext.declarative location is deprecated and emits a warning.
from sqlalchemy.orm import declarative_base, relationship, sessionmaker

# Declarative base class: every model below subclasses it and is registered
# on Base.metadata.
Base = declarative_base()

# Define the dimension tables
class LocationDimension(Base):
    """Location dimension, mapped to the ``location_dimension`` table.

    Holds a location abbreviation, its description, and a geolocation string,
    keyed by the integer ``LocationID``.
    """
    __tablename__ = 'location_dimension'

    # Integer primary key referenced by the fact table's LocationID.
    LocationID = Column(Integer, primary_key=True)
    LocationAbbr = Column(String(10))   # up to 10 characters
    LocationDesc = Column(String(100))  # up to 100 characters
    Geolocation = Column(String(255))   # stored as text, up to 255 characters

class TimeDimension(Base):
    """Time dimension, mapped to the ``time_dimension`` table.

    Stores a start year, end year, quarter, and month as integers, keyed by
    the integer ``TimeID``.
    """
    __tablename__ = 'time_dimension'

    # Integer primary key referenced by the fact table's TimeID.
    TimeID = Column(Integer, primary_key=True)
    YearStart = Column(Integer)
    YearEnd = Column(Integer)
    Quarter = Column(Integer)
    Month = Column(Integer)

class TopicDimension(Base):
    """Topic dimension, mapped to the ``topic_dimension`` table.

    Stores a topic, a question, and a data-value type as strings, keyed by the
    integer ``TopicID``.
    """
    __tablename__ = 'topic_dimension'

    # Integer primary key referenced by the fact table's TopicID.
    # NOTE(review): the source CSV's own TopicID column appears to hold string
    # codes (e.g. "DIS"), so this is presumably a surrogate key; confirm before loading.
    TopicID = Column(Integer, primary_key=True)
    Topic = Column(String(50))
    Question = Column(String(255))
    DataValueType = Column(String(50))

# Define the fact table
class DiseaseIndicatorsFact(Base):
    """Fact table, mapped to ``disease_indicators_fact``.

    Each row carries a data value with low/high confidence limits and foreign
    keys into the location, time, and topic dimension tables.
    """
    __tablename__ = 'disease_indicators_fact'

    IndicatorID = Column(Integer, primary_key=True)
    # Foreign keys into the three dimension tables.
    LocationID = Column(Integer, ForeignKey('location_dimension.LocationID'))
    TimeID = Column(Integer, ForeignKey('time_dimension.TimeID'))
    TopicID = Column(Integer, ForeignKey('topic_dimension.TopicID'))
    # Measures.
    DataValue = Column(Float)
    LowConfidenceLimit = Column(Float)
    HighConfidenceLimit = Column(Float)

    # ORM relationships to the dimension classes, so the related dimension row
    # is reachable as an attribute of a fact instance.
    location = relationship("LocationDimension")
    time = relationship("TimeDimension")
    topic = relationship("TopicDimension")

# Create a SQLite engine. The URL uses a relative path, so the database file
# chronic_disease_indicators.db is resolved against the current working directory.
engine = create_engine('sqlite:///chronic_disease_indicators.db')

# Create the tables for every model registered on Base.metadata
Base.metadata.create_all(engine)

# Session factory bound to the engine above
Session = sessionmaker(bind=engine)

# Open a session for use by later cells.
# NOTE(review): nothing in this cell closes it; call session.close() when done.
session = Session()


  Base = declarative_base()
