In [1]:
import requests
import pandas as pd
import pyodbc
import os

# Fetch USGS sites including latitude and longitude
def get_usgs_sites(state_code='UT'):
    """Fetch USGS site identifiers and coordinates for one state.

    Queries the USGS NWIS daily-values service for every site (any status)
    in ``state_code`` and returns one row per ``timeSeries`` entry in the
    response. A site that reports several time series therefore appears
    several times; callers that need unique sites must de-duplicate.

    Args:
        state_code: Value sent as the ``stateCd`` query parameter
            (defaults to ``'UT'``).

    Returns:
        A list of ``(site_code, site_name, latitude, longitude)`` tuples.
        Any field missing from the response is replaced by the string
        ``'Unknown'``.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not respond in time.
    """
    url = "https://waterservices.usgs.gov/nwis/dv/"
    params = {
        'format': 'json',
        'stateCd': state_code,
        'siteStatus': 'all',
    }
    # requests applies no timeout by default, so a stalled connection would
    # block this call forever; bound the wait explicitly.
    response = requests.get(url, params=params, timeout=60)
    response.raise_for_status()
    data = response.json()

    # Extract site code, site name, latitude, and longitude
    sites = []
    for series in data.get('value', {}).get('timeSeries', []):
        source_info = series.get('sourceInfo', {})
        # 'siteCode' may be absent *or* present as an empty list; `or`
        # covers both so the [0] lookup below cannot raise IndexError.
        site_codes = source_info.get('siteCode') or [{}]
        location = source_info.get('geoLocation', {}).get('geogLocation', {})
        sites.append((
            site_codes[0].get('value', 'Unknown'),
            source_info.get('siteName', 'Unknown'),
            location.get('latitude', 'Unknown'),
            location.get('longitude', 'Unknown'),
        ))

    return sites

# Fetch SQL data
def get_sql_data():
    """Load the 'USGS Gage' collection stations from SQL Server.

    Connects to the ``dvrtDB`` database on ``wrt-sql-prod`` through the
    "ODBC Driver 17 for SQL Server" driver, joins COLLECTION_STATIONS to
    COLLECTION_SYSTEMS and STATION_MASTER, and keeps only rows whose
    collection system description is ``'USGS Gage'``.

    Returns:
        A pandas DataFrame with one row per matching station record,
        ordered by COLLECTION_SYSTEM.

    Raises:
        pyodbc.Error: If the connection cannot be opened or the query fails.
    """
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to configuration or environment variables.
    server = 'wrt-sql-prod'
    database = 'dvrtDB'
    username = 'wrtsqlq'
    password = 'guest'

    # Corrected SQL query to get the data
    query = """
    SELECT [COLLECTION_SYSTEM]
           ,[collection_sys_description]
           ,[STATION_MASTER].[STATION_ID] As MasterStationID
           ,[STATION_MASTER].[STATION_NAME] As MasterStationName
           ,[COLLECTION_STATIONS].[STATION_NAME] As CollectionStationName
           ,[RETRIES]
           ,[SEQ_NO]
           ,[COMMENTS]
           ,LAT
           ,LON
           ,[STATION_TYPE]
           ,[COMMON_DESC]
           ,[DIVERTING_WORKS]
           ,[MEASURING_DEVICE]
           ,[RECORD_RATING]
           ,[SYSTEM_NAME]
           ,[UNITS_ID]
           ,[OWNER_NAME]   
           ,[CAPTURE_SEQ_NO]
           ,[ANALOG_CHANNEL]
           ,[LOW_FLOW]
           ,[HIGH_FLOW]
           ,[DEVICE_TYPE]
           ,[OWNER_PHONE]
           ,[REALTIME_INCLUDE]
           ,[CORRECTED_DATA]
           ,[STATUS]
           ,[SYSTEM_GROUP]
           ,[SYSTEM_SUBGROUP]
           ,[ADDRESS_ID]
           ,[DataEntryMethod]
           ,[DataLogger]
           ,[DatasetType]
           ,[SeriesVerifiedBy]
           ,[SeriesVerifiedDate]
           ,[SiteState]
           ,[SiteType]
           ,[SiteVerifiedBy]
           ,[SiteVerifiedDate]
           ,[Telemetry]
    FROM [dvrtDB].[dbo].[COLLECTION_STATIONS]
    LEFT JOIN [dvrtDB].[dbo].[COLLECTION_SYSTEMS] 
        ON [COLLECTION_SYSTEMS].[collection_sys_id] = [COLLECTION_STATIONS].[collection_sys_id]
    LEFT JOIN [dvrtDB].[dbo].[STATION_MASTER] 
        ON [STATION_MASTER].[CAPTURE_SEQ_NO] = [COLLECTION_STATIONS].[SEQ_NO]
    WHERE [COLLECTION_SYSTEMS].[collection_sys_description] = 'USGS Gage'
    ORDER BY [COLLECTION_SYSTEM] ASC
    """

    connection_string = (
        f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};"
        f"DATABASE={database};UID={username};PWD={password}"
    )
    # A pyodbc connection used as a context manager only commits on exit;
    # it does not close the connection. Close it explicitly so repeated
    # calls do not leak server connections.
    conn = pyodbc.connect(connection_string)
    try:
        df_sql = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    return df_sql

# Export USGS and SQL data horizontally into a single CSV
def export_to_csv():
    """Write the USGS site list and the SQL station rows side by side to a CSV.

    The USGS rows (de-duplicated on 'Site Code') occupy the left-hand
    columns and the SQL rows the right-hand columns. Each side is sorted
    independently -- USGS by 'Site Code', SQL by 'CollectionStationName',
    both compared as strings -- and rows are paired purely by position,
    not matched on any key. The shorter side is padded with empty cells.

    The file 'New_2.0_sorted_combined_usgs_sql_data.csv' is created in the
    current working directory and its full path is printed.
    """
    # Pull both sources up front (USGS web service first, then SQL Server).
    usgs_rows = get_usgs_sites()
    sql_frame = get_sql_data()

    # Left-hand table: unique sites, string-typed key, sorted, fresh 0..n-1 index.
    usgs_columns = ['Site Code', 'Site Name', 'Latitude', 'Longitude']
    usgs_frame = (
        pd.DataFrame(usgs_rows, columns=usgs_columns)
        .drop_duplicates(subset='Site Code', keep='first')
        .astype({'Site Code': str})
        .sort_values(by='Site Code')
        .reset_index(drop=True)
    )

    # Right-hand table: same treatment, keyed on the collection station name.
    sql_frame = (
        sql_frame
        .astype({'CollectionStationName': str})
        .sort_values(by='CollectionStationName')
        .reset_index(drop=True)
    )

    # Stretch both tables to the longer length so they line up row for row;
    # the extra rows on the shorter side are filled with NaN.
    row_count = max(len(usgs_frame), len(sql_frame))
    padded_index = range(row_count)
    side_by_side = pd.concat(
        [usgs_frame.reindex(padded_index), sql_frame.reindex(padded_index)],
        axis=1,
    )

    # Write next to wherever the script/notebook is being run from.
    destination = os.path.join(os.getcwd(), 'New_2.0_sorted_combined_usgs_sql_data.csv')
    side_by_side.to_csv(destination, index=False)
    print(f"Data exported to {destination}")

# Execute the function: fetch the USGS and SQL data and write the combined CSV
# into the current working directory. This runs as soon as the cell/script is
# executed (it is not guarded by an `if __name__ == "__main__":` check).
export_to_csv()

Data exported to C:\Users\pbenko\Documents\20250213_distribution_data\data\USGS_Utah_Stations\New_2.0_sorted_combined_usgs_sql_data.csv


  df_sql = pd.read_sql_query(query, conn)
