In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import os
import folium

# Suppress openpyxl warnings
warnings.simplefilter("ignore")

In [2]:
# Attach Google Drive to the Colab runtime under /content/drive; the project
# data folder used by the following cells lives beneath this mount point.
from google.colab import drive
drive.mount(mountpoint="/content/drive", force_remount=True)

Mounted at /content/drive


In [3]:
# Set the working directory to the shared input-data folder on the mounted Drive.
# Prerequisite: add a shortcut to the project folder shared by Galina to your own
# Drive, since the path below is resolved under MyDrive.
# The cells that read and write CSV files use bare file names, so they depend on
# this directory change having run first.
os.chdir('/content/drive/MyDrive/#WaterSoftHack25 - Water quality project/Project Codes/input datafiles/')

In [4]:
# Import the site search table (one row per USGS station) from the working directory.
# NOTE(review): the preview shows site_no parsed as a float (e.g. 1335770.0). If the
# CSV stores site numbers with leading zeros, they are lost at this point -- consider
# passing dtype={'site_no': str} after confirming what the file actually contains.
site_info = pd.read_csv('USGS_site_search_SSC_turbidity_formatted.csv')

In [5]:
# Preview the first rows to check the column names and the raw (string) date values
site_info.head()

Unnamed: 0,agency_cd,site_no,station_nm,site_tp_cd,dec_lat_va,dec_long_va,coord_acy_cd,dec_coord_datum_cd,qw_begin_date,qw_end_date,qw_count_nu,SS_start_date,SS_end_date,SS_n_years,turb_start_date,turb_end_date,turb_n_years,Comments
0,USGS,11303500.0,SAN JOAQUIN R NR VERNALIS CA,ST,37.676012,-121.266291,S,NAD83,12/2/1950,2/7/2024,10183.0,11/1/1956,9/30/2023,,9/30/2014,8/18/2016,,2 months of turbidity missing
1,USGS,1335770.0,HUDSON RIVER AT WATERFORD NY,ST,42.788689,-73.674007,S,NAD83,1/23/1952,6/10/2014,7391.0,10/1/1976,3/30/2014,,,,,Some data is estimated (rather than observed)
2,USGS,4193500.0,Maumee River at Waterville OH,ST,41.500053,-83.712715,S,NAD83,1/25/1957,2/7/2024,6860.0,4/12/1950,9/29/2003,,11/24/2021,7/31/2025,,Significant data gaps
3,USGS,6358500.0,MISSOURI R NEAR MOBRIDGE SD,ST,45.523605,-100.473748,M,NAD83,6/17/1938,8/3/2016,6633.0,6/1/1937,9/29/1951,,,,,Suspended sediment discharge only
4,USGS,4208000.0,Cuyahoga River at Independence OH,ST,41.395331,-81.629848,S,NAD83,1/22/1957,2/6/2024,6169.0,10/12/1950,9/29/2002,,2/16/2011,5/29/2021,,Moderate data gaps


In [6]:
# Parse the suspended-sediment (SS) and turbidity (turb) record bounds as datetimes.
# errors='coerce' turns blank or unparseable entries into NaT instead of raising.
for date_col in ('SS_start_date', 'SS_end_date', 'turb_start_date', 'turb_end_date'):
    site_info[date_col] = pd.to_datetime(site_info[date_col], errors='coerce')

# Record length in years: whole days between start and end, divided by 365.25 and
# rounded to one decimal place. A missing (NaT) bound gives a missing record length.
for start_col, end_col, years_col in (
    ('SS_start_date', 'SS_end_date', 'SS_n_years'),
    ('turb_start_date', 'turb_end_date', 'turb_n_years'),
):
    elapsed_days = (site_info[end_col] - site_info[start_col]).dt.days
    site_info[years_col] = (elapsed_days / 365.25).round(1)

# Rank sites by SS record length, then by turbidity record length (both descending)
site_info_sorted = site_info.sort_values(by=['SS_n_years', 'turb_n_years'], ascending=False)

# Show the top of the ranked table
display(site_info_sorted.head())

Unnamed: 0,agency_cd,site_no,station_nm,site_tp_cd,dec_lat_va,dec_long_va,coord_acy_cd,dec_coord_datum_cd,qw_begin_date,qw_end_date,qw_count_nu,SS_start_date,SS_end_date,SS_n_years,turb_start_date,turb_end_date,turb_n_years,Comments
5,USGS,11447650.0,SACRAMENTO R A FREEPORT CA,ST,38.455664,-121.501617,5,NAD83,11/7/1958,2/20/2024,5983.0,1956-10-15,2023-09-30,67.0,2013-08-30,2025-08-04,11.9,Only continuous turbidity is available
0,USGS,11303500.0,SAN JOAQUIN R NR VERNALIS CA,ST,37.676012,-121.266291,S,NAD83,12/2/1950,2/7/2024,10183.0,1956-11-01,2023-09-30,66.9,2014-09-30,2016-08-18,1.9,2 months of turbidity missing
6,USGS,1357500.0,MOHAWK RIVER AT COHOES NY,ST,42.785389,-73.707778,1,NAD83,7/26/1951,10/19/2023,5665.0,1954-01-25,2018-09-30,64.7,2011-01-29,2024-04-16,13.2,"Significant gaps in SS (1959-1976,1979-1999), ..."
10,USGS,11179000.0,ALAMEDA C NR NILES CA,ST,37.587119,-121.960752,F,NAD83,2/18/1952,4/30/2022,3747.0,1959-12-13,2024-04-30,64.4,NaT,NaT,,Big gap from 1973-1999
2,USGS,4193500.0,Maumee River at Waterville OH,ST,41.500053,-83.712715,S,NAD83,1/25/1957,2/7/2024,6860.0,1950-04-12,2003-09-29,53.5,2021-11-24,2025-07-31,3.7,Significant data gaps


In [7]:
# Write the sorted table, including the computed record-length columns, to a CSV
# file in the current working directory. index=False leaves the DataFrame index
# out of the output file.
site_info_sorted.to_csv('USGS_site_search_SSC_turbidity_formatted_with_record_length.csv', index=False)