In [32]:
!pip install dataretrieval



In [33]:
from dataretrieval import nwis
import pandas as pd
import os


In [34]:
from google.colab import drive
# Mount Google Drive under /content/drive so later cells can read/write files there.
# NOTE(review): force_remount=True presumably re-mounts even when Drive is already
# mounted in this runtime — confirm against the Colab documentation.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [35]:
# Set the working directory to the project's input-data folder on Google Drive.
# Prerequisite: add a shortcut to the project folder shared by Galina to your own
# "My Drive" so that the path below resolves.
# All relative file names used in later cells are resolved against this directory.
os.chdir('/content/drive/MyDrive/#WaterSoftHack25 - Water quality project/Project Codes/input datafiles/')

In [36]:
# Load the cleaned daily record and shorten the discharge column name to 'Q'
df = (
    pd.read_csv('11447650_cleaned_daily_data.csv')
      .rename(columns={'discharge_combined': 'Q'})
)

df.head()

Unnamed: 0,DATE,SSC,SSD,Q
0,1956-10-15 00:00:00+00:00,37.0,1250.0,12500.0
1,1956-10-16 00:00:00+00:00,38.142857,1300.0,12300.0
2,1956-10-17 00:00:00+00:00,39.285714,1450.0,12100.0
3,1956-10-18 00:00:00+00:00,40.428571,1300.0,12200.0
4,1956-10-19 00:00:00+00:00,41.571429,1300.0,12000.0


In [37]:
# Get the parameter code description for USGS parameter code "00060".
# As the printed output shows, get_pmcodes returns a 2-tuple: a one-row table describing
# the code (here: "Discharge, cubic feet per second", unit ft3/s) and an NWIS_Metadata
# object holding the query URL. q_info therefore holds the whole tuple, not just the table.
q_info = nwis.get_pmcodes(parameterCd="00060")
print(q_info)

(  parameter_cd     group                           parm_nm epa_equivalence  \
0        00060  Physical  Discharge, cubic feet per second     Not checked   

  result_statistical_basis result_time_basis  result_weight_basis  \
0                     Mean             1 Day                  NaN   

   result_particle_size_basis  result_sample_fraction  \
0                         NaN                     NaN   

   result_temperature_basis CASRN                   SRSName parm_unit  
0                       NaN        Stream flow, mean. daily     ft3/s  , NWIS_Metadata(url=https://help.waterdata.usgs.gov/code/parameter_cd_nm_query?fmt=rdb&parm_nm_cd=%2500060%25))


In [38]:
# Flow-regime cut-offs: 10th and 90th percentiles of the discharge column Q
low_flow_thres = df['Q'].quantile(q=0.10)
high_flow_thres = df['Q'].quantile(q=0.90)

# Report both thresholds (one per line) with two decimals
print(
    f"Low flow threshold (10th percentile): {low_flow_thres:.2f} ft3/s",
    f"High flow threshold (90th percentile): {high_flow_thres:.2f} ft3/s",
    sep="\n",
)

Low flow threshold (10th percentile): 8660.00 ft3/s
High flow threshold (90th percentile): 50100.00 ft3/s


In [39]:
# Label flow category
def classify_flow(flow, low_thres=None, high_thres=None):
  """Assign a flow-regime label to a single discharge value.

  Parameters
  ----------
  flow : float
    Discharge value to classify.
  low_thres : float, optional
    Upper bound (inclusive) of the low-flow class. When omitted, the
    notebook-level ``low_flow_thres`` is used.
  high_thres : float, optional
    Lower bound (inclusive) of the high-flow class. When omitted, the
    notebook-level ``high_flow_thres`` is used.

  Returns
  -------
  str or None
    ``'low_flow'`` if ``flow <= low_thres``, ``'high_flow'`` if
    ``flow >= high_thres``, otherwise ``'normal'``. Missing values
    (NaN / None) return ``None`` instead of being mislabelled as 'normal'.
  """
  # A missing discharge fails every comparison, so without this guard it
  # would fall through to 'normal' (and None would raise a TypeError).
  if pd.isna(flow):
    return None

  # Resolve thresholds at call time so the single-argument form used with
  # Series.apply keeps working against the notebook globals.
  if low_thres is None:
    low_thres = low_flow_thres
  if high_thres is None:
    high_thres = high_flow_thres

  # The low-flow check comes first, so it wins if the two thresholds coincide.
  if flow <= low_thres:
    return 'low_flow'
  if flow >= high_thres:
    return 'high_flow'
  return 'normal'

# Run classify_flow on each value of Q and store the resulting labels
# in a new 'flow_category' column of df.
df['flow_category'] = df['Q'].apply(classify_flow)


In [40]:
# Preview the first five rows to confirm the flow_category column is present
print(df.head(5))

                        DATE        SSC     SSD        Q flow_category
0  1956-10-15 00:00:00+00:00  37.000000  1250.0  12500.0        normal
1  1956-10-16 00:00:00+00:00  38.142857  1300.0  12300.0        normal
2  1956-10-17 00:00:00+00:00  39.285714  1450.0  12100.0        normal
3  1956-10-18 00:00:00+00:00  40.428571  1300.0  12200.0        normal
4  1956-10-19 00:00:00+00:00  41.571429  1300.0  12000.0        normal


In [42]:
# Tally how many rows carry each flow_category label and print the table
category_counts = df['flow_category'].value_counts()
print("\nFlow Category Distribution:", category_counts, sep="\n")


Flow Category Distribution:
flow_category
normal       19548
high_flow     2454
low_flow      2453
Name: count, dtype: int64


In [43]:
# Save the data, now including the flow_category column, to a new CSV.
# The file name is relative, so it is written to the working directory set by os.chdir
# earlier in the notebook. index=False keeps the DataFrame index out of the file.
df.to_csv('11447650_cleaned_daily_labeled_data.csv', index=False)