In [1]:
import hydrofunctions as hf, pandas as pd

site = "03443000"

# First, let's get the data and inspect its structure
nwis_data = hf.NWIS(site, 'dv', "1970-01-01", "2025-06-01", parameterCd='00065')
df = nwis_data.df()

# Let's inspect the DataFrame structure
print("DataFrame shape:", df.shape)
print("Column names:", df.columns.tolist())
print("Column levels:", df.columns.nlevels)
print("First few rows:")
print(df.head())

# Now let's handle the column structure properly
if df.columns.nlevels > 1:
    print("Multi-level columns dropped")
    # If multi-level columns exist, drop the appropriate levels
    stage = df.droplevel([0,1], axis=1).rename(columns={'00065':'stage_ft'})
else:
    print("Single-level columns kept as is")
    # Keep original column names
    stage = df

print("\nProcessed DataFrame:")
print(stage.head())
# stage.to_csv("03443000_stage_daily.csv")



Requested data from https://waterservices.usgs.gov/nwis/dv/?format=json%2C1.1&sites=03443000&parameterCd=00065&startDT=1970-01-01&endDT=2025-06-01
DataFrame shape: (10837, 6)
Column names: ['USGS:03443000:00065:00001', 'USGS:03443000:00065:00001_qualifiers', 'USGS:03443000:00065:00002', 'USGS:03443000:00065:00002_qualifiers', 'USGS:03443000:00065:00003', 'USGS:03443000:00065:00003_qualifiers']
Column levels: 1
First few rows:
                           USGS:03443000:00065:00001  \
datetimeUTC                                            
1995-10-01 00:00:00+00:00                       6.48   
1995-10-02 00:00:00+00:00                       6.43   
1995-10-03 00:00:00+00:00                       6.78   
1995-10-04 00:00:00+00:00                      16.22   
1995-10-05 00:00:00+00:00                      19.04   

                          USGS:03443000:00065:00001_qualifiers  \
datetimeUTC                                                      
1995-10-01 00:00:00+00:00                    

In [2]:
# print all the columns in the dataframe
print(df.columns)

# drop the columns: "USGS:03443000:00065:00001', 'USGS:03443000:00065:00001_qualifiers','USGS:03443000:00065:00002', 'USGS:03443000:00065:00002_qualifiers"
df = df.drop(columns=['USGS:03443000:00065:00001', 'USGS:03443000:00065:00001_qualifiers','USGS:03443000:00065:00002', 'USGS:03443000:00065:00002_qualifiers'])

# print number of rows in the dataframe
print(f"Number of rows in the dataframe: {len(df)}")

# check if the column "USGS:03443000:00065:00003" has null value
# Check for null values in the mean column
null_count = df['USGS:03443000:00065:00003'].isnull().sum()
print(f"Number of null values in mean column: {null_count}")

# Detect long gaps *before* you interpolate
is_nan = df['USGS:03443000:00065:00003'].isna()
gap_lengths = is_nan.groupby((~is_nan).cumsum()).sum()
too_long = gap_lengths[gap_lengths > 3]          # >3 consecutive days
print(too_long)


from itertools import groupby
import numpy as np

is_nan = df['USGS:03443000:00065:00003'].isna()
groups = [(k, sum(1 for _ in g)) for k, g in groupby(is_nan) if k]

long_gaps = [length for val, length in groups if length > 3]
print(f"Too long gaps to interpolate: {long_gaps}")

# Satır index'lerini tespit et
gap_groups = is_nan.groupby((~is_nan).cumsum())
rows_to_drop = gap_groups.filter(lambda g: g.sum() > 3).index

# Bu satırları tamamen sil
df_cleaned = df.drop(index=rows_to_drop)
print(f"Removed {len(rows_to_drop)} rows with gaps > 3 days.")

df_cleaned['USGS:03443000:00065:00003'] = (
    df_cleaned['USGS:03443000:00065:00003']
    .interpolate(method='linear')
)


# check if the column "USGS:03443000:00065:00003" has null value
# Check for null values in the mean column
null_count = df_cleaned['USGS:03443000:00065:00003'].isnull().sum()
print(f"Number of null values after cleaning: {null_count}")

# print number of rows in the dataframe
print(f"Number of rows in the dataframe: {len(df_cleaned)}")

print("Any remaining NaN?", df_cleaned.isnull().any().any())  # False olmalı

nan_columns = df_cleaned.isnull().sum()
print(nan_columns[nan_columns > 0])



# print number of rows in the dataframe
print(f"Number of rows in the dataframe: {len(df)}")

# check if the column "USGS:03443000:00065:00003" has null value
# Check for null values in the mean column
null_count = df['USGS:03443000:00065:00003'].isnull().sum()
print(f"Number of null values in mean column: {null_count}")

# Detect long gaps *before* you interpolate
is_nan = df['USGS:03443000:00065:00003'].isna()
gap_lengths = is_nan.groupby((~is_nan).cumsum()).sum()
too_long = gap_lengths[gap_lengths > 3]          # >3 consecutive days
print(too_long)


from itertools import groupby
import numpy as np

is_nan = df['USGS:03443000:00065:00003'].isna()
groups = [(k, sum(1 for _ in g)) for k, g in groupby(is_nan) if k]

long_gaps = [length for val, length in groups if length > 3]
print(f"Too long gaps to interpolate: {long_gaps}")

# Satır index'lerini tespit et
gap_groups = is_nan.groupby((~is_nan).cumsum())
rows_to_drop = gap_groups.filter(lambda g: g.sum() > 3).index

# Bu satırları tamamen sil
df_cleaned = df.drop(index=rows_to_drop)
print(f"Removed {len(rows_to_drop)} rows with gaps > 3 days.")

df_cleaned['USGS:03443000:00065:00003'] = (
    df_cleaned['USGS:03443000:00065:00003']
    .interpolate(method='linear')
)


# check if the column "USGS:03443000:00065:00003" has null value
# Check for null values in the mean column
null_count = df_cleaned['USGS:03443000:00065:00003'].isnull().sum()
print(f"Number of null values after cleaning: {null_count}")

# print number of rows in the dataframe
print(f"Number of rows in the dataframe: {len(df_cleaned)}")

print("Any remaining NaN?", df_cleaned.isnull().any().any())  # False olmalı

nan_columns = df_cleaned.isnull().sum()
print(nan_columns[nan_columns > 0])



Index(['USGS:03443000:00065:00001', 'USGS:03443000:00065:00001_qualifiers',
       'USGS:03443000:00065:00002', 'USGS:03443000:00065:00002_qualifiers',
       'USGS:03443000:00065:00003', 'USGS:03443000:00065:00003_qualifiers'],
      dtype='object')
Number of rows in the dataframe: 10837
Number of null values in mean column: 49
USGS:03443000:00065:00003
1733    5
3525    4
3529    4
3547    5
Name: USGS:03443000:00065:00003, dtype: int64
Too long gaps to interpolate: [5, 4, 4, 5]
Removed 22 rows with gaps > 3 days.
Number of null values after cleaning: 0
Number of rows in the dataframe: 10815
Any remaining NaN? False
Series([], dtype: int64)
Number of rows in the dataframe: 10837
Number of null values in mean column: 49
USGS:03443000:00065:00003
1733    5
3525    4
3529    4
3547    5
Name: USGS:03443000:00065:00003, dtype: int64
Too long gaps to interpolate: [5, 4, 4, 5]
Removed 22 rows with gaps > 3 days.
Number of null values after cleaning: 0
Number of rows in the dataframe: 10815

In [3]:
# save the dataframe to a csv file
# the dataframe should have "datetime" and "stage_ft" columns
# the dataframe should be saved as "dataset.csv"
# stage_ft column should be the "USGS:03443000:00065:00003" column
# Create a new dataframe with datetime and stage_ft columns
new_df = pd.DataFrame({
    'datetime': df.index,
    'stage_ft': df['USGS:03443000:00065:00003']
})

# Convert feet to meters
new_df['stage_m'] = (new_df['stage_ft'] * 0.3048).round(2)

# drop the stage_ft column
new_df = new_df.drop(columns=['stage_ft'])

# Save the new dataframe to a csv file
new_df.to_csv("dataset.csv", index=False)

print("DataFrame saved to dataset.csv")
print(new_df.head())




DataFrame saved to dataset.csv
                                           datetime  stage_m
datetimeUTC                                                 
1995-10-01 00:00:00+00:00 1995-10-01 00:00:00+00:00     1.96
1995-10-02 00:00:00+00:00 1995-10-02 00:00:00+00:00     1.93
1995-10-03 00:00:00+00:00 1995-10-03 00:00:00+00:00     1.91
1995-10-04 00:00:00+00:00 1995-10-04 00:00:00+00:00     3.79
1995-10-05 00:00:00+00:00 1995-10-05 00:00:00+00:00     5.41
