In [1]:
import pandas as pd
import numpy as np
import os

# Set up path for Google Drive folder

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# force_remount=True asks for a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [3]:
# Set the working directory to the project's input-data folder so that later cells
# can refer to files by bare name.
# Prerequisite: add a shortcut to the folder shared by Galina to your own Drive.
os.chdir('/content/drive/MyDrive/#WaterSoftHack25 - Water quality project/Project Codes/input datafiles/')

# Load data

In [4]:
# Load the instantaneous turbidity file, discard the site number and turbidity
# code columns, and index the remaining data by parsed timestamp.
df_turb = (
    pd.read_csv('11447650_raw_iv_turb.csv')
    .drop(columns=['site_no', 'turbidity_cd'])
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .set_index('datetime')
)

# Sanity check: the index should be a DatetimeIndex, which resampling requires
print(type(df_turb.index))

<class 'pandas.core.indexes.datetimes.DatetimeIndex'>


In [5]:
# Import daily data file
df_all = pd.read_csv('11447650_cleaned_daily_data_plus_precipitation.csv')
# Show the column names to confirm what the daily file contains
df_all.columns

Index(['DATE', 'SSC_mg_L', 'SSD_st_d', 'Discharge_cfs', 'Precip_in'], dtype='object')

In [6]:
# Parse 'DATE' as datetimes and promote it to the index in one expression
df_all = (
    df_all
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE']))
    .set_index('DATE')
)

# Resample turbidity to match other variables

In [7]:
# Collapse the instantaneous turbidity readings to one mean value per calendar day.
# NOTE(review): the index is tz-aware (the cell output shows +00:00), so days are cut
# at UTC midnight — confirm this matches the day definition used by the daily file.
turb_daily = df_turb.resample('D')['turbidity'].mean()

# Show both ends of the daily series
display(turb_daily.head(), turb_daily.tail())

Unnamed: 0_level_0,turbidity
datetime,Unnamed: 1_level_1
2009-12-03 00:00:00+00:00,4.225
2009-12-04 00:00:00+00:00,4.405208
2009-12-05 00:00:00+00:00,4.281053
2009-12-06 00:00:00+00:00,4.666667
2009-12-07 00:00:00+00:00,4.757317


Unnamed: 0_level_0,turbidity
datetime,Unnamed: 1_level_1
2025-01-02 00:00:00+00:00,127.125
2025-01-03 00:00:00+00:00,90.01875
2025-01-04 00:00:00+00:00,64.60625
2025-01-05 00:00:00+00:00,51.472917
2025-01-06 00:00:00+00:00,48.813636


# Combine two dataframes

In [35]:
# Strip any timezone from both indexes so the two tables can be aligned on plain dates
turb_daily.index = pd.to_datetime(turb_daily.index).tz_localize(None)
df_all.index = pd.to_datetime(df_all.index).tz_localize(None)

In [36]:
# Place the daily turbidity series next to the daily table, aligned on the date index
combined_df = pd.concat(
    objs=[df_all, turb_daily],
    axis='columns',
)

In [37]:
# Rename the turbidity column to 'Turb_fnu'
combined_df = combined_df.rename(columns={'turbidity': 'Turb_fnu'})

# Move the date index into a 'DATE' column.
# Guarded so that re-running this cell is a no-op: an unguarded reset_index() on the
# already-reset frame would insert the integer row index as a new 'index' column, and
# renaming that to 'DATE' would leave two 'DATE' columns.
# rename_axis names the index explicitly, so the new column is 'DATE' regardless of
# what name (if any) the index carried.
if 'DATE' not in combined_df.columns:
    combined_df = combined_df.rename_axis('DATE').reset_index()

In [38]:
# Preview the first rows of the combined table
combined_df.head()

Unnamed: 0,DATE,SSC_mg_L,SSD_st_d,Discharge_cfs,Precip_in,Turb_fnu
0,1956-10-15,37.0,1250.0,12500.0,0.0,
1,1956-10-16,38.142857,1300.0,12300.0,0.0,
2,1956-10-17,39.285714,1450.0,12100.0,0.0,
3,1956-10-18,40.428571,1300.0,12200.0,0.0,
4,1956-10-19,41.571429,1300.0,12000.0,0.0,


In [39]:
# Save the full combined table (all dates) to the current working directory;
# index=False leaves the integer row index out of the file
combined_df.to_csv('11447650_cleaned_daily_data_plus_precip_turb.csv', index=False)

# Trim data for the period that has all variables

In [42]:
# Start of the window: earliest date that has a turbidity value.
# End of the window: latest date that has an SSC_mg_L value.
first_date = combined_df.dropna(subset=['Turb_fnu'])['DATE'].min()
last_date = combined_df.dropna(subset=['SSC_mg_L'])['DATE'].max()

print(f"Turbidity data starts on: {first_date}")
print(f"All other data ends on: {last_date}")

Turbidity data starts on: 2009-12-03 00:00:00
All other data ends on: 2023-09-29 00:00:00


In [43]:
# Keep only the rows whose DATE lies within [first_date, last_date], both ends included
df_trimmed = combined_df.loc[combined_df['DATE'].between(first_date, last_date)]

In [46]:
# Show both ends of the trimmed table to check the date window
display(df_trimmed.head(), df_trimmed.tail())

Unnamed: 0,DATE,SSC_mg_L,SSD_st_d,Discharge_cfs,Precip_in,Turb_fnu
19407,2009-12-03,25.0,562.0,8440.0,0.0,4.225
19408,2009-12-04,20.0,478.0,8650.0,0.0,4.405208
19409,2009-12-05,14.0,334.0,8520.0,0.0,4.281053
19410,2009-12-06,11.0,242.0,8250.0,0.13,4.666667
19411,2009-12-07,21.0,507.0,9190.0,0.42,4.757317


Unnamed: 0,DATE,SSC_mg_L,SSD_st_d,Discharge_cfs,Precip_in,Turb_fnu
24451,2023-09-25,16.0,806.0,18200.0,0.0,5.411458
24452,2023-09-26,15.0,698.0,17800.0,0.0,5.282292
24453,2023-09-27,13.0,619.0,17400.0,0.0,5.1
24454,2023-09-28,13.0,582.0,17200.0,0.0,4.8
24455,2023-09-29,12.0,559.0,17000.0,0.0,4.809375


In [47]:
# Save the trimmed table to the current working directory;
# index=False leaves the integer row index out of the file
df_trimmed.to_csv('11447650_cleaned_daily_data_plus_precip_turb_short.csv', index=False)