In [None]:
import pandas as pd
from datetime import datetime
import os

In [None]:
# Read the source table into memory.
# NOTE: `file_path` holds a placeholder description, not a real path; replace
# it with the location of the CSV described in the string before running.
file_path = 'path to the dataset after converting the tab deliminated txt file to csv from articnet website'
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Define a function to process each code's data
def process_code_data(code_value, data):
    """Build a gap-free daily time series for one ``Code`` value.

    The rows of ``data`` whose ``Code`` equals ``code_value`` are reshaped from
    the wide layout (one row per ``Year``/``Day``, one column per month
    ``Jan`` .. ``Dec``) into one row per calendar date.

    Parameters
    ----------
    code_value
        Value compared against the ``Code`` column to select rows.
    data : pandas.DataFrame
        Table with ``Code``, ``Year``, ``Day`` and the twelve month columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Datetime`` and ``Streamflow(m3/s)``, with one row per
        day from the earliest to the latest valid date. Days without a row in
        the input, and input values that are NaN, are set to ``-9999``.
        When no selected row forms a valid date (including when nothing
        matches ``code_value``) an empty frame with the same two columns is
        returned instead of raising.
    """
    month_numbers = {
        'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
        'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
        'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
    }

    def build_date(year, month_name, day):
        # The wide layout pairs every Day value with every month, so
        # impossible combinations (e.g. 30 Feb) are expected; they, and
        # missing year/day values, are reported as None.
        try:
            return datetime(int(year), month_numbers[month_name], int(day))
        except (ValueError, TypeError):
            return None

    # melt() returns a brand-new frame, so adding a column below does not
    # write into a slice of `data` (no SettingWithCopyWarning).
    long_form = data[data['Code'] == code_value].melt(
        id_vars=['Year', 'Day'],
        value_vars=list(month_numbers),
        var_name='Month',
        value_name='Value',
    )

    # Validate and build each date in a single pass; None becomes NaT.
    long_form['Datetime'] = pd.to_datetime([
        build_date(year, month_name, day)
        for year, month_name, day in zip(
            long_form['Year'], long_form['Month'], long_form['Day']
        )
    ])

    observations = (
        long_form.dropna(subset=['Datetime'])
        .sort_values(by='Datetime')[['Datetime', 'Value']]
    )

    # Without at least one valid date there is no range to span;
    # pd.date_range would raise on NaT bounds.
    if observations.empty:
        return pd.DataFrame({
            'Datetime': pd.Series(dtype='datetime64[ns]'),
            'Streamflow(m3/s)': pd.Series(dtype='float64'),
        })

    # Daily calendar covering the whole observed period.
    full_date_range = pd.date_range(
        start=observations['Datetime'].min(),
        end=observations['Datetime'].max(),
    )

    # Reindexing inserts the missing days as NaN; -9999 marks every gap.
    full_timeseries = (
        observations.set_index('Datetime')
        .reindex(full_date_range)
        .fillna(-9999)
        .reset_index()
    )
    full_timeseries.columns = ['Datetime', 'Streamflow(m3/s)']
    return full_timeseries

# Extract the distinct codes to export. NaN is dropped because it can never
# satisfy the `data['Code'] == code` filter inside process_code_data, so it
# would select no rows at all.
unique_codes = data['Code'].dropna().unique()

# Directory that receives one CSV per code; created up front so to_csv does
# not fail when it does not exist yet.
output_dir = '/Users/yubinbaaniya/Library/CloudStorage/Box-Box/Bias Correction/mother Russia/observedprocessed'
os.makedirs(output_dir, exist_ok=True)

# Process and save each code's data
for code in unique_codes:
    code_timeseries = process_code_data(code, data)
    output_file_path = f'{output_dir}/{code}.csv'  # path to save the processed data
    code_timeseries.to_csv(output_file_path, index=False)

print("Data extraction and saving completed.")


In [None]:


# Folder whose CSV files are rewritten in place with normalised dates.
directory = '/Users/yubinbaaniya/Library/CloudStorage/Box-Box/Bias Correction/mother Russia/Russia_Hydroserver_Ready/Flow'

# Visit every entry of the folder, skipping anything that is not a CSV.
for filename in os.listdir(directory):
    if not filename.endswith(".csv"):
        continue

    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    # Parse the first column as datetimes, then write it back as
    # YYYY-MM-DD text.
    parsed_first_column = pd.to_datetime(df.iloc[:, 0])
    df.iloc[:, 0] = parsed_first_column.dt.strftime('%Y-%m-%d')

    # Overwrite the source file with the reformatted table.
    df.to_csv(file_path, index=False)
    print(f"Processed {filename}")

print("All files processed successfully.")
