### Pearson correlation matrix over the entire record for stations with similar names
### (reservoir stations excluded)

In [None]:
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from itertools import combinations

# --- Load station IDs from metadata ---
# Every column is read as a string so the IDs are inserted into the request
# URL exactly as they appear in the CSV.
metadata_file = "Things_with_similar_names_no_reservoir.csv"
df_meta = pd.read_csv(metadata_file, dtype=str)
station_ids = df_meta["dvrt_station_id"].dropna().unique()

# --- Download discharge data for each station ---
# data_dict maps station ID -> single-column DataFrame (column named after the
# station) indexed by date. Stations whose download fails are reported and
# skipped so that one bad response does not abort the whole run.
data_dict = {}
end_date = datetime.today().strftime("%Y-%m-%d")
request_timeout = 60  # seconds; prevents a stalled connection from hanging forever

for station_id in station_ids:
    url = f"https://www.waterrights.utah.gov/dvrtdb/daily-chart.asp?station_id={station_id}&end_date={end_date}&f=json"
    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as exc:
        print(f"Skipping station {station_id}: request failed ({exc})")
        continue

    if response.status_code != 200:
        print(f"Skipping station {station_id}: HTTP status {response.status_code}")
        continue

    try:
        payload = response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not valid JSON.
        print(f"Skipping station {station_id}: response is not valid JSON")
        continue

    if not isinstance(payload, dict) or "data" not in payload:
        print(f"Skipping station {station_id}: no 'data' field in response")
        continue

    df = pd.DataFrame(payload["data"], columns=["date", "value"])
    df["date"] = pd.to_datetime(df["date"])
    # Non-numeric readings become NaN instead of raising.
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df = df.set_index("date").rename(columns={"value": station_id})
    # Defensive: a repeated timestamp would make the later column-wise
    # pd.concat fail, so keep only the first row for each date.
    df = df[~df.index.duplicated(keep="first")]
    data_dict[station_id] = df

# --- Combine all time series into a single DataFrame ---
# Fail with an explicit message when nothing was downloaded; pd.concat would
# otherwise raise a less helpful "No objects to concatenate".
if not data_dict:
    raise ValueError(
        "No station data was downloaded; cannot build the correlation matrix."
    )

# One column per station, aligned on the union of all dates. The index is
# sorted explicitly because the linear interpolation below works on row
# position and therefore needs chronological order.
combined_df = pd.concat(data_dict.values(), axis=1).sort_index()

# --- Optional: Fill missing values ---
# NOTE(review): limit_direction="both" also fills the gaps before the first
# and after the last valid observation of each station, not only interior
# gaps. Confirm this is intended, since it affects the correlations.
combined_df = combined_df.interpolate(limit_direction="both")

# --- Compute correlation matrix ---
# DataFrame.corr() defaults to the Pearson coefficient.
correlation_matrix = combined_df.corr()

# --- Filter: Find pairs with correlation ≥ 0.80 ---
# Flatten the square matrix into (Station1, Station2, Correlation) rows, drop
# the self-pairs on the diagonal, and keep the strongly correlated rows.
# The matrix is symmetric, so each pair appears twice: (A, B) and (B, A).
similar_pairs = correlation_matrix.stack().reset_index()
similar_pairs.columns = ['Station1', 'Station2', 'Correlation']
similar_pairs = similar_pairs[similar_pairs['Station1'] != similar_pairs['Station2']]
similar_pairs = similar_pairs[similar_pairs['Correlation'] >= 0.80]

# --- Visualize as heatmap ---
# Draw the full station-by-station correlation matrix on an explicit axes
# object, using a diverging palette and thin cell borders.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    cmap="coolwarm",
    annot=False,
    linewidths=0.5,
)
ax.set_title("Station Similarity (Pearson Correlation)")
plt.show()

### Pearson correlation matrix over the entire record for stations with similar names
### (reservoir stations excluded); only correlations of at least 0.8 are shown in the matrix

In [None]:
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from itertools import combinations

# --- Load station IDs from metadata ---
# Every column is read as a string so the IDs are inserted into the request
# URL exactly as they appear in the CSV.
metadata_file = "Things_with_similar_names_no_reservoir.csv"
df_meta = pd.read_csv(metadata_file, dtype=str)
station_ids = df_meta["dvrt_station_id"].dropna().unique()

# --- Download discharge data for each station ---
# data_dict maps station ID -> single-column DataFrame (column named after the
# station) indexed by date. Stations whose download fails are reported and
# skipped so that one bad response does not abort the whole run.
data_dict = {}
end_date = datetime.today().strftime("%Y-%m-%d")
request_timeout = 60  # seconds; prevents a stalled connection from hanging forever

for station_id in station_ids:
    url = f"https://www.waterrights.utah.gov/dvrtdb/daily-chart.asp?station_id={station_id}&end_date={end_date}&f=json"
    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as exc:
        print(f"Skipping station {station_id}: request failed ({exc})")
        continue

    if response.status_code != 200:
        print(f"Skipping station {station_id}: HTTP status {response.status_code}")
        continue

    try:
        payload = response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not valid JSON.
        print(f"Skipping station {station_id}: response is not valid JSON")
        continue

    if not isinstance(payload, dict) or "data" not in payload:
        print(f"Skipping station {station_id}: no 'data' field in response")
        continue

    df = pd.DataFrame(payload["data"], columns=["date", "value"])
    df["date"] = pd.to_datetime(df["date"])
    # Non-numeric readings become NaN instead of raising.
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df = df.set_index("date").rename(columns={"value": station_id})
    # Defensive: a repeated timestamp would make the later column-wise
    # pd.concat fail, so keep only the first row for each date.
    df = df[~df.index.duplicated(keep="first")]
    data_dict[station_id] = df

# --- Combine all time series into a single DataFrame ---
# Fail with an explicit message when nothing was downloaded; pd.concat would
# otherwise raise a less helpful "No objects to concatenate".
if not data_dict:
    raise ValueError(
        "No station data was downloaded; cannot build the correlation matrix."
    )

# One column per station, aligned on the union of all dates. The index is
# sorted explicitly because the linear interpolation below works on row
# position and therefore needs chronological order.
combined_df = pd.concat(data_dict.values(), axis=1).sort_index()

# --- Optional: Fill missing values ---
# NOTE(review): limit_direction="both" also fills the gaps before the first
# and after the last valid observation of each station, not only interior
# gaps. Confirm this is intended, since it affects the correlations.
combined_df = combined_df.interpolate(limit_direction="both")

# --- Compute correlation matrix ---
# DataFrame.corr() defaults to the Pearson coefficient.
correlation_matrix = combined_df.corr()

# --- Filter: Find pairs with correlation ≥ 0.80 ---
# Flatten the square matrix into (Station1, Station2, Correlation) rows, drop
# the self-pairs on the diagonal, and keep the strongly correlated rows.
# The matrix is symmetric, so each pair appears twice: (A, B) and (B, A).
similar_pairs = correlation_matrix.stack().reset_index()
similar_pairs.columns = ['Station1', 'Station2', 'Correlation']
similar_pairs = similar_pairs[similar_pairs['Station1'] != similar_pairs['Station2']]
similar_pairs = similar_pairs[similar_pairs['Correlation'] >= 0.80]

# --- Mask correlations below 0.8 ---
# `mask` is True wherever a coefficient is under the threshold; those cells
# are replaced by NaN in `masked_corr` so they render blank.
mask = correlation_matrix < 0.8
masked_corr = correlation_matrix.mask(mask)

# --- Plot heatmap with only values ≥ 0.8 shown in red gradient ---
heatmap_options = {
    "cmap": "Reds",                 # red gradient across the colour range
    "vmin": 0.8,                    # colour scale lower bound
    "vmax": 1.0,                    # colour scale upper bound
    "mask": mask,                   # hide the lower correlations
    "linewidths": 0.5,
    "square": True,
    "annot": False,
    "cbar_kws": {"label": "Pearson Correlation (≥ 0.8)"},
    "xticklabels": True,            # label every station on both axes
    "yticklabels": True,
}
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(masked_corr, ax=ax, **heatmap_options)

# Station labels are long, so rotate the x labels and shrink both sets.
plt.xticks(rotation=90, fontsize=8)
plt.yticks(fontsize=8)
ax.set_title(
    "Pearson Correlation Matrix (Only Values ≥ 0.8 in Gradient)",
    fontsize=14,
)
plt.tight_layout()
plt.show()