In [1]:
from datetime import datetime
from io import StringIO
from typing import List

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm


def _generate_dates(start_year: int, end_year: int) -> List[str]:
    """Generate a list of YYYYMMDD strings for the first of each month between start and end year."""
    dates = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            dates.append(f"{year}{month:02d}01")
    return dates


def q_download(site_id: str, start_year: int, end_year: int) -> pd.DataFrame:
    """Return the first non-trivial HTML table served for a station.

    One request is issued per month start between ``start_year`` and
    ``end_year`` (inclusive). The first response whose leading ``<table>``
    parses to more than two rows is returned unchanged.

    Args:
        site_id: Station identifier sent as the ``ID`` query parameter.
        start_year: First year to query.
        end_year: Last year to query; every request ends on 31 December of
            this year.

    Returns:
        The parsed table as a DataFrame, or an empty frame with ``Date`` and
        ``Q`` columns when no request yields a table with more than two rows.
    """
    base_url = "http://www1.river.go.jp/cgi-bin/DspWaterData.exe"
    end_date = f"{end_year}1231"

    # NOTE(review): every request runs from its month start to the same end
    # date, so the requested windows overlap heavily — confirm whether
    # per-month windows were intended.
    for date in tqdm(_generate_dates(start_year, end_year)):
        params = {"KIND": 6, "ID": site_id, "BGNDATE": date, "ENDDATE": end_date}
        try:
            response = requests.get(base_url, params=params, timeout=10)
            response.encoding = "euc-jp"
            response.raise_for_status()
        except requests.RequestException:
            # Best effort: skip months whose request fails and keep scanning.
            continue

        table = BeautifulSoup(response.text, "html.parser").find("table")
        if table is None:
            continue

        try:
            df = pd.read_html(StringIO(str(table)))[0]
        except ValueError:
            # read_html raises ValueError when it finds no parseable table.
            # Programming errors (e.g. a NameError) are deliberately NOT
            # caught here, so they surface instead of emptying the result.
            continue

        if len(df) > 2:
            return df

    # TODO: per-month parsing and aggregation into Date/Q columns is not
    # implemented yet; until then an empty frame signals "nothing found".
    return pd.DataFrame(columns=["Date", "Q"])


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Scan 1950-2025 for one station; the saved run below was interrupted by hand.
table = q_download('103031283303060', 1950, 2025)

  2%|▏         | 16/912 [00:22<20:39,  1.38s/it]
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7d5c103d5910>>
Traceback (most recent call last):
  File "/nas/cee-water/cjgleason/ted/global_gauges/.venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
KeyboardInterrupt: 

KeyboardInterrupt



In [None]:
from io import StringIO

# NOTE(review): `table` is whatever an earlier cell left behind — the value
# returned by q_download or the tag from soup.find("table"), depending on
# which cell ran last. Confirm the intended input before relying on `df`.
df = pd.read_html(StringIO(str(table)))[0]


df

In [None]:
# Row count of the parsed table.
len(df)

In [11]:
import pandas as pd

# Request the KIND=6 page for one station over 1950-01-01 .. 1950-02-01.
site_id = '303031283303010'
start_date = pd.Timestamp(1950,1,1).strftime("%Y%m%d")
end_date = pd.Timestamp(1950,2,1).strftime("%Y%m%d")

base_url = "http://www1.river.go.jp/cgi-bin/DspWaterData.exe"
params = {
    "KIND": 6,
    "ID": site_id,
    "BGNDATE": start_date,
    "ENDDATE": end_date
}

# Without a timeout requests waits indefinitely on an unresponsive server.
r = requests.get(base_url, params=params, timeout=10)
# The page is served as EUC-JP; set it explicitly so r.text decodes correctly.
r.encoding = "euc-jp"

In [4]:
# URL actually requested, including the encoded query string.
# NOTE(review): the saved output below was produced by an earlier execution
# (In [4]) and shows a different station ID and end date than the request
# cell above (In [11]); re-run to refresh it.
r.url

'http://www1.river.go.jp/cgi-bin/DspWaterData.exe?KIND=6&ID=103031283303060&BGNDATE=19500101&ENDDATE=20250101'

In [14]:
# Parse the decoded page and take the first <table> in the document.
# In the saved output this is the station header (station code, station name,
# river system, river name), not the observation values.
soup = BeautifulSoup(r.text, "html.parser")
table = soup.find("table")
table

<table border="1" cellpadding="2" cellspacing="1">
<tr>
<td align="CENTER" bgcolor="#33FFFF" nowrap=""><b><font size="-1">観測所記号</font></b></td>
<td align="CENTER" bgcolor="#33FFFF" nowrap=""><b><font size="-1">観測所名</font></b></td>
<td align="CENTER" bgcolor="#33FFFF" nowrap=""><b><font size="-1">水系名</font></b></td>
<td align="CENTER" bgcolor="#33FFFF" nowrap=""><b><font size="-1">河川名</font></b></td>
</tr>
<tr>
<td align="CENTER" bgcolor="#FFFFF0" nowrap="" width="20%"><b><font size="-1">303031283303010</font></b></td>
<td align="CENTER" bgcolor="#FFFFF0" nowrap="" width="30%"><b><font size="-1">木原（きはら）</font></b></td>
<td align="CENTER" bgcolor="#FFFFF0" nowrap="" width="25%"><b><font size="-1">利根川</font></b></td>
<td align="CENTER" bgcolor="#FFFFF0" nowrap="" width="25%"><b><font size="-1">霞ヶ浦</font></b></td>
</tr>
</table>

In [None]:
# Inspect the type of the object currently bound to `table`.
type(table)

In [None]:
# Display the current value of `table`.
table

In [17]:
# NOTE(review): BeautifulSoup treats unknown attribute access as a child-tag
# lookup, so this presumably evaluates to soup.find("keys") rather than a
# dict-style keys method — consistent with the cell showing no output.
soup.keys

In [18]:
# List the target of every anchor on the page, one per line.
for anchor in soup.find_all('a'):
    href = anchor.get('href')
    print(href)

/dat/dload/download/26303031283303010195001011367161.dat
/cgi-bin/DspWaterGraph.exe?KIND=6&ID=303031283303010&BGNDATE=19500101&ENDDATE=19500201&PID=1367161


In [None]:
from urllib.request import urlopen
from urllib.parse import urlencode

In [19]:
import datetime
import pandas as pd


class JWIS:
    """Build query URLs for the river.go.jp water-information pages.

    The constructor arguments are stored unchanged as attributes of the same
    name; ``view_url`` is derived from ``obs_type``.
    """

    def __init__(self, obs_type, obs_id, date_begin, date_end, kawabou):
        """Store the query settings and pick the page endpoint.

        Args:
            obs_type: 1 for the water-data page (flow rate & height), 2 for
                the dam-data page.
            obs_id: Station identifier sent as the ``ID`` parameter.
            date_begin: First day of the period (must support ``strftime``
                and subtraction, e.g. ``datetime.date``).
            date_end: Last day of the period, same type as ``date_begin``.
            kawabou: Value sent as the ``KAWABOU`` parameter.

        Raises:
            ValueError: If ``obs_type`` is neither 1 nor 2. Previously this
                left ``view_url`` unset and failed later with AttributeError.
        """
        self.obs_type = obs_type
        self.obs_id = obs_id
        self.date_begin = date_begin
        self.date_end = date_end
        self.kawabou = kawabou
        if obs_type == 1:
            self.view_url = "http://www1.river.go.jp/cgi-bin/DspWaterData.exe"
        elif obs_type == 2:
            self.view_url = "http://www1.river.go.jp/cgi-bin/DspDamData.exe"
        else:
            raise ValueError(f"obs_type must be 1 or 2, got {obs_type!r}")

    def kind_name(self, kind):
        """Map a KIND code to a column label: 1 -> 'H', 5 -> 'Q', else 'X'.

        The code may be given as a string or an int.
        """
        return {"1": "H", "5": "Q"}.get(str(kind), "X")

    def _data_layout(self, kind):
        """Return ``(columns, n_comma)`` describing one row of the data file.

        ``n_comma`` is the number of commas a data line is expected to hold
        (one fewer than the number of columns). Kept for the download step,
        which is not wired into ``retrieve_data`` yet.
        """
        columns = ["Date", "Time"]
        if self.obs_type == 1:  # flow rate & height
            label = self.kind_name(kind)
            columns.extend([label, "Flag_" + label])
            return columns, 3
        if self.obs_type == 2:  # dam
            columns.extend([
                "Ave. Precip. (mm/h)", "Flag_P", "Water storage (10^3 m3)",
                "Flag_WS", "Input (m3/s)", "Flag_I", "Output (m3/s)", "Flag_O",
                "Water storage (%)", "Flag_WSP"
            ])
            return columns, 11
        raise ValueError(f"obs_type must be 1 or 2, got {self.obs_type!r}")

    def retrieve_data(self, kind):
        """Return the page URL and query parameters for the first date window.

        The window starts at ``date_begin`` and spans at most 30 days, clipped
        to ``date_end``. Nothing is downloaded here; the caller issues the
        request.

        Args:
            kind: Value sent as the ``KIND`` parameter.

        Returns:
            ``(view_url, params)`` where ``params`` is a new dict with the
            keys ``KIND``, ``ID``, ``KAWABOU``, ``BGNDATE`` and ``ENDDATE``
            (dates formatted as ``YYYYMMDD``).

        Raises:
            ValueError: If ``date_begin`` is after ``date_end`` (previously
                this fell through and returned ``None``).
        """
        if self.date_begin > self.date_end:
            raise ValueError("date_begin must not be after date_end")

        span = min(datetime.timedelta(days=30), self.date_end - self.date_begin)
        window_end = self.date_begin + span

        # TODO: iterate over successive windows, resolve the .dat link with
        # JWISParser and parse the rows using _data_layout(kind). When doing
        # so, collect frames and pd.concat them — DataFrame.append was removed
        # in pandas 2.0.
        return self.view_url, {
            "KIND": kind,
            "ID": self.obs_id,
            "KAWABOU": self.kawabou,
            "BGNDATE": self.date_begin.strftime("%Y%m%d"),
            "ENDDATE": window_end.strftime("%Y%m%d"),
        }

    def retrieve_hq_data(self):
        """Return the ``retrieve_data`` results for KIND '1' and KIND '5'.

        Returns:
            ``(h_data, q_data)``, each a ``(view_url, params)`` pair.
        """
        h_data = self.retrieve_data('1')
        q_data = self.retrieve_data('5')
        # TODO: once rows are parsed, merge both on ["Date", "Time"].
        return h_data, q_data


In [30]:
from datetime import date
def ask_date():
    """Prompt on stdin for a beginning date and a final date.

    Each date is read as three integer answers (year, month, day) after a
    heading line is printed.

    Returns:
        A ``(d_start, d_end)`` tuple of ``datetime.date`` objects.
    """
    def read_date(heading):
        # Announce which date is wanted, then read its parts in order.
        print(heading)
        prompts = ("    year? ", "    month? ", "    date? ")
        parts = [int(input(prompt)) for prompt in prompts]
        return date(*parts)

    first = read_date("Beginning date")
    last = read_date("Final date")
    return (first, last)

# Interactive: reads both dates from stdin, so this cell waits for input.
d_start, d_end = ask_date()

Beginning date


    year?  2010
    month?  1
    date?  1


Final date


    year?  2010
    month?  2
    date?  1


In [38]:
# retrieve_hq_data returns one (url, params) pair for KIND '1' and one for
# KIND '5'; unpack both so each request can be issued by hand below.
j = JWIS(1, '306041286608100', d_start, d_end, "NO")
(h_url, h_params),(q_url, q_params) = j.retrieve_hq_data()

In [39]:
import requests
# Fetch the KIND '5' page; the timeout keeps an unresponsive server from
# hanging the kernel.
r = requests.get(q_url, params=q_params, timeout=10)
# The page declares charset=EUC-JP; set it so r.text decodes correctly.
r.encoding = "euc-jp"
r.text

'<HTML>\n<HEAD>\n<META http-equiv="Content-Type" content="text/html; charset=EUC-JP">\n<META http-equiv="Content-Style-Type" content="text/css">\n<TITLE>任意期間時刻流量一覧表</TITLE>\n</HEAD>\n<BODY bgcolor="#ffffff">\n<CENTER>\n<TABLE BORDER="1" CELLPADDING="2" CELLSPACING="1">\n  \n    <TR>\n      <TD NOWRAP BGCOLOR="#33FFFF" ALIGN="CENTER"><B><FONT SIZE=-1>観測所記号</FONT></B></TD>\n      <TD NOWRAP BGCOLOR="#33FFFF" ALIGN="CENTER"><B><FONT SIZE=-1>観測所名</FONT></B></TD>\n      <TD NOWRAP BGCOLOR="#33FFFF" ALIGN="CENTER"><B><FONT SIZE=-1>水系名</FONT></B></TD>\n      <TD NOWRAP BGCOLOR="#33FFFF" ALIGN="CENTER"><B><FONT SIZE=-1>河川名</FONT></B></TD>\n    </TR>\n    <TR>\n      <TD WIDTH="20%"  NOWRAP BGCOLOR="#FFFFF0" ALIGN="CENTER"><B><FONT SIZE=-1>306041286608100</FONT></B></TD>\n      <TD WIDTH="30%"  NOWRAP BGCOLOR="#FFFFF0" ALIGN="CENTER"><B><FONT SIZE=-1>戸ノ内（とのうち）</FONT></B></TD>\n      <TD WIDTH="25%"  NOWRAP BGCOLOR="#FFFFF0" ALIGN="CENTER"><B><FONT SIZE=-1>淀川</FONT></B></TD>\n      <TD WIDTH="25

In [40]:
# URL actually requested, including the encoded query string.
r.url

'http://www1.river.go.jp/cgi-bin/DspWaterData.exe?KIND=5&ID=306041286608100&KAWABOU=NO&BGNDATE=20100101&ENDDATE=20100131'

In [33]:
# Extract the data-file link from the page fetched above.
# NOTE(review): JWISParser is defined in a later cell (In [25]); on a fresh
# kernel this cell raises NameError unless that cell is run first.
parser = JWISParser()
parser.feed(r.text)
parser.close()
parser.data_url

'http://www1.river.go.jp/dat/dload/download/25103031283303060201001011367396.dat'

In [None]:
# Number of commas a data row must contain to be kept by the download loop.
n_comma = 3

In [None]:
data_list = []
# Open the data file in a `with` block so the HTTP response is closed even
# though the loop stops early.
with urlopen(parser.data_url) as data_file:
    for line in data_file:
        # The .dat file is decoded as Shift_JIS (the HTML page uses EUC-JP).
        line = line.decode("Shift_JIS")
        print(line)
        # Keep only rows with the expected field count that are not '#' comments.
        if line.count(',') == n_comma and not line.startswith('#'):
            data_list.append(line.rstrip("\r\n").split(','))
            # Exploration only: stop after the first data row.
            break

# data = data.append(pd.DataFrame(data_list, columns=columns))
# d = d1 + datetime.timedelta(days=1)

In [None]:
# Rows collected by the download loop, each a list of comma-split fields.
data_list

In [25]:
from html.parser import HTMLParser

class JWISParser(HTMLParser):
    """HTML parser that records the data-file link of a result page.

    After a page has been fed, ``data_url`` holds the absolute URL built from
    the last ``<a>`` whose ``href`` starts with ``/dat/dload/download``. It
    stays ``None`` until such a link is seen and is not cleared between feeds.
    """

    def __init__(self):
        super().__init__()
        # Prefix used to turn the site-relative download path into a full URL.
        self.host = "http://www1.river.go.jp"
        self.data_url = None

    def handle_starttag(self, tag, attrs):
        """Remember the target of a download anchor; ignore every other tag."""
        if tag != 'a':
            return
        # HTMLParser reports a valueless attribute (``<a href>``) as None;
        # fall back to "" so startswith() is always called on a string.
        href = dict(attrs).get("href") or ""
        if href.startswith("/dat/dload/download"):
            self.data_url = self.host + href

# Module-level parser instance; its data_url persists until overwritten.
parser = JWISParser()

In [None]:
import requests

import pandas as pd

from urllib.parse import urlencode

site_id = '6243235'
start = pd.Timestamp(2010,1,1)
end = pd.Timestamp(2011,1,1)

view_url = "http://www1.river.go.jp/cgi-bin/DspWaterData.exe"
# NOTE(review): the other cells in this notebook send numeric KIND codes
# (5 and 6) — confirm that the endpoint accepts "Q" here.
params = {
    "KIND": "Q",  # Flow rate
    "ID": site_id,
    "KAWABOU": "NO",
    "BGNDATE": start.strftime("%Y%m%d"),
    "ENDDATE": end.strftime("%Y%m%d")
}

# Build the page URL. urlencode percent-escapes keys and values, which a
# plain "&".join of "k=v" pairs does not.
param_str = urlencode(params)
view_uri = f"{view_url}?{param_str}"

In [None]:
# Show the manually assembled URL.
view_uri

In [None]:
# Let requests encode the query; the timeout keeps an unresponsive server
# from hanging the kernel.
r = requests.get(view_url, params=params, timeout=10)
r.encoding = "euc-jp"

In [None]:
# Preview the first 1000 characters of the decoded page.
r.text[:1000]

In [None]:
# Feed the page to the existing `parser`; its handle_starttag sets
# parser.data_url when it meets a download link.
# NOTE(review): nothing here clears parser.data_url, so a value from an
# earlier feed survives if this page contains no download link.
html_content = r.text
parser.feed(html_content)

In [None]:
# Absolute URL of the download link recorded by the parser (None if it has
# never seen one).
parser.data_url