In [None]:
from pathlib import Path
import csv
import time

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup
from openpyxl import load_workbook
import requests

In [None]:
def get_tomorrow_values(html, target_date: int):
    """Find the first row dated after ``target_date`` in a price-history page.

    The page must contain a ``div`` with id ``stock_kabuka_table`` that wraps
    a ``table`` with class ``stock_kabuka_dwm``; an ``IndexError`` is raised
    when either lookup finds nothing.

    Parameters
    ----------
    html : str
        Markup of the page to parse.
    target_date : int
        Reference date encoded as ``YYYYMMDD``.

    Returns
    -------
    tuple[int, list[int]] or None
        ``(date, values)`` for the first row, scanning the table body from its
        last row upwards, whose date is strictly greater than ``target_date``.
        ``values`` holds the first four ``<td>`` cells of that row, each parsed
        as a float and truncated to ``int``. ``None`` is returned when no row
        qualifies, so callers must check before unpacking.
    """
    document = BeautifulSoup(html)
    container = document.find_all("div", {"id": "stock_kabuka_table"})[0]
    price_table = container.find_all("table", {"class": "stock_kabuka_dwm"})[0]
    body_rows = price_table.find("tbody").find_all("tr")

    # Scan bottom-up; presumably the page lists the newest day first, which
    # makes this an oldest-to-newest walk (confirm against the live page).
    for tr in reversed(body_rows):
        # The <th> holds a "YY/MM/DD" date; prefix the century to get YYYYMMDD.
        date_parts = tr.find("th").text.split("/")
        row_date = int("20{}{}{}".format(*date_parts))
        if row_date > target_date:
            cells = tr.find_all("td")
            # Thousands separators are removed before the numeric conversion.
            prices = [int(float(cell.text.replace(",", ""))) for cell in cells[:4]]
            return row_date, prices
    return None

In [None]:
# Daily price-history URL templates; the "{}" placeholder receives the 1-based
# page number in get_values(). Each variable is named after the index it is
# meant to fetch (the first two on kabutan.jp, the third on us.kabutan.jp).
nikkei_url = "https://kabutan.jp/stock/kabuka?code=0000&ashi=day&page={}"
topix_url = "https://kabutan.jp/stock/kabuka?code=0010&ashi=day&page={}"
nasdaq_url = "https://us.kabutan.jp/indexes/%5EIXIC/historical_prices/daily?page={}"

def search_stock_table_rows(soup):
    """Collect the data rows of every open/high/low/close table in a page.

    A table qualifies when the ``<th>`` cells of its first ``<tr>`` read
    始値, 高値, 安値, 終値 at positions 1 to 4 (position 0 is not inspected).

    Parameters
    ----------
    soup
        Parsed document; only ``find_all("table")`` is called on it.

    Returns
    -------
    list
        The ``<tr>`` elements following the header row of each matching table,
        concatenated in document order. Empty when no table matches.
    """
    expected_headers = ["始値", "高値", "安値", "終値"]

    res = []
    for table in soup.find_all("table"):
        thead = table.find("tr")
        if thead is None:
            # A table without any row cannot be a price table.
            continue
        headers = [th.text for th in thead.find_all("th")]
        # Slicing never raises, so a header with fewer than five cells is
        # rejected instead of failing with IndexError on headers[4].
        if headers[1:5] == expected_headers:
            res += table.find_all("tr")[1:]
    return res

def get_values(base_url, max_page=1, timeout=10.0):
    """Scrape paginated price tables into a ``{date: values}`` mapping.

    Parameters
    ----------
    base_url : str
        URL template containing one ``{}`` placeholder for the page number.
    max_page : int
        Number of pages to fetch; pages ``1`` to ``max_page`` are requested.
    timeout : float
        Seconds to wait for each HTTP response before requests raises.

    Returns
    -------
    dict[str, list[float]]
        Maps a ``YYYYMMDD`` string to the four numeric cells that follow the
        date cell. The tables accepted by ``search_stock_table_rows`` label
        these 始値, 高値, 安値, 終値. A date seen on more than one page keeps
        the values from the later page.
    """
    results = {}
    for page in range(1, max_page + 1):
        url = base_url.format(page)
        # Without a timeout a stalled server would block the notebook forever.
        res = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(res.text)
        rows = search_stock_table_rows(soup)

        if len(rows) == 0:
            # Reported but not fatal: the remaining pages are still tried.
            print(f"stock table not found : {url}")

        for row in rows:
            # Direct children only (the row's own cells). Stripping keeps stray
            # whitespace in the markup out of the date key.
            cols = [col.text.strip() for col in row.find_all(recursive=False)]
            # "YY/MM/DD" -> "YYYYMMDD"
            date = "20{}{}{}".format(*cols[0].split("/"))
            values = [float(val_str.replace(",", "")) for val_str in cols[1:5]]
            results[date] = values
        # Short pause between requests to go easy on the server.
        time.sleep(0.2)
    return results

# Scrape the index histories: one HTTP request per page, 10 pages each for the
# Nikkei and TOPIX URLs and 11 for the NASDAQ URL.
# NOTE(review): the reason for the extra NASDAQ page is not stated here;
# presumably it keeps the date ranges comparable. Confirm.
nikkei_values = get_values(nikkei_url, 10)
topix_values = get_values(topix_url, 10)
nasdaq_values = get_values(nasdaq_url, 11)

In [None]:
# Daily price history for the NVDA ticker on us.kabutan.jp ("{}" = page number).
base_url = "https://us.kabutan.jp/stocks/NVDA/historical_prices/daily?page={}"
nvidia_values = get_values(base_url, 10)
# Daily price history for kabutan.jp stock code 6254, kept as nomura_values.
nomura_values = get_values("https://kabutan.jp/stock/kabuka?code=6254&ashi=day&page={}", 10)

In [None]:
# Input list of (date, stock code) pairs; the first CSV row is skipped as a header.
csv_path = Path(r"../data/sandbox/202402_downward.csv")
url_template = "https://kabutan.jp/stock/kabuka?code={}"

# newline="" is how the csv module expects its input file to be opened.
with open(csv_path, "r", newline="") as f:
    csv_reader = csv.reader(f)
    data = list(csv_reader)[1:]

# For every (date, code) pair, fetch the stock's price page and record the
# first trading day after `date` as [next_date, code, values].
results = []
for row in data:
    if not row:
        # csv.reader yields an empty list for a blank line (e.g. at file end).
        continue
    date, code = row
    target_url = url_template.format(code)
    # Without a timeout a stalled server would block the notebook forever.
    res = requests.get(target_url, timeout=10)
    next_day = get_tomorrow_values(res.text, int(date))
    if next_day is None:
        # No later row on the page: report and move on instead of failing the
        # whole run with a TypeError while unpacking None.
        print(f"no trading day after {date} found : {target_url}")
    else:
        date, values = next_day
        results.append([date, code, values])
        print(code)
    # Short pause between requests to go easy on the server.
    time.sleep(0.1)


In [None]:
# Count the entries that rose / fell and accumulate their relative changes.
# Entries whose change is exactly zero are counted in neither group.
cnt_up = 0
sum_up = 0
cnt_down = 0
sum_down = 0
for res in results:
    # res is [date, code, values]; the rate compares values[3] with values[0]
    # (assumed to be the close and open prices; confirm against the page).
    first_value, last_value = res[2][0], res[2][3]
    up_rate = (last_value - first_value) / first_value
    if up_rate > 0:
        cnt_up += 1
        sum_up += up_rate
    elif up_rate < 0:
        cnt_down += 1
        sum_down += up_rate

# Guard the means so an empty group prints 0 instead of raising
# ZeroDivisionError (same convention as the per-day report below).
avg_up = sum_up / cnt_up if cnt_up > 0 else 0
avg_down = sum_down / cnt_down if cnt_down > 0 else 0
print("count up = {}, rate = {}, down = {}, rate = {}".format(cnt_up, avg_up, cnt_down, avg_down))

In [None]:
# Share of all scraped entries that rose / fell; entries with no change are
# part of the denominator only, so the two shares need not sum to 1.
cnt_up / len(results), cnt_down / len(results)

In [None]:
# Group the relative changes by day: positive ones go to up_per_day, negative
# ones to down_per_day. Every day seen gets an entry in both dicts, even when
# one of its lists stays empty.
up_per_day = {}
down_per_day = {}

for res in results:
    day = res[0]
    gains = up_per_day.setdefault(day, [])
    losses = down_per_day.setdefault(day, [])

    prices = res[2]
    up_rate = (prices[3] - prices[0]) / prices[0]
    # A change of exactly zero is recorded in neither list.
    if up_rate < 0:
        losses.append(up_rate)
    elif up_rate > 0:
        gains.append(up_rate)


In [None]:
# Print one summary line per day, in ascending date order: how many entries
# rose / fell, their mean change, and the same-day change of the two indices.
for key in sorted(up_per_day):
    gains = up_per_day[key]
    num_up = len(gains)
    mean_up = 0 if num_up == 0 else sum(gains) / num_up

    losses = down_per_day[key]
    num_down = len(losses)
    mean_down = 0 if num_down == 0 else sum(losses) / num_down

    # The index dicts are looked up by the string form of the day key.
    index_key = str(key)
    nikkei = nikkei_values[index_key]
    nikkei_up = (nikkei[3] - nikkei[0]) / nikkei[0]
    topix = topix_values[index_key]
    topix_up = (topix[3] - topix[0]) / topix[0]

    summary = "key : {}, up: num {:2d}, mean {:.3f}, down : num {:2d}, mean {:.3f}, nikkei : {:.3f}, topix : {:.3f}".format(
        key, num_up, mean_up, num_down, mean_down, nikkei_up, topix_up
    )
    print(summary)

In [None]:
def get_up_rates_pairs(us_values, jp_values):
    """Pair each JP trading day with the US sessions that came just before it.

    Both arguments map a ``YYYYMMDD`` string to a list of prices; index 0 is
    used as the start (open) price and index 3 as the end (close) price.

    For every JP day, the US sessions dated strictly before it and not already
    assigned to an earlier JP day form its window. A US session that shares a
    JP day's date is therefore assigned to the next JP day. A JP day is
    skipped when its window is empty, or when no US session precedes the
    window (no previous close to compare with).

    Parameters
    ----------
    us_values : dict[str, list[float]]
        US-side price series.
    jp_values : dict[str, list[float]]
        JP-side price series.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, 4)``, possibly with ``n == 0``. Columns:
        ``jp_up_rate`` (JP close vs. JP open), ``us_up_rate`` (last US close
        in the window vs. first US open in the window), ``us_end_up_rate``
        (last US close in the window vs. the US close just before the window),
        and the JP date as ``YYYYMMDD``.
    """
    jp_key_ints = sorted(int(key) for key in jp_values)
    us_key_ints = sorted(int(key) for key in us_values)
    us_total = len(us_key_ints)

    up_rates = []
    us_idx = 0
    for jp_cur in jp_key_ints:
        if us_idx >= us_total:
            # Every US session has already been consumed.
            break

        # Advance over the US sessions dated before this JP day. The bounds
        # check matters when the JP series ends later than the US series;
        # without it the index runs off the end of the list.
        window_start = us_idx
        while us_idx < us_total and us_key_ints[us_idx] < jp_cur:
            us_idx += 1
        us_keys = us_key_ints[window_start:us_idx]

        # window_start == 0 means no US session precedes the window; indexing
        # window_start - 1 would wrap around to the *last* US session.
        if not us_keys or window_start == 0:
            continue

        us_previous_end = us_values[str(us_key_ints[window_start - 1])][3]
        us_start = us_values[str(us_keys[0])][0]
        us_end = us_values[str(us_keys[-1])][3]
        # Relative to the start price, like every other rate in this notebook.
        us_up_rate = (us_end - us_start) / us_start
        us_end_up_rate = (us_end - us_previous_end) / us_previous_end

        jp_start = jp_values[str(jp_cur)][0]
        jp_end = jp_values[str(jp_cur)][3]
        jp_up_rate = (jp_end - jp_start) / jp_start

        up_rates.append([jp_up_rate, us_up_rate, us_end_up_rate, jp_cur])

    # reshape keeps the result two-dimensional even when nothing was paired,
    # so column slicing such as up_rates[:, 0] always works.
    return np.array(up_rates, dtype=float).reshape(-1, 4)

In [None]:
# Pair the NVDA series (passed as the US side) with the code-6254 series
# (passed as the JP side); each resulting row holds
# [jp_up_rate, us_up_rate, us_end_up_rate, jp_date].
up_rates = get_up_rates_pairs(nvidia_values, nomura_values)

In [None]:
import re

In [None]:
raw_str = """■串カツ田中ホールディングス <3547> [東証Ｓ]
2月既存店売上高は前年同月比0.6％減と前年割れに転じた。

■イルグルム <3690> [東証Ｇ]
2月売上高は前年同月比7.4％減と前年割れに転じた。

■ヘッドウォータース <4011> [東証Ｇ]
東証と日証金が8日売買分から信用取引に関する臨時措置を実施する。

■アサンテ <6073> [東証Ｐ]
2月売上高は前年同月比4.4％減と2ヵ月ぶりに前年割れとなった。

■ナ・デックス <7435> [東証Ｓ]
今期経常を21％下方修正。

■アゴーラ　ホスピタリティー　グループ <9704> [東証Ｓ]
東証と日証金が8日売買分から信用取引に関する臨時措置を実施する。
"""


# Headline lines start with "■" and carry the stock code in angle brackets;
# print the code from each such line. The pattern is a raw string because
# "\d" inside a plain literal is an invalid escape sequence that newer Python
# versions warn about. The compiled pattern itself is unchanged.
# NOTE(review): only digits are captured, so a code containing a letter
# would yield no match. Confirm whether such codes can appear in the input.
regex = re.compile(r"■.*<(\d*)>")
for line in raw_str.split("\n"):
    res = regex.search(line)
    if res is not None:
        print(res.group(1))