In [87]:
import os
import pandas as pd
import polars as pl
import time


script_dir = os.getcwd()
root_dir = os.path.dirname(script_dir)
data_dir = os.path.join(root_dir, "data")

In [88]:
# データの読み込み
ondotori_hkb = f"{data_dir}\\AirTemp_odt_hkb_2cm.csv"


# dictに
files = {
    "Site_A": ondotori_hkb,
}

# pandas
df_pd = pd.read_csv(ondotori_hkb, encoding="shift-jis")

df_pd

Unnamed: 0,Date/Time,Date/Time.1,No.1
0,Date/Time,Date/Time,OFFICE 1
1,,,°C
2,2021/8/15 15:00,44423.625,20.5
3,2021/8/15 16:00,44423.66667,21.3
4,2021/8/15 17:00,44423.70833,19.8
...,...,...,...
27401,,,
27402,,,
27403,,,
27404,,,


In [89]:
# polars
df = pl.read_csv(ondotori_hkb, encoding="utf8-lossy", columns=["Date/Time", "No.1"])


# カラム名の変更と余計な行の削除
df = df.rename({
    "Date/Time": 'TIMESTAMP',
    "No.1": "Temp",
}).slice(2).filter(~pl.col("TIMESTAMP").is_null())

# 単位系
unit_dict = {
    "TIMESTAMP": "TS",
    "Temp": "DegC",
}


print(df.columns)
print(unit_dict)
df

['TIMESTAMP', 'Temp']
{'TIMESTAMP': 'TS', 'Temp': 'DegC'}


TIMESTAMP,Temp
str,str
"""2021/8/15 15:00""","""20.5"""
"""2021/8/15 16:00""","""21.3"""
"""2021/8/15 17:00""","""19.8"""
"""2021/8/15 18:00""","""21.4"""
"""2021/8/15 19:00""","""11.2"""
…,…
"""2024/7/13 7:00""","""10.2"""
"""2024/7/13 8:00""","""12.1"""
"""2024/7/13 9:00""","""16.3"""
"""2024/7/13 10:00""","""17.1"""


In [90]:
df_lazy = (
    pl.scan_csv(ondotori_hkb, encoding="utf8-lossy")
)

df = df_lazy.collect()
df

Date/Time,Date/Time_duplicated_0,No.1
str,str,str
"""Date/Time""","""Date/Time""","""OFFICE 1"""
,,"""��C"""
"""2021/8/15 15:00""","""44423.625""","""20.5"""
"""2021/8/15 16:00""","""44423.66667""","""21.3"""
"""2021/8/15 17:00""","""44423.70833""","""19.8"""
…,…,…
,,
,,
,,
,,


In [91]:
# ファイルをまとめるdict
files = {
    "Site_A": ondotori_hkb,
}

# 単位系の取得
unit_dict = {
    "timestamp": "TS",
    "Temp": "DegC",
}

# 出力するdfを格納するdict
dfs = {}

# ファイルごとに初期処理
for site_name, file in files.items():
    df = (
        pl.scan_csv(file, encoding="utf8-lossy")
        .rename({"Date/Time": 'timestamp', "Date/Time_duplicated_0": "record", "No.1": "Temp",})
        .slice(2)
        .filter(~pl.col("timestamp").is_null())
        .with_columns(pl.lit(f"{site_name}").alias("location"))
        .with_row_index("record", offset=0)
        .collect()
    )

    # timstampをunixtimeに変換
    df = df.with_columns(
        pl.col("timestamp").str.to_datetime("%Y/%m/%d %H:%M", strict=False).dt.epoch()
    )
    # unixtimeに変換出来なかった行をログ出力
    df_null_unixtime = df.filter(pl.col("timestamp").is_null())
    print(df_null_unixtime)

    # dfsに格納
    dfs[site_name] = df


df

DuplicateError: column with name 'record' has more than one occurrence