In [1]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported before each cell executes (no kernel restart needed while
# iterating on the project's src package).
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Make the directory above the notebook importable so that `src.*` resolves.
module_path = os.path.abspath(os.pardir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

from src.db.manager import DBManager
from src.input_to_instructions.load_and_execute import *
from src.input_to_instructions.types import *
from src.operation.execute import *
from src.response_generation.load_and_execute import *
from src.dateutils import normalize_sql_dates
# Start the response-generation backend once for the whole notebook:
# logging of outputs is switched off and the "unsloth" instance type is used.
_response_generation_options = {
    "log_output": False,
    "instance_type": "unsloth",
}
ResponseGeneration.initialize(**_response_generation_options)

INFO:datasets:PyTorch version 2.6.0 available.


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.6.9: Fast Llama patching. Transformers: 4.53.0.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 2. Max memory: 23.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

sh2orc/Llama-3.1-Korean-8B-Instruct does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


In [3]:
from collections import defaultdict, Counter
import logging

import pandas as pd
import numpy as np
from tqdm import tqdm
# Notebook-level logger; root logging is configured at INFO verbosity.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

import json
import itertools

# from db.manager import DBManager
from operation.execute import OperationExecutor
from pathlib import Path
import warnings
import datetime


warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [4]:
BASE_DIR = "../"


def read_json(path):
    """Load and return the JSON document stored at `path`.

    The file is opened as UTF-8 text; the parsed object (whatever JSON type
    the file holds) is returned unchanged.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)

In [5]:
# `torch` was previously only in scope through a wildcard project import;
# import it explicitly so this cell does not depend on that side effect.
import torch

# Choose the attention backend and tensor dtype from the available hardware:
#   * CUDA device with major compute capability >= 8 -> FlashAttention-2 + bfloat16
#   * any other CUDA device                          -> eager attention + float16
#   * no CUDA device at all                          -> eager attention + float32
# The availability check comes first because get_device_capability() raises
# when no CUDA device can be initialised.
if torch.cuda.is_available():
    if torch.cuda.get_device_capability()[0] >= 8:
        attn_implementation = "flash_attention_2"
        torch_dtype = torch.bfloat16
    else:
        attn_implementation = "eager"
        torch_dtype = torch.float16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float32
print(f"attn_implementation: {attn_implementation}, torch_dtype: {torch_dtype}")


attn_implementation: flash_attention_2, torch_dtype: torch.bfloat16


# token length

In [6]:
import tokenize
# Reuse the tokenizer already loaded by ResponseGeneration.
tokenizer = ResponseGeneration.tokenizer


def get_token_length_of_data(df):
    """Return how many token ids the tokenizer produces for `df` as CSV text.

    The frame is serialised with `to_csv(index=False)` (header row included,
    index omitted) and the resulting string is passed to `tokenizer.encode`.
    """
    csv_text = str(df.to_csv(index=False))
    token_ids = tokenizer.encode(csv_text)
    return len(token_ids)

In [7]:
# Fetch every row of data_t with idu_id = 1, wrap the result in a DataFrame,
# and discard the first two columns so only the remaining columns are measured.
_idu_query = "SELECT * FROM data_t WHERE idu_id = 1"
_idu_rows = DBManager.execute_sql(_idu_query)
result_df = pd.DataFrame(_idu_rows).iloc[:, 2:]

INFO:src.db.instance:SQL SELECT query executed successfully and results fetched


In [8]:
# Start a fresh label -> token-count registry for the measurements below.
token_len_dict = {}

# Keep the first 60 * 24 * 365 rows, labelled "one year"
# (presumably one row per minute — confirm against data_t).
_rows_one_year = 60 * 24 * 365
result_df = result_df.iloc[:_rows_one_year]
_one_year_tokens = get_token_length_of_data(result_df)
token_len_dict["one year"] = _one_year_tokens
print("Token length of the data:", _one_year_tokens)

Token length of the data: 11993771


In [9]:
# Narrow the frame to its first 60 * 24 * 30 rows, labelled "one month".
# result_df is overwritten, so later cells see only this shorter slice.
_rows_one_month = 60 * 24 * 30
result_df = result_df.iloc[:_rows_one_month]
_one_month_tokens = get_token_length_of_data(result_df)
token_len_dict["one month"] = _one_month_tokens
print("Token length of one month data:", _one_month_tokens)

Token length of one month data: 947109


In [10]:
# Token cost of a single aggregate instead of raw rows: the string form of the
# mean of the first column of whatever slice result_df currently holds.
_first_column = result_df.iloc[:, :1]
_mean_text = str(np.mean(_first_column))
token_len_dict["process mean"] = len(tokenizer.encode(_mean_text))

In [30]:
# Narrow the frame to its first 60 * 24 rows, labelled "one day".
# result_df is overwritten again, so only this slice remains afterwards.
_rows_one_day = 60 * 24
result_df = result_df.iloc[:_rows_one_day]
_one_day_tokens = get_token_length_of_data(result_df)
token_len_dict["one day"] = _one_day_tokens
print("Token length of one day data:", _one_day_tokens)

Token length of one day data: 31599


In [None]:
# Back-of-the-envelope size of raw IMU text:
#   9 channels * 50 Hz * 60 s = 27000 data points for ONE MINUTE of data
#   each data point is a float, roughly 6 characters on average
#   27000 * 6 = 162000 characters

# UCI-HAR: 50hz, 6 channels, windows of 2.56 seconds (128 readings each) with 50% overlap
# PAMAP2: 100hz, 52 channels, 10 subjects, 7 activities
PAMAP2_SAMPLE_RATE_HZ = 100
IMU_WINDOW_SECONDS = 30

# sep=r"\s+" is the documented replacement for the deprecated
# delim_whitespace=True and parses the whitespace-separated file the same way.
har_data = pd.read_csv("./pamap2.txt", sep=r"\s+", header=None)

# Keep columns 0 and 2 plus three 11-column slices (3-13, 20-30, 37-47).
# NOTE(review): confirm these slices cover the intended sensor channels of
# pamap2.txt — each slice is 11 consecutive columns.
columns_to_keep = [0, 2] + list(range(3, 14)) + list(range(20, 31)) + list(range(37, 48))
har_data = har_data.iloc[:, columns_to_keep]

# 100 Hz * 30 s = 3000 rows; this matches the "imu 30 seconds" label used
# when the token count is recorded.
window = PAMAP2_SAMPLE_RATE_HZ * IMU_WINDOW_SECONDS
har_data = har_data.iloc[:window, :]
print(har_data)


        0      2       3        4        5        6        7        8   \
0     8.38  104.0  30.000  2.37223  8.60074  3.51048  2.43954  8.76165   
1     8.39    NaN  30.000  2.18837  8.56560  3.66179  2.39494  8.55081   
2     8.40    NaN  30.000  2.37357  8.60107  3.54898  2.30514  8.53644   
3     8.41    NaN  30.000  2.07473  8.52853  3.66021  2.33528  8.53622   
4     8.42    NaN  30.000  2.22936  8.83122  3.70000  2.23055  8.59741   
..     ...    ...     ...      ...      ...      ...      ...      ...   
995  18.33  101.0  30.125  2.31885  8.53114  4.16420  2.30904  8.56703   
996  18.34    NaN  30.125  2.20361  8.45581  4.12424  2.29410  8.55206   
997  18.35    NaN  30.125  2.27518  8.45422  4.00985  2.29308  8.47651   
998  18.36    NaN  30.125  2.16743  8.49437  4.16205  2.33956  8.59702   
999  18.37    NaN  30.125  2.23418  8.56732  3.89339  2.29524  8.70306   

          9         10  ...       38       39        40       41       42  \
0    3.35465 -0.092217  ...  9.659

  har_data = pd.read_csv("./pamap2.txt", delim_whitespace=True, header=None)


In [None]:
# Record how many tokens the truncated IMU frame costs when sent as CSV.
_imu_tokens = get_token_length_of_data(har_data)
token_len_dict["imu 30 seconds"] = _imu_tokens
print("Token length of 30 seconds IMU data:", _imu_tokens)

Token length of 10 seconds IMU data: 176157


In [39]:
# Turn the label -> token-count registry into a one-column table (labels as
# the index), save it as CSV, and display it as the cell result.
_token_length_csv = "revision_hvacdata_token_length.csv"
token_len_df = pd.DataFrame.from_dict(
    token_len_dict,
    orient="index",
    columns=["token_length"],
)
token_len_df.to_csv(_token_length_csv)
token_len_df

Unnamed: 0,token_length
one year,11993771
one month,947109
process mean,8
imu 30 seconds,528209
one day,31599
imu 10 seconds,176157
