### get raw data

In [None]:
# Set workspace to Windrecorder dir.
# The working directory is moved two levels up from wherever the notebook is
# started, before the `windrecorder` imports further down are executed.
# NOTE(review): both chdir calls are relative, so re-running this cell moves
# the working directory up two more levels each time.
import os
import io
os.chdir("..")  # up one level
os.chdir("..")  # up a second level
# ------------------------------------------------------------

import datetime
import pandas as pd

from windrecorder.db_manager import db_manager
from windrecorder.oneday import OneDay
from windrecorder.record_wintitle import get_wintitle_stat_in_day
from windrecorder.config import config
from windrecorder import llm

In [None]:
# Day to analyse. Use the commented-out line instead to target the current day.
# datetime_select = datetime.datetime.today()
datetime_select = datetime.datetime(year=2024, month=8, day=19)

In [None]:
# Query the selected day's records with an empty search string.
# NOTE(review): presumably an empty `search_content` returns every record of
# the day — confirm against OneDay.search_day_data.
df_day_search_result = OneDay().search_day_data(datetime_select, search_content="")

df_day_search_result

In [None]:
# Window-title statistics for the selected day, fetched twice: once without
# display optimization (used later for duration filtering) and once with it
# (used later as the ranking table placed in the prompt).
df_day_activity_raw, day_time_sum = get_wintitle_stat_in_day(
    datetime_select,
    optimize_for_display=False,
)
df_day_activity_optimize_display, _ = get_wintitle_stat_in_day(
    datetime_select,
    optimize_for_display=True,
)

df_day_activity_optimize_display

### clean and format data

In [None]:
# Convert the day records' timestamps (given in seconds) to formatted strings.
# NOTE: the column is overwritten in place, so this cell expects
# `df_day_search_result` to come fresh from the search cell; re-running it on
# already-converted data will not work.
df_day_search_result['videofile_time'] = pd.to_datetime(df_day_search_result['videofile_time'], unit='s').dt.strftime('%Y-%m-%d %H-%M-%S')

# Config
activity_always_exclude_lst = ["explorer.exe"]
max_token_limit = 1000000/2.5
# Per-row budget used to truncate the OCR text (applied as a character slice).
# The divisor is clamped to 1 so a day without any records does not raise
# ZeroDivisionError.
max_pre_row_token = int(max_token_limit / max(len(df_day_search_result), 1))
activity_deduplication_trace_depth = 10
MIN_ACTIVITY_DURATION_SECOND = 30
MAX_DAY_ACTIVITY_HEAD_TRUNCATION_FOR_LLM = 20

# Only keep long-lasting activities: pages whose screen time is not below the
# minimum duration.
vaild_day_activity_lst = [
    activity_row['Page']
    for _index, activity_row in df_day_activity_raw.iterrows()
    if not (activity_row['Screen Time'] < MIN_ACTIVITY_DURATION_SECOND)
]
# Set copy for constant-time membership tests inside the record loop.
# Assumes page titles are hashable (strings).
vaild_day_activity_set = set(vaild_day_activity_lst)

# Filter the day records.
activity_compare_lst = []  # window titles kept so far, in order
clean_records = []  # collected rows; turned into a DataFrame once at the end
for _index, record in df_day_search_result.iterrows():
    activity = record['win_title']

    # Drop items repeated too recently, items that are always excluded, and
    # items whose activity time is too short.
    if (
        activity in activity_compare_lst[-activity_deduplication_trace_depth:]
        or activity in activity_always_exclude_lst
        or activity not in vaild_day_activity_set
    ):
        continue

    # Only the part before the first " | " is used as the activity name;
    # records where that part is empty are dropped as well.
    activity_name = activity.split(" | ")[0]
    if not activity_name:
        continue

    activity_compare_lst.append(activity)

    # Guard against a missing OCR value (None/NaN) so the slice cannot fail.
    ocr_text = record['ocr_text']
    if not isinstance(ocr_text, str):
        ocr_text = ""

    clean_records.append({
        'datetime': record['videofile_time'],
        'activity': activity_name,
        'ocr_content': ocr_text[:max_pre_row_token],
    })

# Build the frame in one go instead of growing it row by row.
day_records_clean_df = pd.DataFrame(clean_records, columns=['datetime', 'activity', 'ocr_content'])

# Keep only the top rows of the screen-time ranking for the prompt.
df_day_activity_optimize_display = df_day_activity_optimize_display.head(MAX_DAY_ACTIVITY_HEAD_TRUNCATION_FOR_LLM)

day_records_clean_df

In [None]:
# Display the screen-time ranking table that is later embedded in the prompt.
df_day_activity_optimize_display

### generate prompt context

In [None]:
def convert_df_to_csv_str(df: pd.DataFrame) -> str:
    """Serialize *df* to CSV text without the index column.

    Args:
        df: Frame to serialize.

    Returns:
        The CSV content (header row included) as one string.
    """
    # With no output target given, to_csv hands back the CSV text itself.
    return df.to_csv(index=False)

# Render both tables as CSV text so they can be embedded in the prompt.
prompt_day_wintitle_activity = convert_df_to_csv_str(df_day_activity_optimize_display)
prompt_day_records = convert_df_to_csv_str(day_records_clean_df)

# Data context for the user message: the screen-time ranking CSV followed by
# the cleaned activity records CSV, each wrapped in start/end marker lines.
prompt_mount = f"""
### Top {MAX_DAY_ACTIVITY_HEAD_TRUNCATION_FOR_LLM} activities by user screen time for the day, sorted by time, highest to lowest (CSV format):
---Start of screen time ranking csv---
{prompt_day_wintitle_activity}
---END of screen time ranking csv---

### Part of the user's specific activities on that day, sorted by time (CSV format) (ocr_content contains a lot of noise, please refer to it with very low weight and only focus on the key content information related to the activity title. Pay more attention to activity and only use ocr_content as an optional supplement.):
---Start of activities csv---
{prompt_day_records}
---END of activities csv---
"""

# prompt_mount = f"""
# ### Top {MAX_DAY_ACTIVITY_HEAD_TRUNCATION_FOR_LLM} activities by user screen time for the day, sorted by time, highest to lowest (CSV format):
# ---Start of screen time ranking csv---
# {prompt_day_wintitle_activity}
# ---END of screen time ranking csv---
# """

# System instruction for the model; interpolates the configured user name.
system_prompt = f"""
You are a user behavior analysis expert. Please generate a simple summary of the user's daily activities on that day based on the following computer activity records (screen time ranking, specific activity tracks) of the user ({config.user_name}) received on that day, describing what the user did and what content browsed on that day. If necessary, please retain the proper nouns or origin text of the activity content.
"""
# Closing instruction appended after the data context in the user message.
prompt_end = """
Now, please summarize the user's daily activities in chronological order based on the above. Keep it concise, clear, simple, and insightful. Please output the results directly without adding additional instructions.
"""

# Chat-style message list carrying the same system and user content.
# NOTE(review): `messages` is only referenced by the commented-out OpenAI
# client code in the next cell; the active request passes the strings directly.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": prompt_mount + prompt_end}
]

In [None]:
# from openai import OpenAI

# # Point to the local server
# client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

# completion = client.chat.completions.create(
#   model="PrunaAI/Phi-3-mini-128k-instruct-GGUF-Imatrix-smashed",
#   messages=messages,
#   temperature=0.7,
# )

# print(completion.choices[0].message)

# Send the system prompt plus the combined user prompt (data context followed
# by the closing instruction) through the project's LLM helper.
# NOTE(review): `success` is not checked anywhere in this notebook —
# presumably it flags whether the request succeeded; confirm and handle it.
success, plain_text = llm.request_llm_one_shot(
    system_prompt=system_prompt,
    user_content=prompt_mount + prompt_end,
    temperature=0.3,
)

In [None]:
# Print the text returned by the LLM request (printed regardless of `success`).
print(plain_text)