In [3]:
import pandas as pd
import numpy as np
import openpyxl
from datetime import timedelta
from openpyxl import Workbook

In [None]:
try:
    data = pd.read_excel('raw.xlsx', engine='openpyxl')
    df = pd.DataFrame(data)
except FileNotFoundError:
    print("Lỗi: Không tìm thấy file 'raw.xlsx'. Vui lòng kiểm tra đường dẫn.")
    exit() # Exit if file not found

# Đảm bảo cột 'create_time' là kiểu datetime và xử lý NaT
df['create_time'] = pd.to_datetime(df['create_time'], errors='coerce')
df.dropna(subset=['create_time'], inplace=True)

# Sắp xếp theo ip_checkout và create_time để tối ưu hóa việc tìm kiếm
df_sorted = df.sort_values(by=['ip_checkout', 'create_time']).reset_index(drop=True)

final_grouped_buyer_ids = set() # To store all unique buyer_ids that meet the criteria

# Lặp qua từng nhóm ip_checkout duy nhất
for ip_checkout_val, group in df_sorted.groupby('ip_checkout'):
    # Convert group to a list of dictionaries for easier indexing and manipulation
    # This avoids repeated .iloc[i] or .loc access on a potentially sliced DataFrame
    records_for_ip = group.to_dict('records')

    # 'i' is the starting index of the current potential group within records_for_ip
    for i in range(len(records_for_ip)):
        start_record = records_for_ip[i]
        start_time = start_record['create_time']

        current_potential_group_ids = [start_record['buyer_id']]
        
        # 'j' iterates through subsequent records to expand the current potential group
        for j in range(i + 1, len(records_for_ip)):
            next_record = records_for_ip[j]
            next_time = next_record['create_time']
            next_id = next_record['buyer_id']

            # Check if the 'next_time' is within 1 hour of the 'start_time' of the current potential group
            if (next_time - start_time) <= timedelta(hours=1):
                current_potential_group_ids.append(next_id)
            else:
                # Since the data is sorted, if the time gap is too large,
                # subsequent records will also be too far. Break from inner loop.
                break
        
        # After forming a potential group (all records within 1 hour of start_time)
        # Check if it contains at least 3 distinct buyer_ids
        unique_ids_in_group = set(current_potential_group_ids)
        if len(unique_ids_in_group) >= 3:
            # Add all distinct buyer_ids from this valid group to our final set
            final_grouped_buyer_ids.update(unique_ids_in_group)

# Output the results
if final_grouped_buyer_ids:
    df_output_ids = pd.DataFrame(list(final_grouped_buyer_ids), columns=['buyer_id'])
    df_output_ids
else:
    print("ℹ️ Không tìm thấy ID nào để nhóm theo tiêu chí (ít nhất 3 ID riêng biệt trong 1 giờ).")

KeyError: ('ip_checkout', 'create_time')