In [75]:
import os
import re
import pandas as pd

from math import ceil
from datetime import datetime

In [None]:
train = "./data/txt2csv_train"
train_merged = "./data/train_merged" # ratio of 3 days
os.makedirs(train_merged, exist_ok=True)

In [None]:
test = "./data/txt2csv_test"
test_processed = "./data/test"
os.makedirs(test_processed, exist_ok=True)

# Train

In [147]:
# Function to convert date strings into a standard datetime format
def convert_date(date_string):
    # Remove dots from the date string
    date_string = re.sub(r"\\.", "", date_string)

    # Replace Korean AM/PM with English equivalents
    date_string = re.sub(r"오전", "AM", date_string)
    date_string = re.sub(r"오후", "PM", date_string)

    # Attempt to match the date string against multiple formats
    date_formats = [
        "%Y년 %m월 %d일 %p %I:%M",
        "%Y-%m-%d %H:%M:%S",
        "%Y.%m.%d %p %I:%M",
        "%Y %m %d %p %I:%M",
        "%Y. %m. %d. %p %I:%M"
    ]

    for date_format in date_formats:
        try:
            return datetime.strptime(date_string, date_format).strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue

    print(f"Date conversion failed for: {date_string}")
    return None

In [None]:
# List CSV files in the input directory
csv_files = [file for file in os.listdir(train) if file.endswith(".csv")]
total_files_processed = 0

merge = []
# Process each CSV file
for csv_file in csv_files:
    print(f"Processing file: {csv_file}")
    input_path = os.path.join(train, csv_file)
    output_file_prefix = os.path.splitext(csv_file)[0]

    # Read the CSV file
    dataframe = pd.read_csv(input_path)

    # Validate required columns
    required_columns = ["Date", "User", "Message"]
    if not all(column in dataframe.columns for column in required_columns):
        print(f"Invalid format for {csv_file}: Missing required columns")
        continue

    # Convert the date column
    dataframe["Date"] = dataframe["Date"].apply(convert_date)

    # Handle rows with failed date conversions
    failed_dates = dataframe[dataframe["Date"].isna()]
    if not failed_dates.empty:
        print(f"Failed to convert dates in {csv_file}")
        print(failed_dates)

    # Drop rows with missing or invalid dates
    dataframe = dataframe.dropna(subset=["Date"])

    # Extract only the date part
    dataframe["Date"] = pd.to_datetime(dataframe["Date"]).dt.date

    # Filter out short messages (less than 5 characters)
    dataframe = dataframe[dataframe["Message"].str.len() >= 5]

    # Compute message statistics
    message_counts = dataframe.groupby(["Date", "User"]).size()

    # Detect and count images, files, and links
    dataframe["HasImage"] = dataframe["Message"].str.contains("사진|image", case=False, na=False)
    dataframe["HasFile"] = dataframe["Message"].str.contains("파일|file", case=False, na=False)
    dataframe["HasLink"] = dataframe["Message"].str.contains(r"http", na=False)

    # Combine media-related features
    dataframe["MediaFlag"] = dataframe["HasImage"] | dataframe["HasFile"] | dataframe["HasLink"]
    media_counts = dataframe.groupby(["Date", "User"])["MediaFlag"].sum()

    # Count questions
    dataframe["IsQuestion"] = dataframe["Message"].str.contains(r"\\?", na=False)
    question_counts = dataframe.groupby(["Date", "User"])["IsQuestion"].sum()

    # Calculate average message length
    dataframe["MessageLength"] = dataframe["Message"].str.len()
    avg_message_length = dataframe.groupby(["Date", "User"])["MessageLength"].mean()

    # Calculate message ratio per user per date
    total_daily_messages = dataframe.groupby("Date").size()
    message_ratios = (dataframe.groupby(["Date", "User"]).size() / total_daily_messages).reset_index(name="MessageRatio")

    # Create a daily summary dataframe
    daily_summary = pd.DataFrame({
        "MessageCount": message_counts,
        "MessageRatio": message_ratios.set_index(["Date", "User"])["MessageRatio"],
        "AvgMessageLength": avg_message_length
    }).fillna(0).reset_index()

    # Ensure integer columns are properly cast
    daily_summary["MessageCount"] = daily_summary["MessageCount"].astype(int)

    # Group data into 3-day intervals
    unique_dates = sorted(daily_summary["Date"].unique())
    group_count = ceil(len(unique_dates) / 3)
    grouped_summaries = []

    for group_index in range(group_count):
        group_start = group_index * 3
        group_end = group_start + 3
        group_dates = unique_dates[group_start:group_end]

        # Filter data for the current group
        group_data = daily_summary[daily_summary["Date"].isin(group_dates)]

        # Calculate group-level media statistics
        group_media_counts = media_counts.loc[media_counts.index.get_level_values("Date").isin(group_dates)]
        total_group_media = group_media_counts.groupby("Date").sum().sum()
        user_media_counts = group_media_counts.groupby("User").sum()
        user_media_ratios = user_media_counts / total_group_media

        # Calculate group-level question statistics
        group_question_counts = question_counts.loc[question_counts.index.get_level_values("Date").isin(group_dates)]
        total_group_questions = group_question_counts.groupby("Date").sum().sum()
        user_question_counts = group_question_counts.groupby("User").sum()
        user_question_ratios = user_question_counts / total_group_questions

        # Add media and question ratios to the data
        group_data = group_data.set_index("User")
        group_data["MediaRatio"] = user_media_ratios.fillna(0)
        group_data["QuestionRatio"] = user_question_ratios.fillna(0)

        # Calculate participation score
        participation_score = dataframe.groupby("User")["Date"].nunique() / len(group_dates)
        group_data["ParticipationScore"] = group_data.index.map(participation_score.fillna(0))

        # Aggregate data
        aggregated_data = group_data.groupby("User").agg({
            # "MessageCount": "sum",
            "MessageRatio": "mean",
            "MediaRatio": "mean",
            "QuestionRatio": "mean",
            "AvgMessageLength": "mean",
            "ParticipationScore": "mean"
        }).reset_index()

        # Add date range to the group data
        aggregated_data["StartDate"] = group_dates[0]
        aggregated_data["EndDate"] = group_dates[-1] if len(group_dates) > 1 else group_dates[0]
        grouped_summaries.append(aggregated_data)
        total_files_processed += 1
    for aggregated_data in grouped_summaries:
        merge.append(aggregated_data)

# Save group data to a CSV file
group_file_name = f"train_raw.csv"
group_file_path = os.path.join(train_merged, group_file_name)
final_data = pd.concat(merge, ignore_index=True).fillna(0)

# print(f"Final combined data shape: {final_data.shape}")
# duplicate_count = final_data.duplicated().sum()
# print(f"Number of duplicate rows in final data: {duplicate_count}")


final_data.to_csv(group_file_path, index=False, encoding="utf-8-sig")
print(f"Saved: {group_file_path}")
print(f"Total files processed: {total_files_processed}")


Processing file: myung_3.csv
Processing file: myung_2.csv
Processing file: myung_1.csv
Processing file: dding_1.csv
Processing file: changmin_1.csv
Processing file: changmin_2.csv
Processing file: changmin_3.csv
Processing file: changmin_4.csv
Processing file: maeng_3.csv
Processing file: maeng_2.csv
Processing file: munsik_2.csv
Processing file: munsik_3.csv
Processing file: munsik_1.csv
Processing file: munsik_5.csv
Date conversion failed for: direction_and_moment = {"direction" : [direction]
Date conversion failed for: direction_and_moment = {"direction" : [direction]
Failed to convert dates in munsik_5.csv
     Date         User    Message
669  None  "moment(s)"  [moment]}
682  None  "moment(s)"  [moment]}
Processing file: munsik_6.csv
Saved: ./data/train_merged/train_raw.csv
Total files processed: 113


In [None]:
# List CSV files in the input directory
csv_files = [file for file in os.listdir(test) if file.endswith(".csv")]
total_files_processed = 0

# Process each CSV file
for csv_file in csv_files:
    print(f"Processing file: {csv_file}")
    input_path = os.path.join(test, csv_file)
    output_file_prefix = os.path.splitext(csv_file)[0]

    # Read the CSV file
    dataframe = pd.read_csv(input_path)

    # Validate required columns
    required_columns = ["Date", "User", "Message"]
    if not all(column in dataframe.columns for column in required_columns):
        print(f"Invalid format for {csv_file}: Missing required columns")
        continue

    # Convert the date column
    dataframe["Date"] = dataframe["Date"].apply(convert_date)

    # Handle rows with failed date conversions
    failed_dates = dataframe[dataframe["Date"].isna()]
    if not failed_dates.empty:
        print(f"Failed to convert dates in {csv_file}")
        print(failed_dates)

    # Drop rows with missing or invalid dates
    dataframe = dataframe.dropna(subset=["Date"])

    # Extract only the date part
    dataframe["Date"] = pd.to_datetime(dataframe["Date"]).dt.date

    # Filter out short messages (less than 5 characters)
    dataframe = dataframe[dataframe["Message"].str.len() >= 5]

    # Detect and count images, files, and links
    dataframe["HasImage"] = dataframe["Message"].str.contains("사진|image", case=False, na=False)
    dataframe["HasFile"] = dataframe["Message"].str.contains("파일|file", case=False, na=False)
    dataframe["HasLink"] = dataframe["Message"].str.contains(r"http", na=False)

    # Combine media-related features
    dataframe["MediaFlag"] = dataframe["HasImage"] | dataframe["HasFile"] | dataframe["HasLink"]

    # Count questions
    dataframe["IsQuestion"] = dataframe["Message"].str.contains(r"\\?", na=False)

    # Calculate average message length
    dataframe["MessageLength"] = dataframe["Message"].str.len()

    # Group data into 3-day intervals
    unique_dates = sorted(dataframe["Date"].unique())
    group_count = ceil(len(unique_dates) / 3)
    grouped_summaries = []

    for group_index in range(group_count):
        group_start = group_index * 3
        group_end = group_start + 3
        group_dates = unique_dates[group_start:group_end]

        # Filter data for the current group
        group_data = dataframe[dataframe["Date"].isin(group_dates)]

        # Calculate message ratios
        total_messages = group_data.groupby("Date").size().sum()  # Total messages in 3 days
        user_message_counts = group_data.groupby("User").size()  # User-level message counts
        user_message_ratios = user_message_counts / total_messages.fillna(0)  # User-level ratios

        # Calculate media ratios
        total_media = group_data["MediaFlag"].sum()
        user_media_counts = group_data.groupby("User")["MediaFlag"].sum()
        user_media_ratios = user_media_counts / total_media.fillna(0)

        # Calculate question ratios
        total_questions = group_data["IsQuestion"].sum()
        user_question_counts = group_data.groupby("User")["IsQuestion"].sum()
        user_question_ratios = user_question_counts / total_questions.fillna(0)

        # Aggregate data
        aggregated_data = pd.DataFrame({
            "MessageRatio": user_message_ratios,
            "MediaRatio": user_media_ratios,
            "QuestionRatio": user_question_ratios,
            "AvgMessageLength": group_data.groupby("User")["MessageLength"].mean(),
            "ParticipationScore": group_data.groupby("User")["Date"].nunique() / len(group_dates)
        }).fillna(0).reset_index()

        # Add date range to the group data
        # aggregated_data["StartDate"] = group_dates[0]
        # aggregated_data["EndDate"] = group_dates[-1] if len(group_dates) > 1 else group_dates[0]

        grouped_summaries.append(aggregated_data)

        # Save group data to a CSV file
        group_file_name = f"{output_file_prefix}_3day_group{group_index + 1}.csv"
        group_file_path = os.path.join(test_processed, group_file_name)
        aggregated_data.to_csv(group_file_path, index=False, encoding="utf-8-sig")
        print(f"Saved: {group_file_path}")

        total_files_processed += 1

print(f"Total files processed: {total_files_processed}")




In [141]:
final_data.describe()

Unnamed: 0,MessageRatio,MediaRatio,QuestionRatio,AvgMessageLength,ParticipationScore
count,549.0,549.0,549.0,549.0,549.0
mean,0.256603,0.169399,0.205829,35.963433,6.584699
std,0.173435,0.266786,0.149741,92.770311,3.699746
min,0.015278,0.0,0.013514,5.0,0.666667
25%,0.121032,0.0,0.083333,15.5,4.0
50%,0.213333,0.0,0.160142,22.875,5.666667
75%,0.355777,0.25,0.30303,33.24697,8.0
max,1.0,1.0,0.8,1373.166667,38.0


In [142]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549 entries, 0 to 548
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   User                549 non-null    object 
 1   MessageRatio        549 non-null    float64
 2   MediaRatio          549 non-null    float64
 3   QuestionRatio       549 non-null    float64
 4   AvgMessageLength    549 non-null    float64
 5   ParticipationScore  549 non-null    float64
 6   StartDate           549 non-null    object 
 7   EndDate             549 non-null    object 
dtypes: float64(5), object(3)
memory usage: 34.4+ KB


In [143]:
import numpy as np

# Describe statistics for AvgMessageLength
avg_message_stats = final_data["AvgMessageLength"].describe()
print(avg_message_stats)

# Calculate interquartile range (IQR)
Q1 = avg_message_stats["25%"]
Q3 = avg_message_stats["75%"]
IQR = Q3 - Q1

# Define outlier boundaries
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"Outlier boundaries: Lower = {lower_bound}, Upper = {upper_bound}")

# Identify outliers
outliers = final_data[(final_data["AvgMessageLength"] < lower_bound) | (final_data["AvgMessageLength"] > upper_bound)]
print(f"Number of outliers: {outliers.shape[0]}")

# Remove outliers
filtered_data = final_data[(final_data["AvgMessageLength"] >= lower_bound) & (final_data["AvgMessageLength"] <= upper_bound)]
print(f"Shape after removing outliers: {filtered_data.shape}")

count     549.000000
mean       35.963433
std        92.770311
min         5.000000
25%        15.500000
50%        22.875000
75%        33.246970
max      1373.166667
Name: AvgMessageLength, dtype: float64
Outlier boundaries: Lower = -11.12045454545455, Upper = 59.86742424242425
Number of outliers: 40
Shape after removing outliers: (509, 8)


In [144]:
filtered_data.describe()

Unnamed: 0,MessageRatio,MediaRatio,QuestionRatio,AvgMessageLength,ParticipationScore
count,509.0,509.0,509.0,509.0,509.0
mean,0.254073,0.170851,0.205079,23.406219,6.623445
std,0.172803,0.266853,0.148564,10.969778,3.768783
min,0.015278,0.0,0.013514,5.0,1.666667
25%,0.117667,0.0,0.083333,15.0,4.0
50%,0.210609,0.0,0.163636,21.733333,5.666667
75%,0.354224,0.25,0.30303,30.0,8.0
max,1.0,1.0,0.8,57.416667,38.0


In [146]:
output_file_path = os.path.join(train_merged, "train.csv")
filtered_data.to_csv(output_file_path, index=False, encoding="utf-8-sig")
print(f"Filtered data saved to: {output_file_path}")

Filtered data saved to: ./data/train_merged/train.csv


# TEST

In [157]:
# List CSV files in the input directory
csv_files = [file for file in os.listdir(test) if file.endswith(".csv")]
total_files_processed = 0

# Process each CSV file
for csv_file in csv_files:
    print(f"Processing file: {csv_file}")
    input_path = os.path.join(test, csv_file)
    output_file_prefix = os.path.splitext(csv_file)[0]

    # Read the CSV file
    dataframe = pd.read_csv(input_path)

    # Validate required columns
    required_columns = ["Date", "User", "Message"]
    if not all(column in dataframe.columns for column in required_columns):
        print(f"Invalid format for {csv_file}: Missing required columns")
        continue

    # Convert the date column
    dataframe["Date"] = dataframe["Date"].apply(convert_date)

    # Handle rows with failed date conversions
    failed_dates = dataframe[dataframe["Date"].isna()]
    if not failed_dates.empty:
        print(f"Failed to convert dates in {csv_file}")
        print(failed_dates)

    # Drop rows with missing or invalid dates
    dataframe = dataframe.dropna(subset=["Date"])

    # Extract only the date part
    dataframe["Date"] = pd.to_datetime(dataframe["Date"]).dt.date

    # Filter out short messages (less than 5 characters)
    dataframe = dataframe[dataframe["Message"].str.len() >= 5]

    # Compute message statistics
    message_counts = dataframe.groupby(["Date", "User"]).size()

    # Detect and count images, files, and links
    dataframe["HasImage"] = dataframe["Message"].str.contains("사진|image", case=False, na=False)
    dataframe["HasFile"] = dataframe["Message"].str.contains("파일|file", case=False, na=False)
    dataframe["HasLink"] = dataframe["Message"].str.contains(r"http", na=False)

    # Combine media-related features
    dataframe["MediaFlag"] = dataframe["HasImage"] | dataframe["HasFile"] | dataframe["HasLink"]
    media_counts = dataframe.groupby(["Date", "User"])["MediaFlag"].sum()

    # Count questions
    dataframe["IsQuestion"] = dataframe["Message"].str.contains(r"\\?", na=False)
    question_counts = dataframe.groupby(["Date", "User"])["IsQuestion"].sum()

    # Calculate average message length
    dataframe["MessageLength"] = dataframe["Message"].str.len()
    avg_message_length = dataframe.groupby(["Date", "User"])["MessageLength"].mean()

    # Calculate message ratio per user per date
    total_daily_messages = dataframe.groupby("Date").size()
    message_ratios = (dataframe.groupby(["Date", "User"]).size() / total_daily_messages).reset_index(name="MessageRatio")

    # Create a daily summary dataframe
    daily_summary = pd.DataFrame({
        "MessageCount": message_counts,
        "MessageRatio": message_ratios.set_index(["Date", "User"])["MessageRatio"],
        "AvgMessageLength": avg_message_length
    }).fillna(0).reset_index()

    # Ensure integer columns are properly cast
    daily_summary["MessageCount"] = daily_summary["MessageCount"].astype(int)

    # Group data into 3-day intervals
    unique_dates = sorted(daily_summary["Date"].unique())
    group_count = ceil(len(unique_dates) / 3)
    grouped_summaries = []

    for group_index in range(group_count):
        group_start = group_index * 3
        group_end = group_start + 3
        group_dates = unique_dates[group_start:group_end]

        # Filter data for the current group
        group_data = daily_summary[daily_summary["Date"].isin(group_dates)]

        # Calculate group-level media statistics
        group_media_counts = media_counts.loc[media_counts.index.get_level_values("Date").isin(group_dates)]
        total_group_media = group_media_counts.groupby("Date").sum().sum()
        user_media_counts = group_media_counts.groupby("User").sum()
        user_media_ratios = user_media_counts / total_group_media

        # Calculate group-level question statistics
        group_question_counts = question_counts.loc[question_counts.index.get_level_values("Date").isin(group_dates)]
        total_group_questions = group_question_counts.groupby("Date").sum().sum()
        user_question_counts = group_question_counts.groupby("User").sum()
        user_question_ratios = user_question_counts / total_group_questions

        # Add media and question ratios to the data
        group_data = group_data.set_index("User")
        group_data["MediaRatio"] = user_media_ratios.fillna(0)
        group_data["QuestionRatio"] = user_question_ratios.fillna(0)

        # Calculate participation score
        participation_score = dataframe.groupby("User")["Date"].nunique() / len(group_dates)
        group_data["ParticipationScore"] = group_data.index.map(participation_score.fillna(0))

        # Aggregate data
        aggregated_data = group_data.groupby("User").agg({
            # "MessageCount": "sum",
            "MessageRatio": "mean",
            "MediaRatio": "mean",
            "QuestionRatio": "mean",
            "AvgMessageLength": "mean",
            "ParticipationScore": "mean"
        }).reset_index()

        # Add date range to the group data
        # aggregated_data["StartDate"] = group_dates[0]
        # aggregated_data["EndDate"] = group_dates[-1] if len(group_dates) > 1 else group_dates[0]

        grouped_summaries.append(aggregated_data)

        # Save group data to a CSV file
        group_file_name = f"{output_file_prefix}_3day_group{group_index + 1}.csv"
        group_file_path = os.path.join(test_processed, group_file_name)
        aggregated_data.to_csv(group_file_path, index=False, encoding="utf-8-sig")
        print(f"Saved: {group_file_path}")

        total_files_processed += 1

print(f"Total files processed: {total_files_processed}")


Processing file: munsik_4.csv
Saved: ./data/test/munsik_4_3day_group1.csv
Saved: ./data/test/munsik_4_3day_group2.csv
Saved: ./data/test/munsik_4_3day_group3.csv
Saved: ./data/test/munsik_4_3day_group4.csv
Total files processed: 4


In [None]:
# List CSV files in the input directory
csv_files = [file for file in os.listdir(test) if file.endswith(".csv")]
total_files_processed = 0

# Process each CSV file
for csv_file in csv_files:
    print(f"Processing file: {csv_file}")
    input_path = os.path.join(test, csv_file)
    output_file_prefix = os.path.splitext(csv_file)[0]

    # Read the CSV file
    dataframe = pd.read_csv(input_path)

    # Validate required columns
    required_columns = ["Date", "User", "Message"]
    if not all(column in dataframe.columns for column in required_columns):
        print(f"Invalid format for {csv_file}: Missing required columns")
        continue

    # Convert the date column
    dataframe["Date"] = dataframe["Date"].apply(convert_date)

    # Handle rows with failed date conversions
    failed_dates = dataframe[dataframe["Date"].isna()]
    if not failed_dates.empty:
        print(f"Failed to convert dates in {csv_file}")
        print(failed_dates)

    # Drop rows with missing or invalid dates
    dataframe = dataframe.dropna(subset=["Date"])

    # Extract only the date part
    dataframe["Date"] = pd.to_datetime(dataframe["Date"]).dt.date

    # Filter out short messages (less than 5 characters)
    dataframe = dataframe[dataframe["Message"].str.len() >= 5]

    # Compute message statistics
    message_counts = dataframe.groupby(["Date", "User"]).size()

    # Detect and count images, files, and links
    dataframe["HasImage"] = dataframe["Message"].str.contains("사진|image", case=False, na=False)
    dataframe["HasFile"] = dataframe["Message"].str.contains("파일|file", case=False, na=False)
    dataframe["HasLink"] = dataframe["Message"].str.contains(r"http", na=False)

    # Combine media-related features
    dataframe["MediaFlag"] = dataframe["HasImage"] | dataframe["HasFile"] | dataframe["HasLink"]
    media_counts = dataframe.groupby(["Date", "User"])["MediaFlag"].sum()

    # Count questions
    dataframe["IsQuestion"] = dataframe["Message"].str.contains(r"\\?", na=False)
    question_counts = dataframe.groupby(["Date", "User"])["IsQuestion"].sum()

    # Calculate average message length
    dataframe["MessageLength"] = dataframe["Message"].str.len()
    avg_message_length = dataframe.groupby(["Date", "User"])["MessageLength"].mean()

    # Calculate message ratio per user per date
    total_daily_messages = dataframe.groupby("Date").size()
    message_ratios = (dataframe.groupby(["Date", "User"]).size() / total_daily_messages).reset_index(name="MessageRatio")

    # Create a daily summary dataframe
    daily_summary = pd.DataFrame({
        "MessageCount": message_counts,
        "MessageRatio": message_ratios.set_index(["Date", "User"])["MessageRatio"],
        "AvgMessageLength": avg_message_length
    }).fillna(0).reset_index()

    # Ensure integer columns are properly cast
    daily_summary["MessageCount"] = daily_summary["MessageCount"].astype(int)

    # Group data into 3-day intervals
    unique_dates = sorted(daily_summary["Date"].unique())
    group_count = ceil(len(unique_dates) / 3)
    grouped_summaries = []

    for group_index in range(group_count):
        group_start = group_index * 3
        group_end = group_start + 3
        group_dates = unique_dates[group_start:group_end]

        # Filter data for the current group
        group_data = daily_summary[daily_summary["Date"].isin(group_dates)]
         
        # Calculate group-level message statistics
        group_message_counts = message_counts.loc[message_counts.index.get_level_values("Date").isin(group_dates)]
        total_group_message = group_message_counts.groupby("Date").sum().sum()
        user_message_counts = group_message_counts.groupby("User").sum()
        user_message_ratios = user_message_counts / total_group_message
        
        # Calculate group-level media statistics
        group_media_counts = media_counts.loc[media_counts.index.get_level_values("Date").isin(group_dates)]
        total_group_media = group_media_counts.groupby("Date").sum().sum()
        user_media_counts = group_media_counts.groupby("User").sum()
        user_media_ratios = user_media_counts / total_group_media

        # Calculate group-level question statistics
        group_question_counts = question_counts.loc[question_counts.index.get_level_values("Date").isin(group_dates)]
        total_group_questions = group_question_counts.groupby("Date").sum().sum()
        user_question_counts = group_question_counts.groupby("User").sum()
        user_question_ratios = user_question_counts / total_group_questions

        # Add media and question ratios to the data
        group_data = group_data.set_index("User")
        group_data["MessageRatio"] = user_message_ratios.fillna(0)
        group_data["MediaRatio"] = user_media_ratios.fillna(0)
        group_data["QuestionRatio"] = user_question_ratios.fillna(0)

        # Calculate participation score
        participation_score = dataframe.groupby("User")["Date"].nunique() / len(group_dates)
        group_data["ParticipationScore"] = group_data.index.map(participation_score.fillna(0))

        # Aggregate data
        aggregated_data = group_data.groupby("User").agg({
            # "MessageCount": "sum",
            "MessageRatio": "mean",
            "MediaRatio": "mean",
            "QuestionRatio": "mean",
            "AvgMessageLength": "mean",
            "ParticipationScore": "mean"
        }).reset_index()

        # Add date range to the group data
        # aggregated_data["StartDate"] = group_dates[0]
        # aggregated_data["EndDate"] = group_dates[-1] if len(group_dates) > 1 else group_dates[0]

        grouped_summaries.append(aggregated_data)

        # Save group data to a CSV file
        group_file_name = f"{output_file_prefix}_3day_group{group_index + 1}.csv"
        group_file_path = os.path.join(test_processed, group_file_name)
        aggregated_data.to_csv(group_file_path, index=False, encoding="utf-8-sig")
        print(f"Saved: {group_file_path}")

        total_files_processed += 1

print(f"Total files processed: {total_files_processed}")
