In [10]:
import numpy as np
import pandas as pd
import re
from datetime import datetime
from collections import defaultdict

# Define the function to parse date and interval
def parse_date_interval(cell):
    """Extract the event date and the day interval from one table cell.

    The interval is the integer inside the first parenthesised group that
    starts with an optional "+" and digits and ends with "일" (e.g. "(+175일)").
    The date is read from the text following the first ")" in the cell:
    any further parenthesised groups and the markers "오전"/"오후" are removed
    and the remainder is parsed as ``YYYY.MM.DD``.

    Args:
        cell: Raw cell value. Anything that is not a string (NaN, None,
            numbers, ...) is treated as an empty cell.

    Returns:
        A ``(date, interval)`` tuple. ``date`` is a ``datetime``, or
        ``np.nan`` when the cell has no ")" followed by text or that text is
        not a valid ``YYYY.MM.DD`` date. ``interval`` is an ``int``, or
        ``np.nan`` when no "(N일)" group is present.
    """
    # Missing values and non-text cells have nothing to parse; passing them
    # to re.search would raise TypeError instead of yielding NaN.
    if not isinstance(cell, str):
        return np.nan, np.nan

    # Day interval, e.g. "(+175일)" -> 175
    interval_match = re.search(r"\((\+?\d+).*?일\)", cell)
    interval = int(interval_match.group(1)) if interval_match else np.nan

    # Date: everything after the first closing parenthesis
    date = np.nan
    date_match = re.search(r"\)(.+)", cell)
    if date_match:
        date_str = date_match.group(1).strip()
        date_str = re.sub(r"\(.+?\)", "", date_str)  # drop parenthesised notes
        date_str = date_str.replace("오전", "").replace("오후", "").strip()
        try:
            date = datetime.strptime(date_str, '%Y.%m.%d')
        except ValueError:
            # Leftover text is not a clean date: report it as missing, the
            # same way a cell without any date is reported, rather than
            # aborting the caller's whole loop.
            date = np.nan

    return date, interval

# Read the pre-processed event table from disk
data_path = "processed_data.csv"
df = pd.read_csv(data_path)

# Column name -> per-event values; one entry is appended per input row
new_data = defaultdict(list)

for _, record in df.iterrows():
    new_data['events'].append(record['events'])

    # Every cell after the first column is parsed into a (date, interval) pair
    parsed_cells = [parse_date_interval(value) for value in record[1:]]

    # Keep only the values that were actually present in the cell
    event_dates = [when for when, _ in parsed_cells if not pd.isna(when)]
    day_gaps = [gap for _, gap in parsed_cells if not pd.isna(gap)]

    # Number of occurrences of this event in each calendar year
    yearly_counts = defaultdict(int)
    for when in event_dates:
        yearly_counts[when.year] += 1

    # Summary statistics; NaN stands in when there is nothing to summarise
    if event_dates:
        latest_date = max(event_dates)
    else:
        latest_date = np.nan

    if day_gaps:
        mean_gap = np.mean(day_gaps)
        spread_gap = np.std(day_gaps)
    else:
        mean_gap = np.nan
        spread_gap = np.nan

    new_data['total_event_count'].append(len(event_dates))
    new_data['most_recent_event_date'].append(latest_date)
    new_data['avg_interval'].append(mean_gap)
    new_data['std_interval'].append(spread_gap)

    # One count column per year from 2017 through 2023
    for year in range(2017, 2024):
        new_data[f'event_count_{year}'].append(yearly_counts[year])

# Assemble the per-event summary table
new_df = pd.DataFrame(new_data)

# Show the summary table as the cell output
new_df



Unnamed: 0,events,total_event_count,event_count_2021,event_count_2022,event_count_2023,most_recent_event_date,avg_interval,std_interval
0,경험치 2배,3,0,1,0,2022-05-01,487.666667,301.587282
1,경험치 3배 쿠폰(15분) 2장 지급,3,0,0,1,2023-03-26,644.0,816.713332
2,경험치 3배 쿠폰(30분) 4장 지급,0,0,0,0,NaT,,
3,레이디 블레어의 코디 타임,1,0,1,0,2022-08-14,175.0,0.0
4,룬 경험치 버프 효과 +100%,46,11,11,6,2023-06-18,49.456522,45.280626
5,몬스터 컬렉션 '의문의 모몽' 3개 지급,16,5,4,2,2023-06-04,105.4375,80.669828
6,몬스터 컬렉션 신규 몬스터 등록 확률 추가 100%,21,6,5,2,2023-06-04,95.333333,54.841185
7,몬스터 파크 클리어 경험치 추가 50%,28,6,7,7,2023-07-30,62.75,54.974426
8,미라클 타임,6,1,2,2,2023-07-16,274.166667,89.954464
9,불꽃늑대 퇴장 시 획득하는 경험치 2배,41,9,10,6,2023-06-18,49.341463,30.802763
