In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.inspection import partial_dependence

# Matplotlib text configuration -- applied before any figure is created.
plt.rcParams.update({
    # Font family used for all plot text (presumably chosen so the Korean
    # column names render correctly -- confirm on the target machine).
    'font.family': 'Malgun Gothic',
    # Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
    'axes.unicode_minus': False,
})

# Load the source table in a single pass (low_memory=False) so column dtypes
# are inferred from the whole file rather than chunk by chunk.
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
df = pd.read_csv(
    'C:/Users/User/kyushu_with_weather_with_restart_counter.csv',
    low_memory=False,
)

In [3]:
# --- Normalise the 날짜 column ---
# Parse 날짜 as datetimes; values that cannot be parsed are coerced to NaT so
# they can be counted and discarded.
df['날짜'] = pd.to_datetime(df['날짜'], errors='coerce')
invalid_count = df['날짜'].isna().sum()
if invalid_count:
    print(f"⚠️  {invalid_count} rows have invalid 날짜 values and will be dropped.")
    # Keep only rows whose 날짜 parsed successfully (index labels are left as-is).
    df = df.dropna(subset=['날짜'])

# --- Repair dates ---
# Each entry becomes a pandas Timestamp at midnight of the given day.
repair_dates = list(map(pd.to_datetime, (
    '2024-01-22', '2024-02-14', '2024-03-11', '2024-04-17', '2024-05-22',
    '2024-06-13', '2024-07-13', '2024-08-30', '2024-09-08', '2024-09-16',
    '2024-10-07', '2024-10-21', '2024-11-04', '2024-11-29', '2024-12-20',
)))

# Number of distinct calendar days directly preceding each repair whose rows
# are discarded.
DAYS_BEFORE_REPAIR = 3


def _days_preceding_repair(row_days, repair_day, n_days=DAYS_BEFORE_REPAIR):
    """Collect the distinct days found just before the first ``repair_day`` row.

    The search is purely positional: it walks backwards through ``row_days``
    starting at the row immediately before the first occurrence of
    ``repair_day`` and gathers distinct days until ``n_days`` of them have
    been found or the start of the data is reached.

    Parameters
    ----------
    row_days : list of datetime.date
        Calendar day of every row, in frame order.
    repair_day : datetime.date
        Day of the repair.
    n_days : int, optional
        How many distinct preceding days to collect.

    Returns
    -------
    set of datetime.date or None
        The collected days (possibly fewer than ``n_days``), or ``None`` when
        ``repair_day`` does not occur in ``row_days``.
    """
    try:
        first_pos = row_days.index(repair_day)
    except ValueError:
        return None

    preceding = set()
    # Every row before ``first_pos`` is by construction not the repair day,
    # so no explicit "skip same day" check is required.
    for row_day in reversed(row_days[:first_pos]):
        if row_day in preceding:
            continue
        if len(preceding) >= n_days:
            break  # an additional distinct day was reached -> stop
        preceding.add(row_day)
    return preceding


# Set to store all index labels to remove
indices_to_remove = set()

# Dictionary for logging: repair_date -> [dates removed]
removal_log = {}

# Calendar day of every row, computed once. The list is used for positional
# look-ups, so the logic no longer assumes that index labels are contiguous
# integers starting at 0 (they are not after invalid rows have been dropped).
row_day_series = df['날짜'].dt.date
row_days = row_day_series.tolist()

# Loop through each repair date
for repair_date in repair_dates:
    print(f"\n📅 Processing repair date: {repair_date}")

    dates_to_remove = _days_preceding_repair(row_days, repair_date.date())
    if dates_to_remove is None:
        print(f"⚠️  Repair date {repair_date} not found in dataset.")
        continue

    # Log the dates to remove
    removal_log[repair_date] = sorted(dates_to_remove)
    print(f"🗑️  Will remove rows with these dates before {repair_date}: {removal_log[repair_date]}")

    # Collect the index labels of every row that falls on one of those days
    rows_to_drop = df.index[row_day_series.isin(dates_to_remove).to_numpy()]
    print(f"🧾 Rows to remove: {len(rows_to_drop)}")
    indices_to_remove.update(rows_to_drop)

# Drop every collected row, then order the remainder by 로개회수.
print(f"\n✅ Total rows to remove: {len(indices_to_remove)}")
df_cleaned = (
    df
    .drop(index=indices_to_remove)
    .sort_values(by='로개회수')
)

# Persist the cleaned table to the working directory. The DataFrame index is
# written as the first column (pandas default, no index=False given).
df_cleaned.to_csv('kyushu_with_weather_with_restart_counter_post_repair.csv')

# Recap: one line per repair date that was found, listing the removed days.
print("\n📌 Summary of removed date blocks:")
for logged_repair, logged_days in removal_log.items():
    print(f"- {logged_repair}: {logged_days}")



📅 Processing repair date: 2024-01-22 00:00:00
🗑️  Will remove rows with these dates before 2024-01-22 00:00:00: [datetime.date(2024, 1, 19), datetime.date(2024, 1, 20), datetime.date(2024, 1, 21)]
🧾 Rows to remove: 34

📅 Processing repair date: 2024-02-14 00:00:00
🗑️  Will remove rows with these dates before 2024-02-14 00:00:00: [datetime.date(2024, 2, 11), datetime.date(2024, 2, 12), datetime.date(2024, 2, 13)]
🧾 Rows to remove: 40

📅 Processing repair date: 2024-03-11 00:00:00
🗑️  Will remove rows with these dates before 2024-03-11 00:00:00: [datetime.date(2024, 2, 23), datetime.date(2024, 2, 24), datetime.date(2024, 2, 25)]
🧾 Rows to remove: 40

📅 Processing repair date: 2024-04-17 00:00:00
🗑️  Will remove rows with these dates before 2024-04-17 00:00:00: [datetime.date(2024, 4, 14), datetime.date(2024, 4, 15), datetime.date(2024, 4, 16)]
🧾 Rows to remove: 32

📅 Processing repair date: 2024-05-22 00:00:00
🗑️  Will remove rows with these dates before 2024-05-22 00:00:00: [datetime.d