# In [None]:  (Jupyter cell marker, commented out so the file parses as Python)
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

# Load the dataset; point file_path at your own CSV to reuse this script.
file_path = 'data/fake_structured.csv'
df = pd.read_csv(file_path)

# Order rows chronologically within each person so the shift()/rolling()
# features below look at neighbouring observations of the same person.
# NOTE(review): 'actual_date' is sorted with whatever dtype read_csv inferred;
# if it is a non-ISO date string the order is lexicographic -- confirm.
df = df.sort_values(by=['people_id', 'actual_date'])

# Lag features: the same person's TotalIndex one and two rows earlier
# (NaN for the first one/two rows of every person).
index_by_person = df.groupby('people_id')['TotalIndex']
df['index_prev1'] = index_by_person.shift(1)
df['index_prev2'] = index_by_person.shift(2)

# Trailing mean over the current row and the two before it, per person.
# The window is 3 *rows*; it equals 3 days only if there is exactly one row
# per person per day. reset_index(0, drop=True) removes the people_id level
# that groupby().rolling() adds, so the result aligns on df's own index.
df['index_3day_mean'] = index_by_person.rolling(3).mean().reset_index(0, drop=True)

# Change since the previous row, and a flag for two consecutive increases.
df['index_delta'] = df['TotalIndex'] - df['index_prev1']
df['escalating'] = (df['index_delta'] > 0) & (df['index_prev1'] > df['index_prev2'])

# Candidate alternative target: 1 if any of the person's next three rows has
# IncidentNextDay set.
# fill_value=False keeps the shifted series boolean at the end of each
# person's history; shift().fillna(False) would go through object dtype and
# depend on pandas' deprecated silent downcasting.
# NOTE(review): if IncidentNextDay on a row already refers to the following
# day, this window skips that day (it starts at shift(-1), not shift(0)) --
# confirm the intended horizon.
incident = df['IncidentNextDay'].astype(bool)
incident_by_person = incident.groupby(df['people_id'])
next1, next2, next3 = (
    incident_by_person.shift(-step, fill_value=False) for step in (1, 2, 3)
)
df['IncidentInNext3Days'] = (next1 | next2 | next3).astype(int)

# Keep everything from the seventh column onwards as modelling columns.
# NOTE(review): presumably the first six columns are identifiers/dates; this
# positional slice silently changes meaning if the CSV column order changes.
df = df.iloc[:, 6:]

# Column to predict; change this if yours is named differently.
target_col = 'IncidentNextDay'

# 'IncidentInNext3Days' is derived from future values of 'IncidentNextDay'.
# Leaving either label column among the predictors leaks the answer into the
# model, so both are excluded from the feature matrix.
label_cols = {target_col, 'IncidentInNext3Days'}
feature_cols = [col for col in df.columns if col not in label_cols]

# The lag/rolling features are NaN on each person's earliest rows. Filter
# those rows out *before* splitting so train and test are treated the same:
# dropping them from the training set only leaves NaNs in X_test, which
# raises at predict time on scikit-learn releases without native NaN support.
complete_rows = df[feature_cols].notna().all(axis=1)

# Split into features and target
X = df.loc[complete_rows, feature_cols]
y = df.loc[complete_rows, target_col]

smote = SMOTE(random_state=42)

# Train-test split. stratify=y keeps the (imbalanced) class ratio identical
# in both partitions so the test metrics are not driven by a lucky draw.
# NOTE(review): rows of the same person land on both sides of a random
# split, and the lag features overlap in time -- consider a per-person or
# chronological split if that matters for the use case.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the minority class on the training partition only; the test
# partition keeps its natural class distribution.
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Initialize and train model. After SMOTE the classes are already equal in
# size, so class_weight='balanced' yields uniform weights; it is kept so the
# model stays weighted if the resampling step is ever removed.
clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
clf.fit(X_train_bal, y_train_bal)

# Score the held-out rows with the probability of the second class in
# clf.classes_ (column 1 of predict_proba) and flag a row as positive when
# that probability exceeds a custom cut-off instead of the default 0.5.
decision_threshold = 0.2
y_probs = clf.predict_proba(X_test)[:, 1]
y_pred_custom = (y_probs > decision_threshold).astype(int)

# Headline metrics for the thresholded predictions.
accuracy = accuracy_score(y_test, y_pred_custom)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred_custom)
print("\nClassification Report:\n", report)

# Confusion matrix rendered as an annotated heatmap
# (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred_custom)
class_labels = ['No Incident', 'Incident']
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

# Optional: rank the fitted forest's feature importances (largest first)
# and draw them as a horizontal bar chart, one bar per column of X.
feature_names = X.columns
importances = pd.Series(clf.feature_importances_, index=feature_names)
importances = importances.sort_values(ascending=False)

importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances.values, y=importances.index, ax=importance_ax)
importance_ax.set_title("Feature Importances (Classification)")
importance_fig.tight_layout()
plt.show()
