# In [None]:
# 04_model_training.ipynb

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Load data
# Read the training CSV from its local path.
df = pd.read_csv(r"D:\in\Task_2\Dataset\fraud_train.csv")

# The 'is_fraud' column is the target; every other column is a feature.
y = df['is_fraud']
X = df.drop(columns=['is_fraud'])

# Train-test split
# Hold out 20% of the rows. Stratifying on the target keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Reduce high-cardinality categories
# Work on explicit copies before assigning columns: depending on the pandas
# and scikit-learn versions, the frames returned by train_test_split can be
# flagged as slices of X, and in-place column assignment on them emits
# SettingWithCopyWarning. Copying also guarantees X itself is never touched.
X_train = X_train.copy()
X_test = X_test.copy()

# Number of most frequent levels kept per categorical column.
MAX_CATEGORIES = 100

for col in X_train.select_dtypes(include='object').columns:
    # The kept vocabulary is derived from the training split only and then
    # applied unchanged to the test split.
    top_categories = X_train[col].value_counts().head(MAX_CATEGORIES).index
    for frame in (X_train, X_test):
        # Anything outside the kept set becomes 'Other'. This includes
        # missing values, because value_counts() skips them by default and
        # so they never appear in top_categories.
        frame[col] = frame[col].where(
            frame[col].isin(top_categories), 'Other'
        )

# Preprocessing pipeline
# Column groups are picked by dtype on the training frame.
# NOTE(review): columns whose dtype is neither int64/float64 nor object
# (e.g. bool, int32, datetime) match neither selector and are left to the
# ColumnTransformer's `remainder` setting -- confirm that is intended.
numeric_frame = X_train.select_dtypes(include=['int64', 'float64'])
categorical_frame = X_train.select_dtypes(include='object')
num_cols = numeric_frame.columns
cat_cols = categorical_frame.columns

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with handle_unknown='ignore' so categories unseen during fit do not raise
# at transform time.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step]
)

# Models
from sklearn.base import clone

# Pipeline does not copy its steps, so passing the same `preprocessor` object
# to both pipelines would make them share one transformer: refitting or
# re-configuring one pipeline would silently change the other. The logistic
# pipeline keeps the original object (so `preprocessor` is still fitted after
# training, as before) and the random forest gets its own unfitted clone.
lr_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        solver='saga',
        # 'saga' shuffles the data, so seed it like the split and the forest
        # to make training reproducible from run to run.
        random_state=42,
    ))
])

rf_pipeline = Pipeline([
    ('preprocessing', clone(preprocessor)),
    ('model', RandomForestClassifier(
        n_estimators=100, class_weight='balanced', random_state=42
    ))
])

# Train
# Fit each pipeline (preprocessing step plus classifier) on the training
# split: logistic regression first, then the random forest.
for pipeline in (lr_pipeline, rf_pipeline):
    pipeline.fit(X_train, y_train)

# Save splits & models
import os

import joblib

# Output folders, relative to the current working directory.
DATA_DIR = "../data"
MODEL_DIR = "../models"

# Neither DataFrame.to_csv nor joblib.dump creates missing directories, so
# make sure both exist before writing anything.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# Persist the (category-reduced) splits so later steps reuse exactly the same
# rows; index=False keeps the row index out of the files.
X_train.to_csv(f"{DATA_DIR}/X_train.csv", index=False)
X_test.to_csv(f"{DATA_DIR}/X_test.csv", index=False)
y_train.to_csv(f"{DATA_DIR}/y_train.csv", index=False)
y_test.to_csv(f"{DATA_DIR}/y_test.csv", index=False)

# Persist the fitted pipelines (preprocessing + classifier) as single objects.
joblib.dump(lr_pipeline, f"{MODEL_DIR}/logistic_model.pkl")
joblib.dump(rf_pipeline, f"{MODEL_DIR}/random_forest_model.pkl")

# The original message held a mis-decoded check mark; the escape keeps the
# source ASCII-only so the character cannot be garbled again.
print("\u2714 Models and splits saved")
