# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression


# Train and evaluate a class-weighted logistic regression on one CSV file.
#
# To run over the whole day range, re-enable the loop below and indent the
# rest of this cell under it.
# file_paths_1_to_300 = [f"../../Data/rq23/{day}d/eth_{day}d.csv" for day in range(1, 301)]
# for file_path in file_paths_1_to_300:

# NOTE(review): `file_path` is not assigned anywhere in this cell while the
# loop above is commented out. It must already exist (e.g. from an earlier
# cell); otherwise the next line raises NameError -- confirm.
csv_scam_address = file_path
df = pd.read_csv(csv_scam_address, encoding='latin-1')

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Column 0 is used as the target, columns 1..56 as the feature matrix.
X = df.iloc[:, 1:57]
y = df.iloc[:, 0]

# 70/30 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=17, test_size=0.3, stratify=y
)

# hyperparameters
best_params_lr = {
    'penalty': 'l2',
    'C': 1.0,  # Regularization strength
    'solver': 'liblinear',  # Suitable for smaller datasets
    'random_state': 0,  # Ensure reproducibility
}
best_lr = LogisticRegression(**best_params_lr, class_weight='balanced')

# 5-fold ROC AUC on the training split only; cross_val_score fits clones, so
# `best_lr` itself is still unfitted until the explicit fit() below.
cv_scores_lr = cross_val_score(best_lr, X_train, y_train, cv=5, scoring='roc_auc')
best_lr.fit(X_train, y_train)

# Hard labels for the threshold metrics, plus the probability of the class
# listed second in `best_lr.classes_`.
y_pred_lr = best_lr.predict(X_test)
y_pred_prob_lr = best_lr.predict_proba(X_test)[:, 1]

# Evaluate
print("Logistic Regression Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Precision:", precision_score(y_test, y_pred_lr))
print("Recall:", recall_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
# The cross-validation result used to be computed and then dropped; report it
# so the five extra fits are not wasted.
print("CV ROC AUC (5-fold, train split):", cv_scores_lr.mean(), "+/-", cv_scores_lr.std())