# In [None]:
import os
import warnings

import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, StratifiedKFold

# Locations of the two input CSVs and of the prediction file written at the end
train_data_path = r"G:/My Drive/bak 2024/datasets/Santander Customer Transaction Prediction/train-8.csv"
test_data_path = r"G:/My Drive/bak 2024/datasets/Santander Customer Transaction Prediction/test-9.csv"
output_path = r"G:/My Drive/bak 2024/datasets/Santander Customer Transaction Prediction/aux-4.csv"

# Read both datasets fully into memory
df_train, df_test = pd.read_csv(train_data_path), pd.read_csv(test_data_path)

# The first training column is used as the label; every remaining column is a predictor
target, features = df_train.iloc[:, 0], df_train.iloc[:, 1:]

# Define the function to maximize absolute correlation
def correlation_function(coefficients, features, target):
    """Return the negated absolute Pearson correlation of a linear projection.

    The feature matrix is projected onto ``coefficients`` and the projection
    is correlated with ``target``.  The result is negated so that a minimizer
    ends up maximizing the absolute correlation.

    Args:
        coefficients: 1-D weight vector with one entry per feature column.
        features: 2-D feature matrix (ndarray or DataFrame) supporting ``@``.
        target: 1-D sequence with one value per feature row.

    Returns:
        ``-abs(r)`` where ``r`` is the Pearson correlation, or ``0.0`` when
        ``r`` is undefined because the projection or the target is constant.
    """
    dot_product = features @ coefficients
    # A constant projection (e.g. all-zero coefficients) has zero variance;
    # np.corrcoef then divides 0 by 0, yielding NaN plus a RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        correlation = np.corrcoef(target, dot_product)[0, 1]
    if not np.isfinite(correlation):
        # Treat "undefined" as "no correlation" so the optimizer never sees NaN.
        return 0.0
    return -np.abs(correlation)

# Optimize coefficients
def optimize_coefficients(features, target):
    """Search for projection weights that minimize ``correlation_function``.

    Starts from a reproducible random point drawn uniformly from ``[-1, 1)``
    and runs SciPy's derivative-free Powell method.

    Args:
        features: 2-D feature matrix; ``features.shape[1]`` sets the number
            of weights.
        target: 1-D sequence of labels passed through to the objective.

    Returns:
        1-D ndarray holding the weights found by the optimizer (returned even
        when the optimizer reports that it did not converge).

    Warns:
        RuntimeWarning: if the optimizer reports ``success`` as false.
    """
    # A private fixed-seed generator yields the same starting point as
    # np.random.seed(0) without resetting NumPy's global random state.
    rng = np.random.RandomState(0)
    initial_guess = rng.uniform(-1, 1, features.shape[1])
    result = minimize(correlation_function, initial_guess, args=(features, target), method="Powell")
    if not result.success:
        # Surface early termination instead of silently using the weights.
        warnings.warn(
            f"Powell optimization did not converge: {result.message}",
            RuntimeWarning,
            stacklevel=2,
        )
    return result.x

# Learn the projection weights once, then collapse each row of both datasets to a single score
coefficients = optimize_coefficients(features, target)
# NOTE(review): presumably the test file holds exactly the feature columns, in
# the same order as the training features — confirm against the data files.
transformed_train_features = features.dot(coefficients)
transformed_test_features = df_test.dot(coefficients)

# Train linear regression model
def train_linear_model(features, target):
    """Fit a linear regression of ``target`` on a single feature.

    Args:
        features: 1-D array-like of feature values (ndarray, pandas Series or
            list); it is reshaped into a single-column matrix.
        target: Sequence of values to regress on, one per feature value.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    # np.asarray lets Series and lists through; the estimator needs a 2-D
    # (n_samples, 1) matrix rather than a flat vector.
    design_matrix = np.asarray(features).reshape(-1, 1)
    model = LinearRegression()
    model.fit(design_matrix, target)
    return model

# Materialize both projections as plain NumPy arrays, then fit the model on the training scores
transformed_train_features, transformed_test_features = (
    np.array(transformed_train_features),
    np.array(transformed_test_features),
)
model = train_linear_model(transformed_train_features, target)

# Cross-validation to find the best threshold
def find_best_threshold(model, features, target):
    """Pick the cut-off that maximizes accuracy of cross-validated predictions.

    Out-of-fold predictions are produced with a shuffled 5-fold stratified
    split (fixed seed 42), then binarized at 101 evenly spaced thresholds
    from 0 to 1 and scored against ``target``.

    Args:
        model: Estimator handed to ``cross_val_predict``.
        features: 1-D array-like of feature values (ndarray, pandas Series or
            list); it is reshaped into a single-column matrix.
        target: Labels, used both for stratification and for scoring.

    Returns:
        Tuple ``(threshold, accuracy)``.  ``accuracy`` is the raw
        ``accuracy_score`` value (a fraction, not a percentage).  On ties the
        lowest threshold wins because ``np.argmax`` returns the first maximum.
    """
    # np.asarray lets Series and lists through; the estimator needs 2-D input.
    design_matrix = np.asarray(features).reshape(-1, 1)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_predictions = cross_val_predict(model, design_matrix, target, cv=skf)
    thresholds = np.linspace(0, 1, 101)
    accuracy_scores = [accuracy_score(target, np.where(cv_predictions >= t, 1, 0)) for t in thresholds]
    best_threshold_index = np.argmax(accuracy_scores)
    return thresholds[best_threshold_index], accuracy_scores[best_threshold_index]

best_threshold, best_accuracy = find_best_threshold(model, transformed_train_features, target)

# best_accuracy is the unscaled accuracy_score value (a fraction), so scale it
# by 100 to agree with the "%" suffix in the message.
print(f"Best threshold found: {best_threshold:.2f} with accuracy: {best_accuracy * 100:.2f}%")

# Predict and apply threshold on training data
def apply_threshold(model, features, threshold):
    """Binarize a model's predictions at ``threshold``.

    Args:
        model: Object exposing ``predict`` that accepts a single-column
            2-D array.
        features: 1-D array-like of feature values (ndarray, pandas Series or
            list); it is reshaped into a single-column matrix.
        threshold: Cut-off; predictions ``>= threshold`` become 1, others 0.

    Returns:
        Integer ndarray of 0/1 labels with the shape of the model's
        predictions.
    """
    # np.asarray lets Series and lists through; predict needs 2-D input.
    design_matrix = np.asarray(features).reshape(-1, 1)
    predictions = model.predict(design_matrix)
    return np.where(predictions >= threshold, 1, 0)

# In-sample check: label the training rows with the chosen threshold and score them
train_thresholded = apply_threshold(model, transformed_train_features, best_threshold)
train_accuracy = 100 * accuracy_score(target, train_thresholded)

print(f"Training accuracy ratio as a percent: {train_accuracy:.2f}%")

# Label the test rows using the threshold chosen during cross-validation
test_thresholded = apply_threshold(model, transformed_test_features, best_threshold)

# Persist the 0/1 labels as a single-column CSV, without the row index
output_df = pd.DataFrame({"prediction": test_thresholded})
output_df.to_csv(output_path, index=False)

# Echo the labels that were written
print("Predicted values with threshold applied on the test set:", test_thresholded, sep="\n")

# Verify the file was saved correctly (os is imported with the other modules
# at the top of the file rather than mid-script)
if os.path.isfile(output_path):
    print(f"The file {output_path} has been created successfully.")
else:
    print(f"Failed to create the file {output_path}.")