In [None]:
import os
import pickle

import pandas as pd
import psycopg2
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Function to evaluate model performance
def evaluate_model(y_true, y_pred):
    """Score a set of predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``. Each value comes from
        the scikit-learn metric of the same name, called with its default
        arguments.
    """
    # Order matters: callers unpack the result positionally.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

print("Connecting to PostgreSQL database...")
# PostgreSQL connection details.
# Every value can be overridden with a CHURN_DB_* environment variable; the
# literals below are only fallbacks so existing runs keep working unchanged.
host = os.environ.get(
    'CHURN_DB_HOST',
    'database-dmml.cluster-czyuk8c4op6k.eu-north-1.rds.amazonaws.com',
)
port = int(os.environ.get('CHURN_DB_PORT', '5432'))
database = os.environ.get('CHURN_DB_NAME', 'postgres')
username = os.environ.get('CHURN_DB_USER', 'postgres')
# NOTE(review): a database password is committed in source. Supply it through
# CHURN_DB_PASSWORD, rotate the credential, and then delete this fallback.
password = os.environ.get('CHURN_DB_PASSWORD', 'dmml-project-postgres')
schema = 'public'
table_name = 'customer_churn_analysis'

# Establish database connection
conn = psycopg2.connect(
    host=host,
    port=port,
    database=database,
    user=username,
    password=password
)
print("Successfully connected to the database.")

# try/finally guarantees the connection is released even when the query or the
# DataFrame conversion raises. (psycopg2's `with conn:` only ends the
# transaction, it does not close the connection.)
try:
    # Query to fetch data from the database. `table_name` is the constant
    # defined above, not user input, so interpolating it here is safe.
    query = f"SELECT * FROM {table_name}"
    print(f"Executing query: {query}")
    df = pd.read_sql(query, conn)
    print("Data loaded successfully from PostgreSQL.")
finally:
    # Close database connection
    conn.close()
    print("Database connection closed.")

print("Preprocessing the data...")
# Target (y) is the 'churn' column; features (X) are every remaining column
# except the 'customerid' identifier.
y = df['churn']
X = df.drop(columns=['customerid', 'churn'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Data split completed.")

# Standardize the features. The scaler is fitted on the training rows only and
# then applied, unchanged, to both partitions.
print("Applying StandardScaler to normalize features...")
scaler = StandardScaler().fit(X_train)
X_train, X_test = (scaler.transform(part) for part in (X_train, X_test))
print("Feature scaling completed.")

# Initialize models
print("Initializing models...")
logistic_regression = LogisticRegression()
# Seed the forest with the same value used for the train/test split so that
# the reported metrics and the saved model are reproducible between runs.
random_forest = RandomForestClassifier(random_state=42)

# Train Logistic Regression Model
print("Training Logistic Regression model...")
logistic_regression.fit(X_train, y_train)
print("Logistic Regression model training completed.")

# Predict on the held-out test rows using Logistic Regression
print("Making predictions using Logistic Regression...")
y_pred_lr = logistic_regression.predict(X_test)
print("Predictions completed.")

# Train Random Forest Model
print("Training Random Forest model...")
random_forest.fit(X_train, y_train)
print("Random Forest model training completed.")

# Predict on the held-out test rows using Random Forest
print("Making predictions using Random Forest...")
y_pred_rf = random_forest.predict(X_test)
print("Predictions completed.")

# Display names for the four scores, in the order evaluate_model returns them.
metric_labels = ("Accuracy", "Precision", "Recall", "F1 Score")

# Logistic Regression Evaluation
print("Evaluating Logistic Regression model...")
lr_scores = evaluate_model(y_test, y_pred_lr)
accuracy_lr, precision_lr, recall_lr, f1_lr = lr_scores
print("Logistic Regression Evaluation:")
for metric_label, score in zip(metric_labels, lr_scores):
    print(f"{metric_label}: {score:.2f}")

# Random Forest Evaluation
print("Evaluating Random Forest model...")
rf_scores = evaluate_model(y_test, y_pred_rf)
accuracy_rf, precision_rf, recall_rf, f1_rf = rf_scores
print("Random Forest Evaluation:")
for metric_label, score in zip(metric_labels, rf_scores):
    print(f"{metric_label}: {score:.2f}")

# Save the trained models
print("Saving trained models to disk...")
with open('logreg_model.pkl', 'wb') as f:
    pickle.dump(logistic_regression, f)
print("Logistic Regression model saved successfully.")

with open('rf_model.pkl', 'wb') as f:
    pickle.dump(random_forest, f)
print("Random Forest model saved successfully.")

# Both models were trained on standardized features, so the fitted scaler must
# be saved with them: new data has to go through the same transform before
# either model can score it.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("Feature scaler saved successfully.")

print("Script execution completed successfully.")
