In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [4]:
import sys
from pathlib import Path

# Add the src directory to the Python path
sys.path.append(str(Path().resolve().parent))

# Now you can import the clean_data function
from data_cleaning import drop_columns, impute_missing_values
from data_preprocessing import preprocess_data

data = pd.read_csv("../input/train_folds.csv")
df = drop_columns(data)

In [5]:
for fold in range(5):
    fold_df = impute_missing_values(df, fold)
    fold_df = impute_missing_values(df, 0)
    fold_df = preprocess_data(fold_df, fold=0)
    # drop the kfold column
    fold_df = fold_df.drop("kfold", axis=1)

    # Split the data into X and y
    X = fold_df.drop("Converted", axis=1)
    y = fold_df["Converted"]
    # Split the data into test and train
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the logistic regression model
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)

    # Make predictions
    y_pred = log_reg.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print(f"Classification Report: {classification_report(y_test, y_pred)}")
    print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred)}")

Accuracy: 0.797077922077922
Classification Report:               precision    recall  f1-score   support

           0       0.82      0.86      0.84      1150
           1       0.75      0.70      0.72       698

    accuracy                           0.80      1848
   macro avg       0.79      0.78      0.78      1848
weighted avg       0.80      0.80      0.80      1848

Confusion Matrix: [[985 165]
 [210 488]]
Accuracy: 0.797077922077922
Classification Report:               precision    recall  f1-score   support

           0       0.82      0.86      0.84      1150
           1       0.75      0.70      0.72       698

    accuracy                           0.80      1848
   macro avg       0.79      0.78      0.78      1848
weighted avg       0.80      0.80      0.80      1848

Confusion Matrix: [[985 165]
 [210 488]]
Accuracy: 0.797077922077922
Classification Report:               precision    recall  f1-score   support

           0       0.82      0.86      0.84      1150
  

In [9]:
# Set display options to show all rows and columns
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [12]:
from sklearn.feature_selection import RFE
from sklearn.metrics import precision_score

# for fold in range(5):

fold = 0
fold_df = impute_missing_values(df, fold)
fold_df = impute_missing_values(df, 0)
fold_df = preprocess_data(fold_df, fold=0)
# drop the kfold column
fold_df = fold_df.drop("kfold", axis=1)

# Split the data into X and y
X = fold_df.drop("Converted", axis=1)
y = fold_df["Converted"]
# Split the data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
log_reg = LogisticRegression()
rfe = RFE(log_reg, n_features_to_select=10)
rfe.fit(X_train, y_train)

# Extract the ranking of the features
ranking = pd.Series(rfe.ranking_, index=X_train.columns)
print(f"Feature ranking: \n{ranking.sort_index()}")

# Continue with the model trained on the selected features
selected_features = X_train.columns[rfe.support_]
print(f"Selected features: {selected_features}")
log_reg.fit(X_train[selected_features], y_train)

# Make predictions
y_pred = log_reg.predict(X_test[selected_features])
y_pred_proba = log_reg.predict_proba(X_test[selected_features])[:, 1] # Probability of being 1

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")
print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred)}")

Feature ranking: 
A free copy of Mastering The Interview                54
Digital Advertisement                                 24
Do Not Call                                           30
Do Not Email                                           1
Last Activity_Converted to Lead                        1
Last Activity_Email Bounced                            4
Last Activity_Email Link Clicked                      51
Last Activity_Email Marked Spam                       27
Last Activity_Email Opened                            44
Last Activity_Email Received                          13
Last Activity_Form Submitted on Website               14
Last Activity_Had a Phone Conversation                 2
Last Activity_Olark Chat Conversation                  1
Last Activity_Page Visited on Website                  5
Last Activity_Resubscribed to emails                  19
Last Activity_SMS Sent                                20
Last Activity_Unreachable                             10
Last Activity