In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score


# Load the data
data = pd.read_csv('../Data/data.csv', sep=',')

# Object for target variable
y = data.churn

# Object for input features
X = data.drop(['churn'], axis=1)

# Display shapes of X and y
print(X.shape, y.shape)



(10000, 11) (10000,)


In [2]:
# List numerical features
num_columns = X.select_dtypes(include='number').columns.tolist()
print("Numerical features:", num_columns)

# List categorical features
cat_columns = X.select_dtypes(include='object').columns.tolist()
print("Categorical features:", cat_columns)



Numerical features: ['customer_id', 'credit_score', 'age', 'tenure', 'balance', 'products_number', 'credit_card', 'active_member', 'estimated_salary']
Categorical features: ['country', 'gender']


In [3]:
random_state = 10

# Split X and y into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=random_state,
                                                    stratify=data.churn)

# Print number of observations in X_train, X_test, y_train, and y_test
print(len(X_train), len(X_test), len(y_train), len(y_test))





7000 3000 7000 3000


In [4]:
# Define the column transformer
preprocess = make_column_transformer(
    (MinMaxScaler(), num_columns),
    (OneHotEncoder(sparse_output=False), cat_columns)
)



In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(random_state=random_state)
}

# Train and evaluate models
results = {}
classification_reports = {}

for name, model in models.items():
    pipeline = make_pipeline(preprocess, model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    
    # Generate classification report
    report = classification_report(y_test, y_pred, target_names=['0', '1'])
    classification_reports[name] = report

# Convert results to DataFrame
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])
print(results_df)

# Print classification reports
for model_name, report in classification_reports.items():
    print(f"\nClassification Report for {model_name}:\n")
    print(report)


                    Model  Accuracy
0     Logistic Regression  0.807667
1  Support Vector Machine  0.834000
2           Random Forest  0.861667

Classification Report for Logistic Regression:

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      2389
           1       0.58      0.20      0.30       611

    accuracy                           0.81      3000
   macro avg       0.70      0.58      0.59      3000
weighted avg       0.78      0.81      0.77      3000


Classification Report for Support Vector Machine:

              precision    recall  f1-score   support

           0       0.83      0.99      0.90      2389
           1       0.86      0.22      0.35       611

    accuracy                           0.83      3000
   macro avg       0.85      0.61      0.63      3000
weighted avg       0.84      0.83      0.79      3000


Classification Report for Random Forest:

              precision    recall  f1-score   support

   

## Conclution : 
best model is random forest but sisnce our data is highly imalanced we see label 1 has low recall, so i will try to use optimisation technique for the three models we have to balance the labels