In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import joblib

# Load the dataset
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
dataset_path = "C:/Users/user/Documents/APU FYP/Dataset/cell2celltrain.csv"
df = pd.read_csv(dataset_path)

# Check for NaN values in the target variable 'Churn'.
# This has to happen BEFORE any imputation: 'Churn' is an object column, so a
# most-frequent imputer applied to all object columns would silently replace
# missing labels with the majority class and this check would always print 0.
print(df['Churn'].isna().sum())

# Drop rows where the target variable is NaN (labels must never be imputed)
df = df.dropna(subset=['Churn'])

# Convert target variable 'Churn' to binary
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
if df['Churn'].isna().any():
    # .map() yields NaN for any value that is not exactly 'Yes' or 'No'
    raise ValueError("Unexpected value in 'Churn'; expected only 'Yes' or 'No'")

# Separate features and target
X = df.drop(['CustomerID', 'Churn'], axis=1)
y = df['Churn']

# Split the data into training and testing sets.
# The split is done before imputation so that the imputation statistics are
# learned from the training rows only (no information leaks from the test set).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train = X_train.copy()  # explicit copies: the frames are modified below
X_test = X_test.copy()

num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include=['object']).columns

# Handle missing values
# For numerical columns, fill with the training median
num_imputer = SimpleImputer(strategy='median')
if len(num_cols) > 0:
    X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])
    X_test[num_cols] = num_imputer.transform(X_test[num_cols])

# For categorical columns, fill with the training mode
cat_imputer = SimpleImputer(strategy='most_frequent')
if len(cat_cols) > 0:
    X_train[cat_cols] = cat_imputer.fit_transform(X_train[cat_cols])
    X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

# One-hot encode categorical variables.
# Encoding the recombined frame guarantees train and test share exactly the
# same dummy columns; X stays the full encoded feature frame so later cells
# can keep using X.columns for feature names.
train_index, test_index = X_train.index, X_test.index
X = pd.get_dummies(pd.concat([X_train, X_test]).sort_index(), drop_first=True)
X_train = X.loc[train_index]
X_test = X.loc[test_index]

# Scale the features (statistics fitted on the training set only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


0


In [3]:
from functools import partial

# Apply feature selection
# mutual_info_classif is stochastic (it accepts a random_state); without a
# fixed seed the top-k set can differ from run to run. functools.partial is
# used instead of a lambda so the fitted selector remains picklable.
seeded_mutual_info = partial(mutual_info_classif, random_state=42)

# Selecting top 20 features based on mutual information
selector = SelectKBest(score_func=seeded_mutual_info, k=20)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Get the selected feature names (X holds the encoded column names, in the
# same order as the columns of X_train)
selected_features = X.columns[selector.get_support()].tolist()
print("Selected features based on mutual information:", selected_features)


Selected features based on mutual information: ['MonthlyMinutes', 'ReceivedCalls', 'MonthsInService', 'CurrentEquipmentDays', 'RetentionCalls', 'ServiceArea_AIRMYR843', 'ServiceArea_BOSNSH603', 'ServiceArea_HARLON860', 'ServiceArea_KCYKCK913', 'ServiceArea_OHILAN740', 'ServiceArea_OKCLRK501', 'ServiceArea_PHIBRI856', 'ServiceArea_PHXTUC520', 'ServiceArea_SEAMTV360', 'ServiceArea_SFRPAL650', 'ServiceArea_SFRSFS650', 'HandsetPrice_100', 'HandsetPrice_Unknown', 'CreditRating_2-High', 'Occupation_Other']


In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Apply RFE: narrow the 20 mutual-information features down to 10 using a
# logistic-regression estimator to rank them.
model = LogisticRegression(max_iter=1000, solver='liblinear')
rfe = RFE(model, n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train_selected, y_train)
X_test_rfe = rfe.transform(X_test_selected)

# Get the final selected feature names
final_selected_features = np.array(selected_features)[rfe.support_].tolist()
print("Final selected features after RFE:", final_selected_features)

# Train the final model with regularization
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(penalty='l2', solver='liblinear', max_iter=1000))
])

# Hyperparameter tuning over the inverse regularization strength C
param_grid = {
    'model__C': [0.01, 0.1, 1, 10, 100]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_rfe, y_train)

best_model = grid_search.best_estimator_

# Evaluate the model
# NOTE(review): accuracy alone can be misleading if 'Churn' is imbalanced;
# consider comparing against a majority-class baseline.
y_train_pred = best_model.predict(X_train_rfe)
y_test_pred = best_model.predict(X_test_rfe)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Train Accuracy: ", train_accuracy)
print("Test Accuracy: ", test_accuracy)

# Save the fitted preprocessing steps as well. best_model was trained on data
# that had already passed through scaler -> selector -> rfe, so without these
# objects the saved model cannot be applied to new data after a kernel restart.
joblib.dump(
    {
        'feature_columns': X.columns.tolist(),
        'scaler': scaler,
        'selector': selector,
        'rfe': rfe,
        'final_features': final_selected_features,
    },
    'churn_preprocessing.pkl',
)

# Save the final model (kept as the last expression so the cell output is unchanged)
joblib.dump(best_model, 'churn_model.pkl')


Final selected features after RFE: ['MonthlyMinutes', 'ReceivedCalls', 'MonthsInService', 'CurrentEquipmentDays', 'RetentionCalls', 'ServiceArea_PHIBRI856', 'ServiceArea_SEAMTV360', 'HandsetPrice_Unknown', 'CreditRating_2-High', 'Occupation_Other']
Train Accuracy:  0.7116310310086197
Test Accuracy:  0.7104799216454456


['churn_model.pkl']

In [5]:
import joblib

# Load the trained model (the tuned Pipeline written by the training cell).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts you created yourself or otherwise trust.
model = joblib.load('churn_model.pkl')

# Use the model to make predictions
# The pipeline was fitted on X_train_rfe, i.e. on features that had already
# been one-hot encoded, standardized, reduced by SelectKBest and then reduced
# to 10 columns by RFE. input_data must go through those same fitted steps,
# in that order; raw rows from the CSV will not work.
# prediction = model.predict(input_data)
