# AI Model Training for Customer Churn Prediction
This notebook loads the Telco Customer Churn dataset from Hugging Face, preprocesses it, trains a logistic regression model, and saves the trained model as `churn_model.pkl`.

In [1]:
!pip install datasets scikit-learn pandas joblib

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (1

In [2]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import joblib
import numpy as np

In [5]:
# Fetch the churn dataset from the Hugging Face Hub and turn its 'train' split
# into a pandas DataFrame.
dataset = load_dataset("aai510-group1/telco-customer-churn")
df = dataset['train'].to_pandas()

# List every column the raw frame offers before narrowing it down.
print(df.columns.tolist())

# Column the model will predict.
target = 'Churn'

# Input columns for the model; the list order becomes the column order of the
# selected frame.
features = [
    'Age',
    'Monthly Charge',
    'Tenure in Months',
    'Internet Service',
    'Contract',
    'Payment Method',
    'Online Security',
    'Online Backup',
    'Device Protection Plan',
    'Streaming TV',
    'Streaming Movies',
    'Gender',
    'Partner',
    'Dependents',
    'Phone Service',
    'Paperless Billing',
]

# Keep only the chosen inputs plus the label, then drop rows with any missing value.
df = df[[*features, target]].dropna()

['Age', 'Avg Monthly GB Download', 'Avg Monthly Long Distance Charges', 'Churn', 'Churn Category', 'Churn Reason', 'Churn Score', 'City', 'CLTV', 'Contract', 'Country', 'Customer ID', 'Customer Status', 'Dependents', 'Device Protection Plan', 'Gender', 'Internet Service', 'Internet Type', 'Lat Long', 'Latitude', 'Longitude', 'Married', 'Monthly Charge', 'Multiple Lines', 'Number of Dependents', 'Number of Referrals', 'Offer', 'Online Backup', 'Online Security', 'Paperless Billing', 'Partner', 'Payment Method', 'Phone Service', 'Population', 'Premium Tech Support', 'Quarter', 'Referred a Friend', 'Satisfaction Score', 'Senior Citizen', 'State', 'Streaming Movies', 'Streaming Music', 'Streaming TV', 'Tenure in Months', 'Total Charges', 'Total Extra Data Charges', 'Total Long Distance Charges', 'Total Refunds', 'Total Revenue', 'Under 30', 'Unlimited Data', 'Zip Code']


In [6]:
# Label encode categorical features.
#
# Every object-dtype column in `df` is replaced in place by integer codes from
# its own LabelEncoder. The fitted encoders are kept in `encoders`, keyed by
# column name, so the string -> integer mapping used for training can be
# inspected (`encoders[col].classes_`) or reused to encode new data the same
# way. Previously each encoder was overwritten on the next loop iteration and
# the mapping was lost.
#
# NOTE: once this cell has run, `df` has no object columns left, so running it
# a second time without re-running the load cell resets `encoders` to {}.
encoders = {}
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [7]:
# Separate the label column from the predictor columns.
y = df[target]
X = df.drop(columns=target)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier, allowing up to 1000 solver iterations.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Report per-class precision / recall / F1 on the held-out rows.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.89      0.87       611
           1       0.68      0.61      0.65       234

    accuracy                           0.81       845
   macro avg       0.77      0.75      0.76       845
weighted avg       0.81      0.81      0.81       845



In [8]:
# Serialize the fitted classifier to disk in the current working directory.
joblib.dump(value=model, filename='churn_model.pkl')
print("✅ Model saved as churn_model.pkl")

✅ Model saved as churn_model.pkl


In [9]:
# Offer the saved model as a browser download when running in Google Colab.
# `google.colab` only exists inside Colab, so the import is guarded: elsewhere
# the cell reports where the file is instead of raising ImportError and
# breaking a full "Run All".
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; churn_model.pkl is in the current working directory.")
else:
    files.download('churn_model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>