In [206]:
import sklearn
from sklearn import tree
from sklearn import impute
from sklearn import preprocessing
from sklearn import model_selection
import seaborn as sns
import pandas as pd
import datasets
import numpy as np

In [207]:
# Load the "train" split of the churn-prediction dataset, using a local cache directory.
dataset = datasets.load_dataset(
    "scikit-learn/churn-prediction",
    split="train",
    cache_dir="../../.datasets",
)

In [208]:
# Convert the loaded dataset into a pandas DataFrame; later cells work on `df`.
df = dataset.to_pandas()
# Print the column index, then let the notebook render the frame as the cell output.
print(df.columns)
df

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [209]:
# Survey the string-typed columns: print each one's distinct values and how many there are.
for col in df.select_dtypes(include=["object"]).columns:
    distinct_values = df[col].unique()
    print(col, distinct_values, f"-- count {len(distinct_values)}")

customerID ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK'] -- count 7043
gender ['Female' 'Male'] -- count 2
Partner ['Yes' 'No'] -- count 2
Dependents ['No' 'Yes'] -- count 2
PhoneService ['No' 'Yes'] -- count 2
MultipleLines ['No phone service' 'No' 'Yes'] -- count 3
InternetService ['DSL' 'Fiber optic' 'No'] -- count 3
OnlineSecurity ['No' 'Yes' 'No internet service'] -- count 3
OnlineBackup ['Yes' 'No' 'No internet service'] -- count 3
DeviceProtection ['No' 'Yes' 'No internet service'] -- count 3
TechSupport ['No' 'Yes' 'No internet service'] -- count 3
StreamingTV ['No' 'Yes' 'No internet service'] -- count 3
StreamingMovies ['No' 'Yes' 'No internet service'] -- count 3
Contract ['Month-to-month' 'One year' 'Two year'] -- count 3
PaperlessBilling ['Yes' 'No'] -- count 2
PaymentMethod ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)'] -- count 4
TotalCharges ['29.85' '1889.5' '108.15' ... '346.45' '306.6'

In [210]:
# customerID is removed from the frame before any feature processing.
df = df.drop(columns=["customerID"])

# TotalCharges holds strings at this point; entries that are exactly one space
# are turned into NaN so that the column can be cast to float64.
df["TotalCharges"] = df["TotalCharges"].replace(" ", np.nan).astype("float64")

In [211]:
# Group the column names by dtype. The "Churn" label is removed from the
# categorical group so that only features remain in it.
numeric_columns = df.select_dtypes(include=["int64", "float32", "float64"]).columns
categoric_columns = df.select_dtypes(include=["object"]).columns.drop("Churn")

# Show both groups, one per line.
print(numeric_columns, categoric_columns, sep="\n")

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')
Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')


In [212]:
# NOTE(review): both imputers are fit on the whole frame, i.e. before the
# train/test split that happens further down in this notebook.

# Numeric features: missing entries are filled with the column mean.
imputer_numeric_columns = impute.SimpleImputer(strategy="mean")
df[numeric_columns] = imputer_numeric_columns.fit_transform(df[numeric_columns])

# Categorical features: missing entries are filled with the most frequent value.
imputer_categoric_columns = impute.SimpleImputer(strategy="most_frequent")
df[categoric_columns] = imputer_categoric_columns.fit_transform(df[categoric_columns])

In [213]:
# Standardise the numeric features in place.
scaler = preprocessing.StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# One-hot encode the categorical features, dropping the first category of each.
encoder = preprocessing.OneHotEncoder(drop="first")
encoded = encoder.fit_transform(df[categoric_columns])
encoded_columns = encoder.get_feature_names_out()
# The encoder output is densified and aligned on df's index so the concat below lines up row by row.
encoded_df = pd.DataFrame(encoded.toarray(), index=df.index, columns=encoded_columns)

# Swap the raw categorical columns for their encoded counterparts.
df = pd.concat([df.drop(columns=categoric_columns), encoded_df], axis=1)

# Turn the label into integers: "No" -> 0, "Yes" -> 1.
df["Churn"] = df["Churn"].map({"No": 0, "Yes": 1})

In [214]:
# Render the frame as the cell output to inspect the result of the preprocessing cells above.
df

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-0.439916,-1.277445,-1.160323,-0.994971,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,-0.439916,0.066327,-0.259629,-0.173876,0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-0.439916,-1.236724,-0.362660,-0.960399,1,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,-0.439916,0.514251,-0.746535,-0.195400,0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-0.439916,-1.236724,0.197365,-0.941193,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,-0.439916,-0.340876,0.665992,-0.129281,0,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
7039,-0.439916,1.613701,1.277533,2.242808,0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
7040,-0.439916,-0.870241,-1.168632,-0.855182,0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
7041,2.273159,-1.155283,0.320338,-0.872777,1,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [217]:
# Separate the features from the "Churn" label.
X = df.drop("Churn", axis=1)
y = df["Churn"]

# train_test_split returns the splits grouped per input array, i.e.
# (X_train, X_test, y_train, y_test). Unpacking them in any other order
# silently binds feature frames to label names and vice versa.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=55
)