In [70]:
import sklearn
from sklearn import tree
from sklearn import impute
from sklearn import preprocessing
import seaborn as sns
import pandas as pd
import datasets

In [71]:
dataset = datasets.load_dataset("scikit-learn/churn-prediction", split="train", cache_dir="../../.datasets")

In [72]:
df = dataset.to_pandas()
print(df.columns)
df

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [82]:
for col in df.select_dtypes(["object"]).columns:
  print(col, df[col].unique(), f"-- count {len(df[col].unique())}")

customerID ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '8361-LTMKD' '3186-AJIEK' nan] -- count 7044
gender ['Female' 'Male' nan] -- count 3
Partner ['Yes' 'No' nan] -- count 3
Dependents ['No' 'Yes' nan] -- count 3
PhoneService ['No' 'Yes' nan] -- count 3
MultipleLines ['No phone service' 'No' 'Yes' nan] -- count 4
InternetService ['DSL' 'Fiber optic' 'No' nan] -- count 4
OnlineSecurity ['No' 'Yes' 'No internet service' nan] -- count 4
OnlineBackup ['Yes' 'No' 'No internet service' nan] -- count 4
DeviceProtection ['No' 'Yes' 'No internet service' nan] -- count 4
TechSupport ['No' 'Yes' 'No internet service' nan] -- count 4
StreamingTV ['No' 'Yes' 'No internet service' nan] -- count 4
StreamingMovies ['No' 'Yes' 'No internet service' nan] -- count 4
Contract ['Month-to-month' 'One year' 'Two year' nan] -- count 4
PaperlessBilling ['Yes' 'No' nan] -- count 3
PaymentMethod ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)' nan] -- count 5
TotalCharg

In [73]:
numeric_columns = df.select_dtypes(["int64", "float32", "float64"]).columns
categoric_columns = df.select_dtypes(["object"]).columns

print(numeric_columns)
print(categoric_columns)

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges'], dtype='object')
Index(['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges',
       'Churn'],
      dtype='object')


In [74]:
imputer_numeric_columns = impute.SimpleImputer(strategy="mean")
imputer_categoric_columns = impute.SimpleImputer(strategy="most_frequent")

df[numeric_columns] = imputer_numeric_columns.fit_transform(df[numeric_columns])
df[categoric_columns] = imputer_categoric_columns.fit_transform(df[categoric_columns])

In [76]:
scaler = preprocessing.StandardScaler()
encoder = preprocessing.OneHotEncoder()

df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

encoded = encoder.fit_transform(df[categoric_columns])
encoded_columns = encoder.get_feature_names_out()
encoded_df = pd.DataFrame(data=encoded.toarray(), columns=encoded_columns, index=df.index)
df.drop(categoric_columns, axis=1)
df = pd.concat([df, encoded_df])

In [77]:
df.shape

(14086, 13638)