In [450]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [451]:
# Load the raw churn dataset from a relative path.
df = pd.read_csv('data/Churn.csv')
# Show (rows, columns) as a quick sanity check of the load.
df.shape

(7043, 21)

In [452]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [453]:
# customerID is removed before any feature analysis or modelling.
df = df.drop(columns=['customerID'])

# TotalCharges arrives as text with blank entries, which are treated as a
# charge of 0. Every whitespace-only entry is normalised (not just the exact
# single-space string), and pd.to_numeric still raises on any other
# non-numeric text instead of hiding it.
df['TotalCharges'] = pd.to_numeric(
    df['TotalCharges'].replace(r'^\s*$', '0', regex=True)
).astype(float)

In [454]:
# Normalise column labels: lower-case them and replace spaces with underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [455]:
df.isnull().sum()

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [456]:
# Encode the target as 0/1 so it can be averaged and correlated numerically.
# Series.map yields NaN for any value that is not "No" or "Yes".
df['churn'] = df['churn'].map({"No": 0, "Yes": 1})

## EDA

`normalize=True` devuelve la proporción de cada clase (una fracción entre 0 y 1) en lugar del número de filas que tiene cada una.

In [457]:
df.churn.value_counts()

churn
0    5174
1    1869
Name: count, dtype: int64

In [458]:
df.churn.value_counts(normalize=True)

churn
0    0.73463
1    0.26537
Name: proportion, dtype: float64

## Feature importance: mutual information

In [459]:
def get_cat_cols(df, exclude=None):
    """Return the labels of the object-dtype columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    exclude : label or list of labels, optional
        Labels to remove from the result. A falsy value (``None``, empty
        list) removes nothing. The labels are passed to ``Index.drop``, so
        a label that is not among the selected columns raises ``KeyError``.

    Returns
    -------
    pandas.Index
        Labels of the selected columns, in frame order.
    """
    object_cols = df.select_dtypes(include=['object']).columns
    if not exclude:
        return object_cols
    return object_cols.drop(exclude)

def get_num_cols(df, exclude=None):
    """Return the labels of the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    exclude : label or list of labels, optional
        Labels to remove from the result. A falsy value (``None``, empty
        list) removes nothing. The labels are passed to ``Index.drop``, so
        a label that is not among the selected columns raises ``KeyError``.

    Returns
    -------
    pandas.Index
        Labels of the selected columns, in frame order.
    """
    numeric_cols = df.select_dtypes(include=['number']).columns
    return numeric_cols.drop(exclude) if exclude else numeric_cols

In [460]:
# This did not work when a very high-cardinality variable was included
# (it was the id column).
def calculate_mi(series):
    """Return the mutual information score between ``series`` and churn.

    The target is read from the notebook-level ``df`` (``df.churn``), so the
    function is meant to be passed to ``DataFrame.apply`` on columns of that
    same frame.
    """
    target = df.churn
    return mutual_info_score(series, target)

# Score every object-dtype column against churn. churn was already mapped to
# 0/1 above, so it is not among the categorical columns selected here.
mi = df[get_cat_cols(df)].apply(calculate_mi)
# Highest mutual information first.
mi.sort_values(ascending=False)

contract            0.098453
onlinesecurity      0.064677
techsupport         0.063021
internetservice     0.055574
onlinebackup        0.046792
paymentmethod       0.044519
deviceprotection    0.043917
streamingmovies     0.032001
streamingtv         0.031908
paperlessbilling    0.019194
dependents          0.014467
partner             0.011454
multiplelines       0.000801
phoneservice        0.000072
gender              0.000037
dtype: float64

## Feature importance: Correlation

In [461]:
# Correlation of each numeric column with churn. churn is itself numeric, so
# it is part of the selection and trivially correlates 1.0 with itself.
df[get_num_cols(df)].corrwith(df.churn)

seniorcitizen     0.150889
tenure           -0.352229
monthlycharges    0.193356
totalcharges     -0.198324
churn             1.000000
dtype: float64

Como `churn` está codificada como 0 y 1, su `mean()` es directamente la proporción de clientes que se dieron de baja dentro del grupo seleccionado. En este caso, el 58 % de las personas con 2 meses o menos de permanencia se dieron de baja, frente al 22 % de quienes llevan más de 2 meses.

In [462]:
df[df.tenure <= 2].churn.mean()

np.float64(0.5835266821345708)

In [463]:
df[df.tenure > 2].churn.mean()

np.float64(0.22099983821388125)

## Encoding

In [464]:
# The Yes/No columns all share one mapping to 1/0.
# Series.map yields NaN for any value that is not a key of the mapping.
yes_no_codes = {"No": 0, "Yes": 1}
for binary_col in ('partner', 'dependents', 'phoneservice', 'paperlessbilling'):
    df[binary_col] = df[binary_col].map(yes_no_codes)

In [465]:
# Every column that is still non-numeric is one-hot encoded.
columns_to_encoding = df.select_dtypes(exclude=['number']).columns

onehot = OneHotEncoder(sparse_output=False)
df_encoding = onehot.fit_transform(df[columns_to_encoding])
# pd.concat aligns rows on the index, so the encoded frame must carry df's
# own index; a fresh RangeIndex would only line up while df keeps the
# default 0..n-1 index.
df_encoding = pd.DataFrame(
    df_encoding,
    columns=onehot.get_feature_names_out(columns_to_encoding),
    index=df.index,
)
# Append the indicator columns, then remove the original text columns.
new_df = pd.concat([df, df_encoding], axis=1).drop(columns=columns_to_encoding)

In [466]:
new_df.shape

(7043, 42)

## Split

In [467]:
# Target vector, then the feature matrix (every column except the target).
y = new_df['churn'].copy()
X = new_df.drop(columns=['churn']).copy()

In [468]:
# 60% of the rows go to training; the remaining 40% is halved into
# validation and test. Both splits are stratified on the target and use the
# same seed.
x_train, x_rest, y_train, y_rest = train_test_split(
    X, y, train_size=0.6, stratify=y, random_state=42
)
x_valid, x_test, y_valid, y_test = train_test_split(
    x_rest, y_rest, test_size=0.5, stratify=y_rest, random_state=42
)

# Report the shape of each partition.
for label, features in (
    ("Tamaño del conjunto de entrenamiento:", x_train),
    ("Tamaño del conjunto de validación:", x_valid),
    ("Tamaño del conjunto de prueba:", x_test),
):
    print(label, features.shape)

Tamaño del conjunto de entrenamiento: (4225, 41)
Tamaño del conjunto de validación: (1409, 41)
Tamaño del conjunto de prueba: (1409, 41)


## Feature scaling

In [469]:
# Leave out the target and the binary columns so that only tenure,
# monthlycharges and totalcharges are summarised.
x_train[
    get_num_cols(
        df,
        exclude=[
            'churn',
            'partner',
            'dependents',
            'phoneservice',
            'seniorcitizen',
            'paperlessbilling',
        ],
    )
].describe()

Unnamed: 0,tenure,monthlycharges,totalcharges
count,4225.0,4225.0,4225.0
mean,32.44284,64.894509,2295.489408
std,24.629184,30.326002,2287.797684
min,0.0,18.4,0.0
25%,9.0,35.4,399.6
50%,29.0,70.55,1372.45
75%,55.0,90.1,3862.55
max,72.0,118.75,8684.8


In [470]:
scaler = MinMaxScaler()
scaled_cols = ['tenure', 'monthlycharges', 'totalcharges']

# The scaler is fitted on the training rows only; the validation rows are
# transformed with those same training statistics.
# NOTE(review): x_test is not transformed in this cell - confirm it goes
# through this fitted scaler before it is used for evaluation.
x_train[scaled_cols] = scaler.fit_transform(x_train[scaled_cols])
x_valid[scaled_cols] = scaler.transform(x_valid[scaled_cols])

# Check that the shape is still the same; apparently it is.
print(x_train.shape)

(4225, 41)


## Model

In [471]:
model = LogisticRegression()
model.fit(x_train, y_train)