In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('dataset/churn.csv')
print(df.shape)
df.head()

# DATA ANALYSIS

In [None]:
df.info()

In [70]:
cat_cols = [col for col in df.columns if df[col].dtype == 'object']
num_cols = [col for col in df.columns if df[col].dtype != 'object']

print(f'{len(cat_cols)} Categorical Columns')
print(f'{len(num_cols)} Numerical Columns')

18 Categorical Columns
3 Numerical Columns


In [None]:
df.describe()

In [None]:
sns.set_theme(style="whitegrid")

plt.figure(figsize=(6, 6))
churn_counts = df['Churn'].value_counts()
plt.pie(churn_counts.values, labels=churn_counts.index, autopct='%1.1f%%', startangle=90, colors=sns.color_palette("Set2"))
plt.title("Distribuição de Churn (Yes/No)")
plt.axis('equal')
plt.show()

In [None]:
plt.figure(figsize=(15, 6))
plt.suptitle('Distribuição das Variáveis Numéricas')

plt.subplot(1, 2, 1)
sns.histplot(data=df, x='MonthlyCharges', hue='Churn', multiple='stack', bins=30, palette="Set2", kde=True)

plt.subplot(1, 2, 2)
sns.histplot(data=df, x='tenure', hue='Churn', multiple='stack', bins=30, palette="Set2", kde=True)
plt.show()


In [None]:
plt.figure(figsize=(15, 6))
plt.suptitle('Relações entre Variáveis Categóricas e Churn')

plt.subplot(1, 3, 1)
sns.countplot(data=df, x='Contract', hue='Churn', palette='Set2')
plt.xticks(rotation=45)
plt.title('Relação entre Contract e Churn')

# Relação entre Payment Method e Churn
plt.subplot(1, 3, 2)
sns.countplot(data=df, x='PaymentMethod', hue='Churn', palette='Set2')
plt.xticks(rotation=45)
plt.title('Relação entre Payment Method e Churn')

# Distribuição de MonthlyCharges com Churn
plt.subplot(1, 3, 3)
sns.boxplot(x='Churn', y='MonthlyCharges', data=df, palette='Set2')
plt.title('Distribuição de MonthlyCharges por Churn')
plt.show()


In [None]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
print(df['TotalCharges'].isnull().sum())
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

contract_mapping = {
    'Month-to-month': 0,
    'One year': 1,
    'Two year': 2
}
df['Contract'] = df['Contract'].map(contract_mapping)

churn_data_encoded = pd.get_dummies(df, drop_first=True)
churn_data_encoded.rename(columns={'Churn_Yes': 'Churn'}, inplace=True)
churn_data_encoded.head()

# AUTOGLUON

In [None]:
from autogluon.tabular import TabularPredictor

label = 'Churn'

churn_data_encoded = pd.get_dummies(df, drop_first=True)
churn_data_encoded.rename(columns={'Churn_Yes': 'Churn'}, inplace=True)

X = churn_data_encoded.sample(frac=0.8, random_state=1)
test_data = churn_data_encoded.drop(X.index)

# Treinar o modelo com o AutoGluon sem o preset 'best_quality'
predictor = TabularPredictor(label=label, eval_metric='accuracy', problem_type='binary', path='models')
predictor.delete_models(models_to_keep='best', dry_run=False)
predictor.save_space()

In [None]:
churn_data_encoded['Churn'].value_counts()

In [None]:
predictor.fit(churn_data_encoded)

In [79]:
performance = predictor.evaluate(test_data)
print(performance)

{'accuracy': 0.8466997870830376, 'balanced_accuracy': 0.7697317272773772, 'mcc': 0.589183282867853, 'roc_auc': 0.9111375377218556, 'f1': 0.6795252225519288, 'precision': 0.7789115646258503, 'recall': 0.6026315789473684}


# SAVE BEST MODEL

In [120]:
model = predictor.load('models')
model.predict(test_data)
model.evaluate(test_data)

{'accuracy': 0.8466997870830376,
 'balanced_accuracy': 0.7697317272773772,
 'mcc': 0.589183282867853,
 'roc_auc': 0.9111375377218556,
 'f1': 0.6795252225519288,
 'precision': 0.7789115646258503,
 'recall': 0.6026315789473684}