# Credit Card Cancellation Analysis
## Identifying Customer Churn Reasons from Data

In [1]:
import pandas as pd
import plotly.express as px
import os

In [2]:
# Dataset location: directory first, then file name, then the joined path
# that the loading cell reads from.
DATASET_DIR = '../dataset'
DATA_FILE = 'data-bank.csv'
FILE_PATH = os.path.join(DATASET_DIR, DATA_FILE)

### Load and clean the customer data for analysis

In [3]:
def load_and_clean_data(file_path, drop_columns=("CLIENTNUM", "Sexo"), encoding="latin1"):
    """Load the customer CSV and return a cleaned DataFrame.

    Cleaning consists of two steps: removing the columns named in
    ``drop_columns`` and then discarding every row that still contains a
    missing value. The original row labels are kept (the index is not reset).

    Args:
        file_path: Path of the CSV file to read.
        drop_columns: Column names to remove before analysis. Defaults to
            the two columns this notebook has always dropped.
        encoding: Text encoding passed to ``pd.read_csv``.

    Returns:
        A new DataFrame without the dropped columns and without rows that
        contain missing values.

    Raises:
        KeyError: If a name in ``drop_columns`` is not a column of the file
            (pandas' default behaviour for ``DataFrame.drop``).
    """
    raw = pd.read_csv(file_path, encoding=encoding)
    # list(...) so any iterable of names (tuple, list, Index) is accepted.
    trimmed = raw.drop(columns=list(drop_columns))
    return trimmed.dropna()

# Build the cleaned frame from the configured CSV path, then preview its
# first rows as the cell output.
df = load_and_clean_data(FILE_PATH)
df.head()

Unnamed: 0,Categoria,Idade,Dependentes,Educação,Estado Civil,Faixa Salarial Anual,Categoria Cartão,Meses como Cliente,Produtos Contratados,Inatividade 12m,Contatos 12m,Limite,Limite Consumido,Limite Disponível,Mudanças Transacoes_Q4_Q1,Valor Transacoes 12m,Qtde Transacoes 12m,Mudança Qtde Transações_Q4_Q1,Taxa de Utilização Cartão
0,Cliente,45,3,Ensino Médio,Casado,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1335.0,1144,42,1625.0,0.061
1,Cliente,49,5,Ensino Superior,Solteiro,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1541.0,1291,33,3714.0,0.105
2,Cliente,51,3,Ensino Superior,Casado,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2594.0,1887,20,2333.0,0.0
3,Cliente,40,4,Ensino Médio,Não informado,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1405.0,1171,20,2333.0,0.76
4,Cliente,40,3,Sem ensino formal,Casado,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2175.0,816,28,2.5,0.0


### Display information and statistics about the dataset

In [4]:
def show_data_overview(df):
    """Print a plain-text overview of ``df`` to standard output.

    Four sections are printed, each followed by a rule of 100 dashes:
    the ``df.info()`` summary, the printed representation of the frame,
    ``df.describe()`` rounded to one decimal place, and the list of
    column names.

    Side effect: sets the pandas option ``display.max_columns`` to ``None``
    globally; the setting stays in effect after this function returns.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None.
    """
    separator = "-" * 100

    print("DataFrame Info")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() used to emit a stray "None" line after the report.
    df.info()
    print(separator)
    # Show every column when the frame is printed below (global pandas option).
    pd.set_option('display.max_columns', None)
    print("Full Cleaned DataFrame")
    print(df)
    print(separator)
    print("Descriptive Statistics")
    print(df.describe().round(1))
    print(separator)
    print("Remaining Columns")
    print(list(df.columns.values))
    print(separator)

# Print the overview for the cleaned frame `df`.
show_data_overview(df)

DataFrame Info
<class 'pandas.core.frame.DataFrame'>
Index: 10126 entries, 0 to 10126
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Categoria                      10126 non-null  object 
 1   Idade                          10126 non-null  int64  
 2   Dependentes                    10126 non-null  int64  
 3   Educação                       10126 non-null  object 
 4   Estado Civil                   10126 non-null  object 
 5   Faixa Salarial Anual           10126 non-null  object 
 6   Categoria Cartão               10126 non-null  object 
 7   Meses como Cliente             10126 non-null  int64  
 8   Produtos Contratados           10126 non-null  int64  
 9   Inatividade 12m                10126 non-null  int64  
 10  Contatos 12m                   10126 non-null  int64  
 11  Limite                         10126 non-null  float64
 12  Limite Consumido               10126

### Analyze the distribution of customer churn (cancellation)

In [5]:
def analyze_cancellation_distribution(df):
    """Print how the values of the "Categoria" column are distributed.

    Output, in order: a heading, the absolute count per category, the
    share per category rounded to two decimals, and a rule of 100 dashes.

    Args:
        df: DataFrame that contains a "Categoria" column.

    Returns:
        None.
    """
    status = df["Categoria"]
    absolute = status.value_counts()
    relative = status.value_counts(normalize=True).round(2)
    # One print call with a newline separator yields the same text as
    # printing each item on its own.
    print("Customer Churn Distribution", absolute, relative, "-" * 100, sep="\n")

# Print the "Categoria" counts and proportions for the cleaned frame `df`.
analyze_cancellation_distribution(df)

Customer Churn Distribution
Categoria
Cliente      8499
Cancelado    1627
Name: count, dtype: int64
Categoria
Cliente      0.84
Cancelado    0.16
Name: proportion, dtype: float64
----------------------------------------------------------------------------------------------------


### Generate and display histograms for each column in the dataset

In [6]:
def plot_histograms(df):
    """Show one Plotly histogram per column of ``df``.

    Every column (including "Categoria" itself) is used in turn as the
    x axis, with bars coloured by the "Categoria" column using Plotly's
    qualitative D3 colour sequence.

    Args:
        df: DataFrame that contains a "Categoria" column.

    Returns:
        None. Each figure is displayed via ``Figure.show()``.
    """
    palette = px.colors.qualitative.D3
    for col_name in df.columns:
        px.histogram(
            df,
            x=col_name,
            color="Categoria",
            color_discrete_sequence=palette,
        ).show()

# Render a histogram for each column of the cleaned frame `df`.
plot_histograms(df)