In [1]:
import pandas as pd 
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import recall_score



### Carga de dados

In [2]:
df_churn = pd.read_csv('./datasets/data.csv')

In [3]:
df_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   IDCliente         7032 non-null   object 
 1   Genero            7032 non-null   object 
 2   Mais65anos        7032 non-null   int64  
 3   TemParceiro       7032 non-null   object 
 4   TemDependentes    7032 non-null   object 
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  tenure            7032 non-null   int64  
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


In [4]:
#contar clientes cusando a variavel churn como referencia
df_churn.Churn.value_counts()

Churn
No     5163
Yes    1869
Name: count, dtype: int64

In [5]:
#distribuição percentual
df_churn.Churn.value_counts(normalize=True)

Churn
No     0.734215
Yes    0.265785
Name: proportion, dtype: float64

### Preparação da base para algoritmo lof

In [8]:
#selecionar variaveis 
X = df_churn.drop(columns=['IDCliente', 'Churn'])
y = df_churn['Churn']

In [9]:
#definir uma função apra transformar yes em 1 e no em 0
def binary_transformer_function(X):
    return X.map(lambda x: 1 if x =='Yes' else 0)



In [10]:
#transformações
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = ['Genero', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod' ]
binary_features = ['TemParceiro', 'TemDependentes', 'TechSupport', 'PhoneService', 'PaperlessBilling']
no_transformation_features = ['Mais65anos']

numeric_trans = StandardScaler()
categorical_trans = OneHotEncoder()
binary_trans = FunctionTransformer(binary_transformer_function)


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_trans, numeric_features),
        ('cat', categorical_trans, categorical_features),
        ('bin', binary_trans, binary_features),
        ('pass', 'passthrough', no_transformation_features)
    ]
)

X_transformed = preprocessor.fit_transform(X)


In [12]:
X_transformed.shape

(7032, 39)

### Treinar algoritmo LOF

In [13]:
#instanciar objeto lof
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.26)

In [14]:
#treinar algoritmo e gerar classificações de anomalia

y_pred = lof.fit_predict(X_transformed)


In [15]:
#mostrar valores preditos anomalia ou nao anomalia
#no sklearn o predict gera um valor = -1 anomalia e valor = 1 pontos normais
y_pred

array([ 1,  1,  1, ...,  1, -1,  1])

In [17]:
#mostrar LOF calculado para cada ponto de dados
#no sklearn o lof calculado fica na propriedade negative_outilier_factor
#o negative é o inverso do lof, quanto menor mais anormal

-lof.negative_outlier_factor_

array([1.0238333 , 1.03547225, 1.02610568, ..., 1.07053634, 1.19840027,
       1.08901757])

### Apresentar resultados

In [19]:
import numpy as np
#identificar anomalias
outliers = y_pred == -1
inliers = y_pred == 1

#contar anomalias
num_outliers = np.sum(outliers)
num_inliers = np.sum(inliers)

num_inliers

5203

In [20]:
num_outliers

1829

In [21]:
#converter y para a mesma base do y_pred
y_true = y.map(lambda x: -1 if x =='Yes' else 1)



In [22]:
#calcular o erro com base no valor de y
#usar o recall pois o objetivo é maximizar o verdadeiro positivo
recall_score(y_true, y_pred)

0.7515010652721286