## Importacao de Bibliotecas

In [646]:
import pandas as pd
import pandas_profiling as pp
import numpy as np
import seaborn as sns
import plotly.express as px

from sklearn import datasets
from matplotlib import pyplot as plt


sns.set()
# ignorando os warnings
import warnings
warnings.filterwarnings('ignore')
# plotando o matplotl
%matplotlib inline
plt.rcParams["figure.figsize"] = (5,5)
pd.set_option('display.max_columns', None)

## Importando Dataset de Amostra de Clientes

In [647]:
csv = r'C:\Users\mauri.leite\Documents\GitHub\DH\Desafio03\Telecom - AMOSTRA CLIENTES.csv'
#csv = r'/Users/mauriguedes/Documents/GitHub/DH/Desafio03/Telecom - AMOSTRA CLIENTES.csv'
#csv = r'D:\Estudos\DataScience\DigitalHouse\Git\Desafio3\Telecom - AMOSTRA CLIENTES.csv'
dados = pd.read_csv( csv, encoding = 'latin1', 
                 engine='python', 
                 delimiter=';', 
                 decimal=",")

# Explorando os dados

In [648]:
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null object
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: int64(2), object(19)
memor

In [649]:
dados.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,"R$ 29,85","R$ 29,85",No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,"R$ 56,95","R$ 1.889,50",No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,"R$ 53,85","R$ 108,15",Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),"R$ 42,30","R$ 1.840,75",No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,"R$ 70,70","R$ 151,65",Yes


In [650]:
#verifica a existencia de dados duplicados
dados.duplicated().sum()

0

In [651]:
#fig, axs = plt.subplots(2)
fig = px.histogram(dados, x="Contract", color="Churn", width=800, height=300)
fig2 = px.histogram(dados, x="InternetService", color="Churn", width=800, height=300, facet_col="PhoneService")

fig.show()
fig2.show()


# Inferindo que nosso publico alvo sao os contratos Month-to-month

In [652]:
dados= dados.iloc[np.where((dados["Contract"]=="Month-to-month")& (dados["tenure"]<=12))]
dados.reset_index(inplace = True, drop = True) 
dados.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,"R$ 29,85","R$ 29,85",No
1,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,"R$ 53,85","R$ 108,15",Yes
2,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,"R$ 70,70","R$ 151,65",Yes
3,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,"R$ 99,65","R$ 820,50",Yes
4,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,"R$ 29,75","R$ 301,90",No


In [653]:
fig = px.histogram(dados, x="tenure", color="Churn", width=800, height=300)
fig2 = px.histogram(dados, x="tenure", color="Churn", width=800, height=300, facet_col="InternetService" )
fig3 = px.histogram(dados, x="InternetService", color="Churn", width=800, height=300, facet_col="StreamingTV" )
fig4 = px.histogram(dados, x="InternetService", color="Churn", width=800, height=300, facet_col="StreamingMovies" )
fig.show()
fig2.show()
fig3.show()
fig4.show()


# Transformação dos dados

### Criando um Enum para as colunas que contém dados alphanumericos


#### Coluna gender
0: feminino
1: masculino

In [654]:
dados["gender"].value_counts()

Male      1009
Female     985
Name: gender, dtype: int64

In [655]:
dados['gender'].loc[dados[dados['gender'] == 'Female'].index] = 0
dados['gender'].loc[dados[dados['gender'] == 'Male'].index] = 1
dados['gender'] = dados['gender'].astype('category')

#### Coluna SeniorCitizen


In [656]:
dados["SeniorCitizen"].value_counts()

0    1676
1     318
Name: SeniorCitizen, dtype: int64

In [657]:
dados["SeniorCitizen"] = dados["SeniorCitizen"].astype('category')

#### Coluna: InternetService
0: No / 
1: Fiber optic / 
2: DSL

In [658]:
dados['InternetService'].value_counts()

Fiber optic    916
DSL            690
No             388
Name: InternetService, dtype: int64

In [659]:
dados['InternetService'].loc[dados[dados['InternetService'] == 'No'].index] = 0
dados['InternetService'].loc[dados[dados['InternetService'] == 'Fiber optic'].index] = 1
dados['InternetService'].loc[dados[dados['InternetService'] == 'DSL'].index] = 2
dados['InternetService'] = dados['InternetService'].astype('category')

#### Coluna: Contract
1: Month-to-month
2: Two years
3: One year

In [660]:
dados['Contract'].value_counts()

Month-to-month    1994
Name: Contract, dtype: int64

In [661]:
dados['Contract'].loc[dados[dados['Contract'] == 'Month-to-month'].index] = 1
dados['Contract'].loc[dados[dados['Contract'] == 'Two year'].index] = 2
dados['Contract'].loc[dados[dados['Contract'] == 'One year'].index] = 3
dados['Contract'] = dados['Contract'].astype('category')

#### Coluna: PaymentMethod
1: Electronic check
2: Mailed check
3: Bank transfer (automatic)
4: Credit card (automatic)

In [662]:
dados["PaymentMethod"].value_counts()

Electronic check             954
Mailed check                 641
Bank transfer (automatic)    208
Credit card (automatic)      191
Name: PaymentMethod, dtype: int64

In [663]:
dados['PaymentMethod'].loc[dados[dados['PaymentMethod'] == 'Electronic check'].index] = 1
dados['PaymentMethod'].loc[dados[dados['PaymentMethod'] == 'Mailed check'].index] = 2
dados['PaymentMethod'].loc[dados[dados['PaymentMethod'] == 'Bank transfer (automatic)'].index] = 3
dados['PaymentMethod'].loc[dados[dados['PaymentMethod'] == 'Credit card (automatic)'].index] = 4
dados['PaymentMethod'] = dados['PaymentMethod'].astype('category')

### Transformando os dados monetários em float

In [664]:
dados['MonthlyCharges'] = dados['MonthlyCharges'].replace('[R$]', '', regex=True)
dados['MonthlyCharges'] = dados['MonthlyCharges'].replace('[,]', '.', regex=True)
dados['MonthlyCharges'] = dados['MonthlyCharges'].astype(float)

dados['TotalCharges'] = dados['TotalCharges'].replace('[R$]', '', regex=True)
dados['TotalCharges'] = dados['TotalCharges'].replace('[.]', '', regex=True)
dados['TotalCharges'] = dados['TotalCharges'].replace('[,]', '.', regex=True)
dados['TotalCharges'] = dados['TotalCharges'].str.strip()
#dados['TotalCharges'] = dados['TotalCharges'].astype(float)

#### Inserindo valores zerado onde tinhamos dados em branco
Ao tentar realizar a conversão do campo em float deu um erro e foi verificado a presença de dados em Branco
Primeiramente vamos colocar como 0 e depois se necessário analisar se será necessário realizar algum outro ajuste

In [665]:
dados['TotalCharges'].loc[dados[dados['TotalCharges'] == ''].index] = 0
dados['TotalCharges'] = dados['TotalCharges'].astype(float)

In [666]:
dados.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,Yes,No,1,No,No phone service,2,No,Yes,No,No,No,No,1,Yes,1,29.85,29.85,No
1,3668-QPYBK,1,0,No,No,2,Yes,No,2,Yes,Yes,No,No,No,No,1,Yes,2,53.85,108.15,Yes
2,9237-HQITU,0,0,No,No,2,Yes,No,1,No,No,No,No,No,No,1,Yes,1,70.7,151.65,Yes
3,9305-CDSKC,0,0,No,No,8,Yes,Yes,1,No,No,Yes,No,Yes,Yes,1,Yes,1,99.65,820.5,Yes
4,6713-OKOMC,0,0,No,No,10,No,No phone service,2,Yes,No,No,No,No,No,1,No,2,29.75,301.9,No


### Transformação do Dado Yes/No em Bolean 

- Alguns dados que possuem o 3 valor em sua categorização (ex.: "MultipleLines" [Yes, No, No phone service]), porem vejo que é uma variável dependete "PhoneService", portanto para o 3 valor iremos usar 0.

In [667]:
dados['Partner'].loc[dados[dados['Partner'] == 'Yes'].index] = 1
dados['Partner'].loc[dados[dados['Partner'] == 'No'].index] = 0
dados['Partner'] = dados['Partner'].astype('category')

dados['Dependents'].loc[dados[dados['Dependents'] == 'Yes'].index] = 1
dados['Dependents'].loc[dados[dados['Dependents'] == 'No'].index] = 0
dados['Dependents'] = dados['Dependents'].astype('category')

dados['PhoneService'].loc[dados[dados['PhoneService'] == 'Yes'].index] = 1
dados['PhoneService'].loc[dados[dados['PhoneService'] == 'No'].index] = 0
dados['PhoneService'] = dados['PhoneService'].astype('category')

dados['MultipleLines'].loc[dados[dados['MultipleLines'] == 'Yes'].index] = 1
dados['MultipleLines'].loc[dados[dados['MultipleLines'] == 'No'].index] = 0
dados['MultipleLines'].loc[dados[dados['MultipleLines'] == 'No phone service'].index] = 0
dados['MultipleLines'] = dados['MultipleLines'].astype('category')

dados['OnlineSecurity'].loc[dados[dados['OnlineSecurity'] == 'Yes'].index] = 1
dados['OnlineSecurity'].loc[dados[dados['OnlineSecurity'] == 'No'].index] = 0
dados['OnlineSecurity'].loc[dados[dados['OnlineSecurity'] == 'No internet service'].index] = 0
dados['OnlineSecurity'] = dados['OnlineSecurity'].astype('category')

dados['OnlineBackup'].loc[dados[dados['OnlineBackup'] == 'Yes'].index] = 1
dados['OnlineBackup'].loc[dados[dados['OnlineBackup'] == 'No'].index] = 0
dados['OnlineBackup'].loc[dados[dados['OnlineBackup'] == 'No internet service'].index] = 0
dados['OnlineBackup'] = dados['OnlineBackup'].astype('category')

dados['DeviceProtection'].loc[dados[dados['DeviceProtection'] == 'Yes'].index] = 1
dados['DeviceProtection'].loc[dados[dados['DeviceProtection'] == 'No'].index] = 0
dados['DeviceProtection'].loc[dados[dados['DeviceProtection'] == 'No internet service'].index] = 0
dados['DeviceProtection'] = dados['DeviceProtection'].astype('category')

dados['TechSupport'].loc[dados[dados['TechSupport'] == 'Yes'].index] = 1
dados['TechSupport'].loc[dados[dados['TechSupport'] == 'No'].index] = 0
dados['TechSupport'].loc[dados[dados['TechSupport'] == 'No internet service'].index] = 0
dados['TechSupport'] = dados['TechSupport'].astype('category')

dados['StreamingTV'].loc[dados[dados['StreamingTV'] == 'Yes'].index] = 1
dados['StreamingTV'].loc[dados[dados['StreamingTV'] == 'No'].index] = 0
dados['StreamingTV'].loc[dados[dados['StreamingTV'] == 'No internet service'].index] = 0
dados['StreamingTV'] = dados['StreamingTV'].astype('category')

dados['StreamingMovies'].loc[dados[dados['StreamingMovies'] == 'Yes'].index] = 1
dados['StreamingMovies'].loc[dados[dados['StreamingMovies'] == 'No'].index] = 0
dados['StreamingMovies'].loc[dados[dados['StreamingMovies'] == 'No internet service'].index] = 0
dados['StreamingMovies'] = dados['StreamingMovies'].astype('category')

dados['PaperlessBilling'].loc[dados[dados['PaperlessBilling'] == 'Yes'].index] = 1
dados['PaperlessBilling'].loc[dados[dados['PaperlessBilling'] == 'No'].index] = 0
dados['PaperlessBilling'] = dados['PaperlessBilling'].astype('category')


dados['Churn'].loc[dados[dados['Churn'] == 'Yes'].index] = 1
dados['Churn'].loc[dados[dados['Churn'] == 'No'].index] = 0
dados['Churn'] = dados['Churn'].astype('category')

### Criação de categorias de valores

- Categorizar as colunas de valores com base nos quartis (0-24; 25-49; 50-74; Above_75) 
- Criar dummies?

In [668]:
dados.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tenure,1994.0,4.474925,3.523503,1.0,1.0,3.0,7.0,12.0
MonthlyCharges,1994.0,58.217904,25.996257,18.75,34.5125,62.525,79.55,112.95
TotalCharges,1994.0,276.69343,275.268999,18.85,70.0125,168.15,418.375,1384.75


In [669]:
##tenure
dados['Cat_tenure'] = np.where( dados['tenure']< 13.00 , '0-12','')
dados['Cat_tenure'] = np.where((dados['tenure']>= 13.00)  & (dados['tenure']< 25.00) , '13-24',dados['Cat_tenure'])
dados['Cat_tenure'] = np.where( dados['tenure']>= 25.00 , 'Above-24',dados['Cat_tenure'])

##MonthlyCharges
dados['Cat_MonthlyCharges'] = np.where(dados['MonthlyCharges']< 35.50, '0-24','')
dados['Cat_MonthlyCharges'] = np.where((dados['MonthlyCharges']>=35.50) & (dados['MonthlyCharges']<70.35) , '25-49',dados['Cat_MonthlyCharges'])
dados['Cat_MonthlyCharges'] = np.where((dados['MonthlyCharges']>=70.35) & (dados['MonthlyCharges']<89.85) , '50-74',dados['Cat_MonthlyCharges'])
dados['Cat_MonthlyCharges'] = np.where(dados['MonthlyCharges']>=89.85, 'Above-75',dados['Cat_MonthlyCharges'])

##TotalCharges
dados['Cat_TotalCharges'] = np.where(dados['TotalCharges']< 398.55, '0-24','')
dados['Cat_TotalCharges'] = np.where((dados['TotalCharges']>=398.55) & (dados['TotalCharges']<1394.55) , '25-49',dados['Cat_TotalCharges'])
dados['Cat_TotalCharges'] = np.where((dados['TotalCharges']>=1394.55) & (dados['TotalCharges']<3786.60) , '50-74',dados['Cat_TotalCharges'])
dados['Cat_TotalCharges'] = np.where(dados['TotalCharges']>=3786.60, 'Above-75',dados['Cat_TotalCharges'])


dados.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Cat_tenure,Cat_MonthlyCharges,Cat_TotalCharges
0,7590-VHVEG,0,0,1,0,1,0,0,2,0,1,0,0,0,0,1,1,1,29.85,29.85,0,0-12,0-24,0-24
1,3668-QPYBK,1,0,0,0,2,1,0,2,1,1,0,0,0,0,1,1,2,53.85,108.15,1,0-12,25-49,0-24
2,9237-HQITU,0,0,0,0,2,1,0,1,0,0,0,0,0,0,1,1,1,70.7,151.65,1,0-12,50-74,0-24
3,9305-CDSKC,0,0,0,0,8,1,1,1,0,0,1,0,1,1,1,1,1,99.65,820.5,1,0-12,Above-75,25-49
4,6713-OKOMC,0,0,0,0,10,0,0,2,1,0,0,0,0,0,1,0,2,29.75,301.9,0,0-12,0-24,0-24


In [670]:
dados["Cat_tenure"].value_counts()

0-12    1994
Name: Cat_tenure, dtype: int64

### Criando os dummies das categorias criadas acima

In [671]:
dados = pd.concat([dados, pd.get_dummies(dados['InternetService'].astype(object), prefix='InternetService', drop_first=True)], axis=1)
dados = pd.concat([dados, pd.get_dummies(dados['PaymentMethod'].astype(object), prefix='PaymentMethod', drop_first=True)], axis=1)
dados = pd.concat([dados, pd.get_dummies(dados['Contract'].astype(object), prefix='Contract', drop_first=True)], axis=1)

dados = pd.concat([dados, pd.get_dummies(dados['Cat_tenure'].astype(object), prefix='Cat_tenure', drop_first=True)], axis=1)
dados = pd.concat([dados, pd.get_dummies(dados['Cat_MonthlyCharges'].astype(object), prefix='Cat_MonthlyCharges', drop_first=True)], axis=1)
dados = pd.concat([dados, pd.get_dummies(dados['Cat_TotalCharges'].astype(object), prefix='Cat_TotalCharges', drop_first=True)], axis=1)

#dados.head()

In [672]:
dados["Churn"].value_counts()

1    1024
0     970
Name: Churn, dtype: int64

## Aplicando StandardScaler, Compressão e Balanceamento dos dados

A teoria na aplicação desta tecnica será verificar se as features comprimidas explicarão melhor minha variavel target (Churn)


In [673]:
dados.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Cat_tenure,Cat_MonthlyCharges,Cat_TotalCharges,InternetService_1,InternetService_2,PaymentMethod_2,PaymentMethod_3,PaymentMethod_4,Cat_MonthlyCharges_25-49,Cat_MonthlyCharges_50-74,Cat_MonthlyCharges_Above-75,Cat_TotalCharges_25-49
0,7590-VHVEG,0,0,1,0,1,0,0,2,0,1,0,0,0,0,1,1,1,29.85,29.85,0,0-12,0-24,0-24,0,1,0,0,0,0,0,0,0
1,3668-QPYBK,1,0,0,0,2,1,0,2,1,1,0,0,0,0,1,1,2,53.85,108.15,1,0-12,25-49,0-24,0,1,1,0,0,1,0,0,0
2,9237-HQITU,0,0,0,0,2,1,0,1,0,0,0,0,0,0,1,1,1,70.7,151.65,1,0-12,50-74,0-24,1,0,0,0,0,0,1,0,0
3,9305-CDSKC,0,0,0,0,8,1,1,1,0,0,1,0,1,1,1,1,1,99.65,820.5,1,0-12,Above-75,25-49,1,0,0,0,0,0,0,1,1
4,6713-OKOMC,0,0,0,0,10,0,0,2,1,0,0,0,0,0,1,0,2,29.75,301.9,0,0-12,0-24,0-24,0,1,1,0,0,0,0,0,0


### Balanceando as informacoes

In [674]:
# definindo variáveis para cada uma das classes
Churn_0 = dados[dados.Churn==0]
Churn_1 = dados[dados.Churn==1]

In [675]:
# verificando o desbalanceamento
len(Churn_0),len(Churn_1)

(970, 1024)

In [676]:
# fazendo um undersampling da classe com output zero (em maior número)

Churn_1=Churn_1.sample(n=len(Churn_0))
len(Churn_0)

970

In [677]:
# concatenando os dois DataSets com o mesmo tamanho
dadosblc = pd.concat([Churn_0,Churn_1])
dadosblc.reset_index(inplace = True, drop = True) 
dadosblc.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Cat_tenure,Cat_MonthlyCharges,Cat_TotalCharges,InternetService_1,InternetService_2,PaymentMethod_2,PaymentMethod_3,PaymentMethod_4,Cat_MonthlyCharges_25-49,Cat_MonthlyCharges_50-74,Cat_MonthlyCharges_Above-75,Cat_TotalCharges_25-49
0,7590-VHVEG,0,0,1,0,1,0,0,2,0,1,0,0,0,0,1,1,1,29.85,29.85,0,0-12,0-24,0-24,0,1,0,0,0,0,0,0,0
1,6713-OKOMC,0,0,0,0,10,0,0,2,1,0,0,0,0,0,1,0,2,29.75,301.9,0,0-12,0-24,0-24,0,1,1,0,0,0,0,0,0
2,4929-XIHVW,1,1,1,0,2,1,0,1,0,0,1,0,1,1,1,1,4,95.5,181.65,0,0-12,Above-75,0-24,1,0,0,0,1,0,0,1,0
3,7310-EGVHZ,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,3,20.2,20.2,0,0-12,0-24,0-24,0,0,0,1,0,0,0,0,0
4,3413-BMNZE,1,1,0,0,1,1,0,2,0,0,0,0,0,0,1,0,3,45.25,45.25,0,0-12,25-49,0-24,0,1,0,1,0,1,0,0,0


In [678]:
dadosblc["Churn"].value_counts()

1    970
0    970
Name: Churn, dtype: int64

### Aplicando o StandardScaler
Tenho duvida com relação as features boleanas

In [679]:
dadosblc.shape

(1940, 33)

In [680]:
from sklearn.preprocessing import StandardScaler
features = [
            'gender', 
            'SeniorCitizen', 
            'Partner', 
            'Dependents', 
            'tenure', 
            'PhoneService', 'MultipleLines',
            #'InternetService',
            'OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies',
            #'Contract',
            'PaperlessBilling',
            #'PaymentMethod',
            'MonthlyCharges',
            #'TotalCharges',
    
            ##Categorias Criadas##
            #'Cat_tenure',
            #'Cat_MonthlyCharges',
            #'Cat_TotalCharges'
            
            ##Features Dummies##
            'InternetService_1','InternetService_2',
            'PaymentMethod_2','PaymentMethod_3','PaymentMethod_4'
            #'Contract_2','Contract_3',
    
            #'Cat_tenure_13-24','Cat_tenure_Above-24',
            #'Cat_MonthlyCharges_25-49','Cat_MonthlyCharges_50-74','Cat_MonthlyCharges_Above-75',
            #'Cat_TotalCharges_25-49','Cat_TotalCharges_50-74','Cat_TotalCharges_Above-75'
    
           ]

# Separating out the features
x = dadosblc.loc[:, features].values

# Separating out the target
y = dadosblc.loc[:, ["Churn"]].values

# Standardizing the features
#x = StandardScaler().fit_transform(x)

In [681]:
x.shape

(1940, 20)

In [682]:
pd.DataFrame(data=x, columns=features)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,MonthlyCharges,InternetService_1,InternetService_2,PaymentMethod_2,PaymentMethod_3,PaymentMethod_4
0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,29.85,0,1,0,0,0
1,0,0,0,0,10,0,0,1,0,0,0,0,0,0,29.75,0,1,1,0,0
2,1,1,1,0,2,1,0,0,0,1,0,1,1,1,95.5,1,0,0,0,1
3,1,0,0,0,1,1,0,0,0,0,0,0,0,0,20.2,0,0,0,1,0
4,1,1,0,0,1,1,0,0,0,0,0,0,0,0,45.25,0,1,0,1,0
5,1,0,0,0,2,1,0,0,1,0,0,0,0,0,49.25,0,1,1,0,0
6,0,0,0,0,3,1,0,0,1,0,1,1,1,1,75.3,0,1,0,0,0
7,1,0,0,0,10,1,0,1,0,1,0,0,0,1,79.85,1,0,1,0,0
8,1,0,0,0,1,1,0,0,0,0,1,0,0,0,49.05,0,1,1,0,0
9,0,0,0,1,3,1,0,1,0,0,1,0,1,1,64.5,0,1,0,1,0


In [683]:
y.shape

(1940, 1)

In [684]:
M = pd.DataFrame(data=y, columns=["Churn"])
M.Churn.value_counts()

1    970
0    970
Name: Churn, dtype: int64

### Aplicando PCA

In [685]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2,random_state=42)

principalComponents = pca.fit_transform(x)

principalDf = pd.DataFrame(data = principalComponents 
             , columns = ['A', 'B'])

In [686]:
principalDf.head()

Unnamed: 0,A,B
0,-28.244919,-2.810993
1,-28.146911,6.155043
2,37.433968,-3.383994
3,-37.891548,-2.601344
4,-12.859034,-3.200898


In [687]:
finalDf = pd.concat([principalDf, dadosblc['Churn']], axis = 1)
finalDf.head()

Unnamed: 0,A,B,Churn
0,-28.244919,-2.810993,0
1,-28.146911,6.155043,0
2,37.433968,-3.383994,0
3,-37.891548,-2.601344,0
4,-12.859034,-3.200898,0


In [688]:
finalDf.isnull().sum()

A        0
B        0
Churn    0
dtype: int64

# Avaliando os Modelos a serem aplicados

In [689]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
%matplotlib inline

In [690]:
# Separating out the features
#X = finalDf.loc[:, ["A", "B"]].values
X = x
# Separating out the target
Y = finalDf.loc[:,['Churn']].values

In [691]:
X.shape

(1940, 20)

In [692]:
Y.shape

(1940, 1)

In [693]:
# separando os dados em treino e teste
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, Y, test_size=0.25, random_state=42, stratify=Y)

In [694]:
# %load solutions/solution_13.py

# importando as bibliotecas dos modelos
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# definindo o tamanho da figura para o gráfico
plt.figure(figsize=(12,8)) 
# criando uma lista com todos os modelos
classifiers =[
    KNeighborsClassifier(3),
    GaussianNB(),
    LogisticRegression(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    ]

# criando uma funçào para rodas o pipeline 
for clf in classifiers:
    # ajustando o modelo
    clf.fit(X_train, y_train)
    
    # armazenando o nome do modelo
    name = clf.__class__.__name__
    # imprimindo o nome do modelo
    print("="*30)
    print(name)
    # imprimindo os resultados
    print('****Results****')
    # fazendo predições
    # calculando as métricas
    y_pred = clf.predict(X_test)
    # imprimindo as métricas
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("Precision:", metrics.precision_score(y_test, y_pred))
    print("Recall:", metrics.recall_score(y_test, y_pred))

    #Printando a Matriz de Confusao
    #print("*"*30)
    #print(pd.crosstab(y_test, y_pred,rownames=['Real'], colnames=['Predito'], margins=True))
    

    # plotando a curva ROC
    #y_pred_proba = clf.predict_proba(X_test)[::,1]
    #fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
    #auc = metrics.roc_auc_score(y_test, y_pred_proba)
    #plt.plot(fpr,tpr,label=name+", auc="+str(auc))
    #plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
    #plt.legend(loc=4)

KNeighborsClassifier
****Results****
Accuracy: 0.6268041237113402
Precision: 0.630901287553648
Recall: 0.6074380165289256
GaussianNB
****Results****
Accuracy: 0.688659793814433
Precision: 0.7116279069767442
Recall: 0.6322314049586777
LogisticRegression
****Results****
Accuracy: 0.7195876288659794
Precision: 0.7208333333333333
Recall: 0.7148760330578512
SVC
****Results****
Accuracy: 0.6804123711340206
Precision: 0.6761133603238867
Recall: 0.6900826446280992
DecisionTreeClassifier
****Results****
Accuracy: 0.6309278350515464
Precision: 0.64
Recall: 0.5950413223140496
RandomForestClassifier
****Results****
Accuracy: 0.6391752577319587
Precision: 0.6558139534883721
Recall: 0.5826446280991735
GradientBoostingClassifier
****Results****
Accuracy: 0.7216494845360825
Precision: 0.7238493723849372
Recall: 0.7148760330578512


<Figure size 864x576 with 0 Axes>