## Importacao de Bibliotecas

In [51]:
import pandas as pd
import pandas_profiling as pp
import numpy as np
import seaborn as sns
import plotly.express as px

from sklearn import datasets
from matplotlib import pyplot as plt


sns.set()
# ignorando os warnings
import warnings
warnings.filterwarnings('ignore')
# plotando o matplotl
%matplotlib inline
plt.rcParams["figure.figsize"] = (5,5)
pd.set_option('display.max_columns', None)

## Importando Dataset de Amostra de Clientes

In [52]:
#csv = r'C:\Users\mauri.leite\Documents\GitHub\DH\Desafio03\Telecom - AMOSTRA CLIENTES.csv'
#csv = r'/Users/mauriguedes/Documents/GitHub/DH/Desafio03/Telecom - AMOSTRA CLIENTES.csv'
csv = r'D:\Estudos\DataScience\DigitalHouse\Git\Desafio3\Telecom - AMOSTRA CLIENTES.csv'
dados = pd.read_csv( csv, encoding = 'latin1', 
                 engine='python', 
                 delimiter=';', 
                 decimal=",")

# Explorando os dados

In [53]:
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null object
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: int64(2), object(19)
memor

In [54]:
dados.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,"R$ 29,85","R$ 29,85",No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,"R$ 56,95","R$ 1.889,50",No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,"R$ 53,85","R$ 108,15",Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),"R$ 42,30","R$ 1.840,75",No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,"R$ 70,70","R$ 151,65",Yes


In [55]:
#verifica a existencia de dados duplicados
dados.duplicated().sum()

0

In [56]:
#fig, axs = plt.subplots(2)
fig = px.histogram(dados, x="Contract", color="Churn", width=800, height=300)
fig2 = px.histogram(dados, x="InternetService", color="Churn", width=800, height=300, facet_col="PhoneService")

fig.show()
fig2.show()


# Inferindo que nosso publico alvo sao os contratos Month-to-month

In [57]:
#dados= dados.iloc[np.where(dados["Contract"]=="Month-to-month")]
#dados.reset_index(inplace = True, drop = True) 
#dados.head()

# Transformação dos dados

### Criando um Enum para as colunas que contém dados alphanumericos


#### Coluna gender
0: feminino
1: masculino

In [58]:
dados["gender"].value_counts()

Male      3555
Female    3488
Name: gender, dtype: int64

In [59]:
dados['gender'].loc[dados[dados['gender'] == 'Female'].index] = 0
dados['gender'].loc[dados[dados['gender'] == 'Male'].index] = 1
dados['gender'] = dados['gender'].astype('category')

#### Coluna SeniorCitizen


In [60]:
dados["SeniorCitizen"].value_counts()

0    5901
1    1142
Name: SeniorCitizen, dtype: int64

In [61]:
dados["SeniorCitizen"] = dados["SeniorCitizen"].astype('category')

#### Coluna: InternetService
0: No / 
1: Fiber optic / 
2: DSL

In [62]:
dados['InternetService'].value_counts()

Fiber optic    3096
DSL            2421
No             1526
Name: InternetService, dtype: int64

In [63]:
dados['InternetService'].loc[dados[dados['InternetService'] == 'No'].index] = 0
dados['InternetService'].loc[dados[dados['InternetService'] == 'Fiber optic'].index] = 1
dados['InternetService'].loc[dados[dados['InternetService'] == 'DSL'].index] = 2
dados['InternetService'] = dados['InternetService'].astype('category')

#### Coluna: Contract
1: Month-to-month
2: Two years
3: One year

In [64]:
dados['Contract'].value_counts()

Month-to-month    3875
Two year          1695
One year          1473
Name: Contract, dtype: int64

In [65]:
dados['Contract'].loc[dados[dados['Contract'] == 'Month-to-month'].index] = 1
dados['Contract'].loc[dados[dados['Contract'] == 'Two year'].index] = 2
dados['Contract'].loc[dados[dados['Contract'] == 'One year'].index] = 3
dados['Contract'] = dados['Contract'].astype('category')

#### Coluna: PaymentMethod
1: Electronic check
2: Mailed check
3: Bank transfer (automatic)
4: Credit card (automatic)

In [66]:
dados["PaymentMethod"].value_counts()

Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: PaymentMethod, dtype: int64

In [67]:
dados['PaymentMethod'].loc[dados[dados['PaymentMethod'] == 'Electronic check'].index] = 1
dados['PaymentMethod'].loc[dados[dados['PaymentMethod'] == 'Mailed check'].index] = 2
dados['PaymentMethod'].loc[dados[dados['PaymentMethod'] == 'Bank transfer (automatic)'].index] = 3
dados['PaymentMethod'].loc[dados[dados['PaymentMethod'] == 'Credit card (automatic)'].index] = 4
dados['PaymentMethod'] = dados['PaymentMethod'].astype('category')

### Transformando os dados monetários em float

In [68]:
dados['MonthlyCharges'] = dados['MonthlyCharges'].replace('[R$]', '', regex=True)
dados['MonthlyCharges'] = dados['MonthlyCharges'].replace('[,]', '.', regex=True)
dados['MonthlyCharges'] = dados['MonthlyCharges'].astype(float)

dados['TotalCharges'] = dados['TotalCharges'].replace('[R$]', '', regex=True)
dados['TotalCharges'] = dados['TotalCharges'].replace('[.]', '', regex=True)
dados['TotalCharges'] = dados['TotalCharges'].replace('[,]', '.', regex=True)
dados['TotalCharges'] = dados['TotalCharges'].str.strip()
#dados['TotalCharges'] = dados['TotalCharges'].astype(float)

In [69]:
dados.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,Yes,No,1,No,No phone service,2,No,Yes,No,No,No,No,1,Yes,1,29.85,29.85,No
1,5575-GNVDE,1,0,No,No,34,Yes,No,2,Yes,No,Yes,No,No,No,3,No,2,56.95,1889.5,No
2,3668-QPYBK,1,0,No,No,2,Yes,No,2,Yes,Yes,No,No,No,No,1,Yes,2,53.85,108.15,Yes
3,7795-CFOCW,1,0,No,No,45,No,No phone service,2,Yes,No,Yes,Yes,No,No,3,No,3,42.3,1840.75,No
4,9237-HQITU,0,0,No,No,2,Yes,No,1,No,No,No,No,No,No,1,Yes,1,70.7,151.65,Yes


#### Inserindo valores zerado onde tinhamos dados em branco
Ao tentar realizar a conversão do campo em float deu um erro e foi verificado a presença de dados em Branco
Primeiramente vamos colocar como 0 e depois se necessário analisar se será necessário realizar algum outro ajuste

In [70]:
dados['TotalCharges'].loc[dados[dados['TotalCharges'] == ''].index] = 0
dados['TotalCharges'] = dados['TotalCharges'].astype(float)

### Transformação do Dado Yes/No em Bolean 

- Alguns dados que possuem o 3 valor em sua categorização (ex.: "MultipleLines" [Yes, No, No phone service]), porem vejo que é uma variável dependete "PhoneService", portanto para o 3 valor iremos usar 0.

In [71]:
dados['Partner'].loc[dados[dados['Partner'] == 'Yes'].index] = 1
dados['Partner'].loc[dados[dados['Partner'] == 'No'].index] = 0
dados['Partner'] = dados['Partner'].astype('category')

dados['Dependents'].loc[dados[dados['Dependents'] == 'Yes'].index] = 1
dados['Dependents'].loc[dados[dados['Dependents'] == 'No'].index] = 0
dados['Dependents'] = dados['Dependents'].astype('category')

dados['PhoneService'].loc[dados[dados['PhoneService'] == 'Yes'].index] = 1
dados['PhoneService'].loc[dados[dados['PhoneService'] == 'No'].index] = 0
dados['PhoneService'] = dados['PhoneService'].astype('category')

dados['MultipleLines'].loc[dados[dados['MultipleLines'] == 'Yes'].index] = 1
dados['MultipleLines'].loc[dados[dados['MultipleLines'] == 'No'].index] = 0
dados['MultipleLines'].loc[dados[dados['MultipleLines'] == 'No phone service'].index] = 0
dados['MultipleLines'] = dados['MultipleLines'].astype('category')

dados['OnlineSecurity'].loc[dados[dados['OnlineSecurity'] == 'Yes'].index] = 1
dados['OnlineSecurity'].loc[dados[dados['OnlineSecurity'] == 'No'].index] = 0
dados['OnlineSecurity'].loc[dados[dados['OnlineSecurity'] == 'No internet service'].index] = 0
dados['OnlineSecurity'] = dados['OnlineSecurity'].astype('category')

dados['OnlineBackup'].loc[dados[dados['OnlineBackup'] == 'Yes'].index] = 1
dados['OnlineBackup'].loc[dados[dados['OnlineBackup'] == 'No'].index] = 0
dados['OnlineBackup'].loc[dados[dados['OnlineBackup'] == 'No internet service'].index] = 0
dados['OnlineBackup'] = dados['OnlineBackup'].astype('category')

dados['DeviceProtection'].loc[dados[dados['DeviceProtection'] == 'Yes'].index] = 1
dados['DeviceProtection'].loc[dados[dados['DeviceProtection'] == 'No'].index] = 0
dados['DeviceProtection'].loc[dados[dados['DeviceProtection'] == 'No internet service'].index] = 0
dados['DeviceProtection'] = dados['DeviceProtection'].astype('category')

dados['TechSupport'].loc[dados[dados['TechSupport'] == 'Yes'].index] = 1
dados['TechSupport'].loc[dados[dados['TechSupport'] == 'No'].index] = 0
dados['TechSupport'].loc[dados[dados['TechSupport'] == 'No internet service'].index] = 0
dados['TechSupport'] = dados['TechSupport'].astype('category')

dados['StreamingTV'].loc[dados[dados['StreamingTV'] == 'Yes'].index] = 1
dados['StreamingTV'].loc[dados[dados['StreamingTV'] == 'No'].index] = 0
dados['StreamingTV'].loc[dados[dados['StreamingTV'] == 'No internet service'].index] = 0
dados['StreamingTV'] = dados['StreamingTV'].astype('category')

dados['StreamingMovies'].loc[dados[dados['StreamingMovies'] == 'Yes'].index] = 1
dados['StreamingMovies'].loc[dados[dados['StreamingMovies'] == 'No'].index] = 0
dados['StreamingMovies'].loc[dados[dados['StreamingMovies'] == 'No internet service'].index] = 0
dados['StreamingMovies'] = dados['StreamingMovies'].astype('category')

dados['PaperlessBilling'].loc[dados[dados['PaperlessBilling'] == 'Yes'].index] = 1
dados['PaperlessBilling'].loc[dados[dados['PaperlessBilling'] == 'No'].index] = 0
dados['PaperlessBilling'] = dados['PaperlessBilling'].astype('category')


dados['Churn'].loc[dados[dados['Churn'] == 'Yes'].index] = 1
dados['Churn'].loc[dados[dados['Churn'] == 'No'].index] = 0
dados['Churn'] = dados['Churn'].astype('category')

### Criação de categorias de valores

- Categorizar as colunas de valores com base nos quartis (0-24; 25-49; 50-74; Above_75) 
- Criar dummies?

In [72]:
dados.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tenure,7043.0,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
MonthlyCharges,7043.0,64.761692,30.090047,18.25,35.5,70.35,89.85,118.75
TotalCharges,7043.0,2279.734304,2266.79447,0.0,398.55,1394.55,3786.6,8684.8


In [73]:
##tenure
dados['Cat_tenure'] = np.where( dados['tenure']< 9.00 , '0-24','')
dados['Cat_tenure'] = np.where((dados['tenure']>=9.00)  & (dados['tenure']<29.00) , '25-49',dados['Cat_tenure'])
dados['Cat_tenure'] = np.where((dados['tenure']>=29.00) & (dados['tenure']<55.00) , '50-74',dados['Cat_tenure'])
dados['Cat_tenure'] = np.where( dados['tenure']>=55.00 , 'Above-75',dados['Cat_tenure'])

##MonthlyCharges
dados['Cat_MonthlyCharges'] = np.where(dados['MonthlyCharges']< 35.50, '0-24','')
dados['Cat_MonthlyCharges'] = np.where((dados['MonthlyCharges']>=35.50) & (dados['MonthlyCharges']<70.35) , '25-49',dados['Cat_MonthlyCharges'])
dados['Cat_MonthlyCharges'] = np.where((dados['MonthlyCharges']>=70.35) & (dados['MonthlyCharges']<89.85) , '50-74',dados['Cat_MonthlyCharges'])
dados['Cat_MonthlyCharges'] = np.where(dados['MonthlyCharges']>=89.85, 'Above-75',dados['Cat_MonthlyCharges'])

##TotalCharges
dados['Cat_TotalCharges'] = np.where(dados['TotalCharges']< 398.55, '0-24','')
dados['Cat_TotalCharges'] = np.where((dados['TotalCharges']>=398.55) & (dados['TotalCharges']<1394.55) , '25-49',dados['Cat_TotalCharges'])
dados['Cat_TotalCharges'] = np.where((dados['TotalCharges']>=1394.55) & (dados['TotalCharges']<3786.60) , '50-74',dados['Cat_TotalCharges'])
dados['Cat_TotalCharges'] = np.where(dados['TotalCharges']>=3786.60, 'Above-75',dados['Cat_TotalCharges'])


dados.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Cat_tenure,Cat_MonthlyCharges,Cat_TotalCharges
0,7590-VHVEG,0,0,1,0,1,0,0,2,0,1,0,0,0,0,1,1,1,29.85,29.85,0,0-24,0-24,0-24
1,5575-GNVDE,1,0,0,0,34,1,0,2,1,0,1,0,0,0,3,0,2,56.95,1889.5,0,50-74,25-49,50-74
2,3668-QPYBK,1,0,0,0,2,1,0,2,1,1,0,0,0,0,1,1,2,53.85,108.15,1,0-24,25-49,0-24
3,7795-CFOCW,1,0,0,0,45,0,0,2,1,0,1,1,0,0,3,0,3,42.3,1840.75,0,50-74,25-49,50-74
4,9237-HQITU,0,0,0,0,2,1,0,1,0,0,0,0,0,0,1,1,1,70.7,151.65,1,0-24,50-74,0-24


In [74]:
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
customerID            7043 non-null object
gender                7043 non-null category
SeniorCitizen         7043 non-null category
Partner               7043 non-null category
Dependents            7043 non-null category
tenure                7043 non-null int64
PhoneService          7043 non-null category
MultipleLines         7043 non-null category
InternetService       7043 non-null category
OnlineSecurity        7043 non-null category
OnlineBackup          7043 non-null category
DeviceProtection      7043 non-null category
TechSupport           7043 non-null category
StreamingTV           7043 non-null category
StreamingMovies       7043 non-null category
Contract              7043 non-null category
PaperlessBilling      7043 non-null category
PaymentMethod         7043 non-null category
MonthlyCharges        7043 non-null float64
TotalCharges          7043 non-null float64


### Criando os dummies das categorias criadas acima

In [75]:
dados = pd.concat([dados, pd.get_dummies(dados['Cat_tenure'].astype(object), prefix='Cat_tenure', drop_first=True)], axis=1)
dados = pd.concat([dados, pd.get_dummies(dados['InternetService'].astype(object), prefix='InternetService', drop_first=True)], axis=1)
dados = pd.concat([dados, pd.get_dummies(dados['PaymentMethod'].astype(object), prefix='PaymentMethod', drop_first=True)], axis=1)
dados = pd.concat([dados, pd.get_dummies(dados['Contract'].astype(object), prefix='Contract', drop_first=True)], axis=1)

dados = pd.concat([dados, pd.get_dummies(dados['Cat_MonthlyCharges'].astype(object), prefix='Cat_MonthlyCharges', drop_first=True)], axis=1)
dados = pd.concat([dados, pd.get_dummies(dados['Cat_TotalCharges'].astype(object), prefix='Cat_TotalCharges', drop_first=True)], axis=1)

#dados.head()

### Verificando o Churn sobre o tempo de servico Tunure

In [76]:
fig = px.histogram(dados, x="tenure", color="Churn", width=800, height=300)
fig.show()


In [99]:
dados["Churn"].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

## Aplicando StandardScaler, Compressão e Balanceamento dos dados

A teoria na aplicação desta tecnica será verificar se as features comprimidas explicarão melhor minha variavel target (Churn)


In [77]:
dados.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Cat_tenure,Cat_MonthlyCharges,Cat_TotalCharges,Cat_tenure_25-49,Cat_tenure_50-74,Cat_tenure_Above-75,InternetService_1,InternetService_2,PaymentMethod_2,PaymentMethod_3,PaymentMethod_4,Contract_2,Contract_3,Cat_MonthlyCharges_25-49,Cat_MonthlyCharges_50-74,Cat_MonthlyCharges_Above-75,Cat_TotalCharges_25-49,Cat_TotalCharges_50-74,Cat_TotalCharges_Above-75
0,7590-VHVEG,0,0,1,0,1,0,0,2,0,1,0,0,0,0,1,1,1,29.85,29.85,0,0-24,0-24,0-24,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,5575-GNVDE,1,0,0,0,34,1,0,2,1,0,1,0,0,0,3,0,2,56.95,1889.5,0,50-74,25-49,50-74,0,1,0,0,1,1,0,0,0,1,1,0,0,0,1,0
2,3668-QPYBK,1,0,0,0,2,1,0,2,1,1,0,0,0,0,1,1,2,53.85,108.15,1,0-24,25-49,0-24,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0
3,7795-CFOCW,1,0,0,0,45,0,0,2,1,0,1,1,0,0,3,0,3,42.3,1840.75,0,50-74,25-49,50-74,0,1,0,0,1,0,1,0,0,1,1,0,0,0,1,0
4,9237-HQITU,0,0,0,0,2,1,0,1,0,0,0,0,0,0,1,1,1,70.7,151.65,1,0-24,50-74,0-24,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0


### Balanceando as informacoes

In [78]:
# definindo variáveis para cada uma das classes
Churn_0 = dados[dados.Churn==0]
Churn_1 = dados[dados.Churn==1]

In [79]:
# verificando o desbalanceamento
len(Churn_0),len(Churn_1)

(5174, 1869)

In [80]:
# fazendo um undersampling da classe com output zero (em maior número)
Churn_0=Churn_0.sample(n=1655)
len(Churn_0)

1655

In [81]:
# concatenando os dois DataSets com o mesmo tamanho
dadosblc = pd.concat([Churn_0,Churn_1])
dadosblc.reset_index(inplace = True, drop = True) 
dadosblc.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Cat_tenure,Cat_MonthlyCharges,Cat_TotalCharges,Cat_tenure_25-49,Cat_tenure_50-74,Cat_tenure_Above-75,InternetService_1,InternetService_2,PaymentMethod_2,PaymentMethod_3,PaymentMethod_4,Contract_2,Contract_3,Cat_MonthlyCharges_25-49,Cat_MonthlyCharges_50-74,Cat_MonthlyCharges_Above-75,Cat_TotalCharges_25-49,Cat_TotalCharges_50-74,Cat_TotalCharges_Above-75
0,9110-HSGTV,0,0,0,0,69,1,0,2,1,0,1,1,1,1,2,1,4,82.45,5555.3,0,Above-75,50-74,Above-75,0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,1
1,1197-BVMVG,0,1,0,0,4,1,1,1,0,0,0,0,0,0,1,1,1,74.45,294.45,0,0-24,50-74,0-24,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,8751-EDEKA,0,0,1,0,28,1,0,1,0,1,1,0,0,1,1,0,4,89.9,2433.5,0,25-49,Above-75,50-74,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0
3,1506-YJTYT,1,0,1,1,45,1,1,2,1,1,0,1,1,0,2,0,4,73.85,3371.0,0,50-74,50-74,50-74,0,1,0,0,1,0,0,1,1,0,0,1,0,0,1,0
4,4973-RLZVI,0,0,0,0,30,1,0,2,1,1,1,1,1,0,3,0,4,74.65,2308.6,0,50-74,50-74,50-74,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0


In [82]:
#dadosblc.isnull().sum()

### Aplicando o StandardScaler
Tenho duvida com relação as features boleanas

In [83]:
dadosblc.shape

(3524, 40)

In [84]:
from sklearn.preprocessing import StandardScaler
features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 
            'tenure', 
            'PhoneService', 'MultipleLines',
            'InternetService',
            'OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV',
            'StreamingMovies',
            'Contract',
            'PaperlessBilling',
            'PaymentMethod',
            'MonthlyCharges',
            'TotalCharges'
            
            #Features Dummies
            #'Cat_tenure_25-49','Cat_tenure_50-74','Cat_tenure_Above-75',
            #'InternetService_1','InternetService_2',
            #'PaymentMethod_2','PaymentMethod_3','PaymentMethod_4',
            #'Cat_MonthlyCharges_25-49','Cat_MonthlyCharges_50-74','Cat_MonthlyCharges_Above-75',
            #'Cat_TotalCharges_25-49','Cat_TotalCharges_50-74','Cat_TotalCharges_Above-75'
            #,'Contract_2','Contract_3'
           ]

# Separating out the features
x = dadosblc.loc[:, features].values

# Separating out the target
y = dadosblc.loc[:, ["Churn"]].values

# Standardizing the features
x = StandardScaler().fit_transform(x)

In [85]:
x.shape

(3524, 19)

In [86]:
pd.DataFrame(data=x, columns=features)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,-1.002842,-0.500532,-0.887651,-0.583467,1.766400,0.322506,-0.866384,1.261555,1.810165,-0.679883,1.460289,1.781953,1.223009,1.221565,0.666874,0.745093,1.655560,0.512249,1.674175
1,-1.002842,1.997874,-0.887651,-0.583467,-0.961012,0.322506,1.154223,-0.223321,-0.552436,-0.679883,-0.684796,-0.561182,-0.817656,-0.818622,-0.645278,0.745093,-0.974825,0.232164,-0.782858
2,-1.002842,-0.500532,1.126568,-0.583467,0.046032,0.322506,-0.866384,-0.223321,-0.552436,1.470842,1.460289,-0.561182,-0.817656,1.221565,-0.645278,-1.342115,1.655560,0.773079,0.216166
3,0.997166,-0.500532,1.126568,1.713893,0.759356,0.322506,1.154223,1.261555,1.810165,1.470842,-0.684796,1.781953,1.223009,-0.818622,0.666874,-1.342115,1.655560,0.211158,0.654017
4,-1.002842,-0.500532,-0.887651,-0.583467,0.129953,0.322506,-0.866384,1.261555,1.810165,1.470842,1.460289,1.781953,1.223009,-0.818622,1.979027,-1.342115,1.655560,0.239166,0.157833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3519,0.997166,-0.500532,-0.887651,-0.583467,-0.625331,0.322506,-0.866384,1.261555,-0.552436,-0.679883,-0.684796,1.781953,1.223009,-0.818622,1.979027,0.745093,-0.974825,-0.280742,-0.580466
3520,-1.002842,-0.500532,-0.887651,-0.583467,-0.751211,0.322506,-0.866384,1.261555,-0.552436,-0.679883,-0.684796,-0.561182,-0.817656,-0.818622,-0.645278,0.745093,0.778765,-0.826908,-0.731997
3521,0.997166,1.997874,-0.887651,-0.583467,-1.086893,0.322506,1.154223,-0.223321,-0.552436,-0.679883,-0.684796,-0.561182,-0.817656,-0.818622,-0.645278,0.745093,-0.974825,0.277678,-0.885000
3522,-1.002842,-0.500532,-0.887651,-0.583467,1.682480,0.322506,1.154223,-0.223321,1.810165,1.470842,1.460289,-0.561182,1.223009,-0.818622,-0.645278,0.745093,1.655560,1.229968,2.295783


In [87]:
y.shape

(3524, 1)

In [88]:
M = pd.DataFrame(data=y, columns=["Churn"])
M.Churn.value_counts()

1    1869
0    1655
Name: Churn, dtype: int64

### Aplicando PCA

In [89]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2,random_state=42)

principalComponents = pca.fit_transform(x)

principalDf = pd.DataFrame(data = principalComponents 
             , columns = ['A', 'B'])

In [90]:
principalDf.head()

Unnamed: 0,A,B
0,3.415528,0.55075
1,-1.728772,-2.17316
2,0.867405,0.56518
3,2.625843,2.537627
4,2.143131,1.741872


In [91]:
finalDf = pd.concat([principalDf, dadosblc['Churn']], axis = 1)
finalDf.head()

Unnamed: 0,A,B,Churn
0,3.415528,0.55075,0
1,-1.728772,-2.17316,0
2,0.867405,0.56518,0
3,2.625843,2.537627,0
4,2.143131,1.741872,0


In [92]:
finalDf.isnull().sum()

A        0
B        0
Churn    0
dtype: int64

# Avaliando os Modelos a serem aplicados

In [93]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
%matplotlib inline

In [94]:
# Separating out the features
X = finalDf.loc[:, ["A", "B"]].values
#X = x
# Separating out the target
Y = finalDf.loc[:,['Churn']].values

In [95]:
X.shape

(3524, 2)

In [96]:
Y.shape

(3524, 1)

In [97]:
# separando os dados em treino e teste
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, Y, test_size=0.25, random_state=42, stratify=Y)

In [98]:
# %load solutions/solution_13.py

# importando as bibliotecas dos modelos
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# definindo o tamanho da figura para o gráfico
plt.figure(figsize=(12,8)) 
# criando uma lista com todos os modelos
classifiers =[
    KNeighborsClassifier(3),
    GaussianNB(),
    LogisticRegression(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    ]

# criando uma funçào para rodas o pipeline 
for clf in classifiers:
    # ajustando o modelo
    clf.fit(X_train, y_train)
    
    # armazenando o nome do modelo
    name = clf.__class__.__name__
    # imprimindo o nome do modelo
    print("="*30)
    print(name)
    # imprimindo os resultados
    print('****Results****')
    # fazendo predições
    # calculando as métricas
    y_pred = clf.predict(X_test)
    # imprimindo as métricas
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("Precision:", metrics.precision_score(y_test, y_pred))
    print("Recall:", metrics.recall_score(y_test, y_pred))

    #Printando a Matriz de Confusao
    #print("*"*30)
    #print(confusion_matrix(y_test,y_pred))
    #print(pd.crosstab(y_test, y_pred,rownames=['Real'], colnames=['Predito'], margins=True))
    

    # plotando a curva ROC
    #y_pred_proba = clf.predict_proba(X_test)[::,1]
    #fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
    #auc = metrics.roc_auc_score(y_test, y_pred_proba)
    #plt.plot(fpr,tpr,label=name+", auc="+str(auc))
    #plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
    #plt.legend(loc=4)

KNeighborsClassifier
****Results****
Accuracy: 0.7116912599318955
Precision: 0.7092337917485265
Recall: 0.7730192719486081
GaussianNB
****Results****
Accuracy: 0.720771850170261
Precision: 0.7120921305182342
Recall: 0.7944325481798715
LogisticRegression
****Results****
Accuracy: 0.7128263337116912
Precision: 0.7049808429118773
Recall: 0.7880085653104925
SVC
****Results****
Accuracy: 0.7185017026106697
Precision: 0.7069943289224953
Recall: 0.8008565310492506
DecisionTreeClassifier
****Results****
Accuracy: 0.6526674233825198
Precision: 0.6709129511677282
Recall: 0.6766595289079229
RandomForestClassifier
****Results****
Accuracy: 0.6912599318955732
Precision: 0.7171492204899778
Recall: 0.6895074946466809
GradientBoostingClassifier
****Results****
Accuracy: 0.7162315550510783
Precision: 0.7148514851485148
Recall: 0.7730192719486081


<Figure size 864x576 with 0 Axes>