In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/telecom-users-dataset/telecom_users.csv


### Intro. Columns Info

* customerID - customer id
* gender - client gender (male / female)
* SeniorCitizen - is the client retired (1, 0)
* Partner - is the client married (Yes, No)
* tenure - how many months a person has been a client of the company
* PhoneService - is the telephone service connected (Yes, No)
* MultipleLines - are multiple phone lines connected (Yes, No, No phone service)
* InternetService - client's Internet service provider (DSL, Fiber optic, No)
* OnlineSecurity - is the online security service connected (Yes, No, No internet service)
* OnlineBackup - is the online backup service activated (Yes, No, No internet service)
* DeviceProtection - does the client have equipment insurance (Yes, No, No internet service)
* TechSupport - is the technical support service connected (Yes, No, No internet service)
* StreamingTV - is the streaming TV service connected (Yes, No, No internet service)
* StreamingMovies - is the streaming cinema service activated (Yes, No, No internet service)
* Contract - type of customer contract (Month-to-month, One year, Two year)
* PaperlessBilling - whether the client uses paperless billing (Yes, No)
* PaymentMethod - payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
* MonthlyCharges - current monthly payment
* TotalCharges - the total amount that the client paid for the services for the entire time
* Churn - whether there was a churn (Yes or No)

### Step 1. Data Load & EDA

In [2]:
df = pd.read_csv('/kaggle/input/telecom-users-dataset/telecom_users.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5986 entries, 0 to 5985
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        5986 non-null   int64  
 1   customerID        5986 non-null   object 
 2   gender            5986 non-null   object 
 3   SeniorCitizen     5986 non-null   int64  
 4   Partner           5986 non-null   object 
 5   Dependents        5986 non-null   object 
 6   tenure            5986 non-null   int64  
 7   PhoneService      5986 non-null   object 
 8   MultipleLines     5986 non-null   object 
 9   InternetService   5986 non-null   object 
 10  OnlineSecurity    5986 non-null   object 
 11  OnlineBackup      5986 non-null   object 
 12  DeviceProtection  5986 non-null   object 
 13  TechSupport       5986 non-null   object 
 14  StreamingTV       5986 non-null   object 
 15  StreamingMovies   5986 non-null   object 
 16  Contract          5986 non-null   object 


* length of data - x : y = (5986 * 21) : (5986 x 1, Churn)

In [4]:
df.drop(['Unnamed: 0', 'customerID'], axis=1, inplace=True)

In [5]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,5986.0,5986.0,5986.0
mean,0.161377,32.46876,64.802213
std,0.367909,24.516391,30.114702
min,0.0,0.0,18.25
25%,0.0,9.0,35.65
50%,0.0,29.0,70.4
75%,0.0,56.0,89.9
max,1.0,72.0,118.75


In [6]:
df.gender.unique()

array(['Male', 'Female'], dtype=object)

### Step 2. Data Preprocessing

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
le = LabelEncoder()
result = le.fit_transform(df['gender'])
print(result)

[1 0 0 ... 1 1 1]


In [9]:
print(df.gender)
print(result)

0         Male
1       Female
2       Female
3         Male
4         Male
         ...  
5981      Male
5982    Female
5983      Male
5984      Male
5985      Male
Name: gender, Length: 5986, dtype: object
[1 0 0 ... 1 1 1]


In [10]:
df.gender = result

In [11]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,Yes,Yes,72,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),24.10,1734.65,No
1,0,0,No,No,44,Yes,No,Fiber optic,No,Yes,Yes,No,Yes,No,Month-to-month,Yes,Credit card (automatic),88.15,3973.2,No
2,0,1,Yes,No,38,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Bank transfer (automatic),74.95,2869.85,Yes
3,1,0,No,No,4,Yes,No,DSL,No,No,No,No,No,Yes,Month-to-month,Yes,Electronic check,55.90,238.5,No
4,1,0,No,No,2,Yes,No,DSL,Yes,No,Yes,No,No,No,Month-to-month,No,Electronic check,53.45,119.5,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5981,1,0,Yes,No,1,Yes,No,Fiber optic,Yes,No,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,95.00,95,Yes
5982,0,0,Yes,Yes,23,Yes,Yes,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),91.10,2198.3,No
5983,1,0,Yes,Yes,12,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Electronic check,21.15,306.05,No
5984,1,1,No,No,12,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.45,1200.15,Yes


In [12]:
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5986 entries, 0 to 5985
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5986 non-null   int64  
 1   SeniorCitizen     5986 non-null   int64  
 2   Partner           5986 non-null   object 
 3   Dependents        5986 non-null   object 
 4   tenure            5986 non-null   int64  
 5   PhoneService      5986 non-null   object 
 6   MultipleLines     5986 non-null   object 
 7   InternetService   5986 non-null   object 
 8   OnlineSecurity    5986 non-null   object 
 9   OnlineBackup      5986 non-null   object 
 10  DeviceProtection  5986 non-null   object 
 11  TechSupport       5986 non-null   object 
 12  StreamingTV       5986 non-null   object 
 13  StreamingMovies   5986 non-null   object 
 14  Contract          5986 non-null   object 
 15  PaperlessBilling  5986 non-null   object 
 16  PaymentMethod     5986 non-null   object 


#### Check unique values in cate-columns

In [14]:
df.Partner.unique()

array(['Yes', 'No'], dtype=object)

In [15]:
df.Dependents.unique()

array(['Yes', 'No'], dtype=object)

#### Just make the for loop

In [16]:
for i in df.columns:
    print(i)

gender
SeniorCitizen
Partner
Dependents
tenure
PhoneService
MultipleLines
InternetService
OnlineSecurity
OnlineBackup
DeviceProtection
TechSupport
StreamingTV
StreamingMovies
Contract
PaperlessBilling
PaymentMethod
MonthlyCharges
TotalCharges
Churn


In [17]:
for i in df.columns:
    print('columns name : {} \n==> unique values :'.format(i),df['{}'.format(i)].unique())

columns name : gender 
==> unique values : [1 0]
columns name : SeniorCitizen 
==> unique values : [0 1]
columns name : Partner 
==> unique values : ['Yes' 'No']
columns name : Dependents 
==> unique values : ['Yes' 'No']
columns name : tenure 
==> unique values : [72 44 38  4  2 70 33  1 39 55 52 30 60 50 32 51 69 42 14 62  5 63 67 40
 65 16 46 11 49 68 10 53 54 15  3 71  8 64 57 20 26 31  7 35  6 13 23  9
 45 17 34 58 12 25 28 29 43 19 41 37 27 22 24 18 56 66 59 48 47 61 21  0
 36]
columns name : PhoneService 
==> unique values : ['Yes' 'No']
columns name : MultipleLines 
==> unique values : ['Yes' 'No' 'No phone service']
columns name : InternetService 
==> unique values : ['No' 'Fiber optic' 'DSL']
columns name : OnlineSecurity 
==> unique values : ['No internet service' 'No' 'Yes']
columns name : OnlineBackup 
==> unique values : ['No internet service' 'Yes' 'No']
columns name : DeviceProtection 
==> unique values : ['No internet service' 'Yes' 'No']
columns name : TechSupport 
==

#### ['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'PaperlessBilling', 'Churn'] ==> Yes or No

In [18]:
YesNoCol = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'PaperlessBilling', 'Churn'] 

In [19]:
for col in YesNoCol:
    df[col] = le.fit_transform(df[col])

In [20]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,1,1,72,1,2,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,0,Credit card (automatic),24.10,1734.65,0
1,0,0,0,0,44,1,0,Fiber optic,No,Yes,Yes,No,Yes,No,Month-to-month,1,Credit card (automatic),88.15,3973.2,0
2,0,1,1,0,38,1,2,Fiber optic,No,No,No,No,No,No,Month-to-month,1,Bank transfer (automatic),74.95,2869.85,1
3,1,0,0,0,4,1,0,DSL,No,No,No,No,No,Yes,Month-to-month,1,Electronic check,55.90,238.5,0
4,1,0,0,0,2,1,0,DSL,Yes,No,Yes,No,No,No,Month-to-month,0,Electronic check,53.45,119.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5981,1,0,1,0,1,1,0,Fiber optic,Yes,No,No,No,Yes,Yes,Month-to-month,1,Electronic check,95.00,95,1
5982,0,0,1,1,23,1,2,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,1,Credit card (automatic),91.10,2198.3,0
5983,1,0,1,1,12,1,0,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,1,Electronic check,21.15,306.05,0
5984,1,1,0,0,12,1,2,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,1,Electronic check,99.45,1200.15,1


#### One more check (Yes No)

In [21]:
for i in df.columns:
    print('columns name : {} \n==> unique values :'.format(i),df['{}'.format(i)].unique())

columns name : gender 
==> unique values : [1 0]
columns name : SeniorCitizen 
==> unique values : [0 1]
columns name : Partner 
==> unique values : [1 0]
columns name : Dependents 
==> unique values : [1 0]
columns name : tenure 
==> unique values : [72 44 38  4  2 70 33  1 39 55 52 30 60 50 32 51 69 42 14 62  5 63 67 40
 65 16 46 11 49 68 10 53 54 15  3 71  8 64 57 20 26 31  7 35  6 13 23  9
 45 17 34 58 12 25 28 29 43 19 41 37 27 22 24 18 56 66 59 48 47 61 21  0
 36]
columns name : PhoneService 
==> unique values : [1 0]
columns name : MultipleLines 
==> unique values : [2 0 1]
columns name : InternetService 
==> unique values : ['No' 'Fiber optic' 'DSL']
columns name : OnlineSecurity 
==> unique values : ['No internet service' 'No' 'Yes']
columns name : OnlineBackup 
==> unique values : ['No internet service' 'Yes' 'No']
columns name : DeviceProtection 
==> unique values : ['No internet service' 'Yes' 'No']
columns name : TechSupport 
==> unique values : ['No internet service' 'No'

#### I want to encoding ['No internet service','Yes','No'] to [1,0] ('No internet service' ==> 'No') but simultaneously save 'No internet service'
#### Let's create New Column

In [22]:
df.OnlineSecurity == 'No internet service'

0        True
1       False
2       False
3       False
4       False
        ...  
5981    False
5982    False
5983     True
5984    False
5985     True
Name: OnlineSecurity, Length: 5986, dtype: bool

In [23]:
pd.Series(df.OnlineSecurity == 'No internet service')

0        True
1       False
2       False
3       False
4       False
        ...  
5981    False
5982    False
5983     True
5984    False
5985     True
Name: OnlineSecurity, Length: 5986, dtype: bool

In [24]:
temp = pd.Series(df.OnlineSecurity == 'No internet service')

In [25]:
temp

0        True
1       False
2       False
3       False
4       False
        ...  
5981    False
5982    False
5983     True
5984    False
5985     True
Name: OnlineSecurity, Length: 5986, dtype: bool

In [26]:
df['InternetService'] = le.fit_transform(temp)


In [27]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,1,1,72,1,2,1,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,0,Credit card (automatic),24.10,1734.65,0
1,0,0,0,0,44,1,0,0,No,Yes,Yes,No,Yes,No,Month-to-month,1,Credit card (automatic),88.15,3973.2,0
2,0,1,1,0,38,1,2,0,No,No,No,No,No,No,Month-to-month,1,Bank transfer (automatic),74.95,2869.85,1
3,1,0,0,0,4,1,0,0,No,No,No,No,No,Yes,Month-to-month,1,Electronic check,55.90,238.5,0
4,1,0,0,0,2,1,0,0,Yes,No,Yes,No,No,No,Month-to-month,0,Electronic check,53.45,119.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5981,1,0,1,0,1,1,0,0,Yes,No,No,No,Yes,Yes,Month-to-month,1,Electronic check,95.00,95,1
5982,0,0,1,1,23,1,2,0,Yes,Yes,Yes,Yes,Yes,Yes,Two year,1,Credit card (automatic),91.10,2198.3,0
5983,1,0,1,1,12,1,0,1,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,1,Electronic check,21.15,306.05,0
5984,1,1,0,0,12,1,2,0,No,No,Yes,No,Yes,Yes,Month-to-month,1,Electronic check,99.45,1200.15,1


#### Yes!! done, Next, remove 'No Internet service' values

In [28]:
'h' in 'heello'

True

In [29]:
df['OnlineSecurity'][-5:]

5981                    Yes
5982                    Yes
5983    No internet service
5984                     No
5985    No internet service
Name: OnlineSecurity, dtype: object

In [30]:
[0 if 'No' in x else 1 for x in df['OnlineSecurity']][-5:]

[1, 1, 0, 0, 0]

#### One-Hot Encoding

In [31]:
temp = ['OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']
for col in temp:
    df[col] = [0 if 'No' in x else 1 for x in df[col]]

In [32]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,1,1,72,1,2,1,0,0,0,0,0,0,Two year,0,Credit card (automatic),24.10,1734.65,0
1,0,0,0,0,44,1,0,0,0,1,1,0,1,0,Month-to-month,1,Credit card (automatic),88.15,3973.2,0
2,0,1,1,0,38,1,2,0,0,0,0,0,0,0,Month-to-month,1,Bank transfer (automatic),74.95,2869.85,1
3,1,0,0,0,4,1,0,0,0,0,0,0,0,1,Month-to-month,1,Electronic check,55.90,238.5,0
4,1,0,0,0,2,1,0,0,1,0,1,0,0,0,Month-to-month,0,Electronic check,53.45,119.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5981,1,0,1,0,1,1,0,0,1,0,0,0,1,1,Month-to-month,1,Electronic check,95.00,95,1
5982,0,0,1,1,23,1,2,0,1,1,1,1,1,1,Two year,1,Credit card (automatic),91.10,2198.3,0
5983,1,0,1,1,12,1,0,1,0,0,0,0,0,0,Month-to-month,1,Electronic check,21.15,306.05,0
5984,1,1,0,0,12,1,2,0,0,0,1,0,1,1,Month-to-month,1,Electronic check,99.45,1200.15,1


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5986 entries, 0 to 5985
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5986 non-null   int64  
 1   SeniorCitizen     5986 non-null   int64  
 2   Partner           5986 non-null   int64  
 3   Dependents        5986 non-null   int64  
 4   tenure            5986 non-null   int64  
 5   PhoneService      5986 non-null   int64  
 6   MultipleLines     5986 non-null   int64  
 7   InternetService   5986 non-null   int64  
 8   OnlineSecurity    5986 non-null   int64  
 9   OnlineBackup      5986 non-null   int64  
 10  DeviceProtection  5986 non-null   int64  
 11  TechSupport       5986 non-null   int64  
 12  StreamingTV       5986 non-null   int64  
 13  StreamingMovies   5986 non-null   int64  
 14  Contract          5986 non-null   object 
 15  PaperlessBilling  5986 non-null   int64  
 16  PaymentMethod     5986 non-null   object 


In [34]:
[float(x) for x in df.TotalCharges]

ValueError: could not convert string to float: 

#### Something wrong! check the problem!

In [35]:
count = 0
for n in df.TotalCharges:
    count += 1
    try:
        float(n)
    except ValueError as e:
        print(count)

357
635
2772
3087
3256
4327
5376
5383
5696
5952


In [36]:
df.TotalCharges[356]

' '

#### Find Null Values! Let's convert that to 0

In [37]:
idx = [357, 635,2772, 3087,3256,4327,5376,5383,5696,5952]
for i in idx:
    df.TotalCharges[i-1] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [38]:
df.TotalCharges[356]

0

In [39]:
df.TotalCharges = [float(x) for x in df.TotalCharges]

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5986 entries, 0 to 5985
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5986 non-null   int64  
 1   SeniorCitizen     5986 non-null   int64  
 2   Partner           5986 non-null   int64  
 3   Dependents        5986 non-null   int64  
 4   tenure            5986 non-null   int64  
 5   PhoneService      5986 non-null   int64  
 6   MultipleLines     5986 non-null   int64  
 7   InternetService   5986 non-null   int64  
 8   OnlineSecurity    5986 non-null   int64  
 9   OnlineBackup      5986 non-null   int64  
 10  DeviceProtection  5986 non-null   int64  
 11  TechSupport       5986 non-null   int64  
 12  StreamingTV       5986 non-null   int64  
 13  StreamingMovies   5986 non-null   int64  
 14  Contract          5986 non-null   object 
 15  PaperlessBilling  5986 non-null   int64  
 16  PaymentMethod     5986 non-null   object 


#### and object columns remained are ['Contract','PaymentMethod']|

### Let's EDA

In [41]:
df.Contract.unique()

array(['Two year', 'Month-to-month', 'One year'], dtype=object)

In [42]:
df.PaymentMethod.unique()

array(['Credit card (automatic)', 'Bank transfer (automatic)',
       'Electronic check', 'Mailed check'], dtype=object)

#### I Think that's two columns are features, Let's convert them to dummies

In [43]:
pd.get_dummies(df.Contract)

Unnamed: 0,Month-to-month,One year,Two year
0,0,0,1
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
5981,1,0,0
5982,0,0,1
5983,1,0,0
5984,1,0,0


In [44]:
dummy_col1 = pd.get_dummies(df.Contract)

In [45]:
pd.get_dummies(df.PaymentMethod)

Unnamed: 0,Bank transfer (automatic),Credit card (automatic),Electronic check,Mailed check
0,0,1,0,0
1,0,1,0,0
2,1,0,0,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
5981,0,0,1,0
5982,0,1,0,0
5983,0,0,1,0
5984,0,0,1,0


In [46]:
dummy_col2 = pd.get_dummies(df.PaymentMethod)

In [47]:
pd.concat([df, dummy_col1, dummy_col2], axis=1)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,MonthlyCharges,TotalCharges,Churn,Month-to-month,One year,Two year,Bank transfer (automatic),Credit card (automatic),Electronic check,Mailed check
0,1,0,1,1,72,1,2,1,0,0,...,24.10,1734.65,0,0,0,1,0,1,0,0
1,0,0,0,0,44,1,0,0,0,1,...,88.15,3973.20,0,1,0,0,0,1,0,0
2,0,1,1,0,38,1,2,0,0,0,...,74.95,2869.85,1,1,0,0,1,0,0,0
3,1,0,0,0,4,1,0,0,0,0,...,55.90,238.50,0,1,0,0,0,0,1,0
4,1,0,0,0,2,1,0,0,1,0,...,53.45,119.50,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5981,1,0,1,0,1,1,0,0,1,0,...,95.00,95.00,1,1,0,0,0,0,1,0
5982,0,0,1,1,23,1,2,0,1,1,...,91.10,2198.30,0,0,0,1,0,1,0,0
5983,1,0,1,1,12,1,0,1,0,0,...,21.15,306.05,0,1,0,0,0,0,1,0
5984,1,1,0,0,12,1,2,0,0,0,...,99.45,1200.15,1,1,0,0,0,0,1,0


In [48]:
df = pd.concat([df, dummy_col1, dummy_col2], axis=1)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5986 entries, 0 to 5985
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   gender                     5986 non-null   int64  
 1   SeniorCitizen              5986 non-null   int64  
 2   Partner                    5986 non-null   int64  
 3   Dependents                 5986 non-null   int64  
 4   tenure                     5986 non-null   int64  
 5   PhoneService               5986 non-null   int64  
 6   MultipleLines              5986 non-null   int64  
 7   InternetService            5986 non-null   int64  
 8   OnlineSecurity             5986 non-null   int64  
 9   OnlineBackup               5986 non-null   int64  
 10  DeviceProtection           5986 non-null   int64  
 11  TechSupport                5986 non-null   int64  
 12  StreamingTV                5986 non-null   int64  
 13  StreamingMovies            5986 non-null   int64

In [50]:
df.drop(['Contract','PaymentMethod'], axis=1).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5986 entries, 0 to 5985
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   gender                     5986 non-null   int64  
 1   SeniorCitizen              5986 non-null   int64  
 2   Partner                    5986 non-null   int64  
 3   Dependents                 5986 non-null   int64  
 4   tenure                     5986 non-null   int64  
 5   PhoneService               5986 non-null   int64  
 6   MultipleLines              5986 non-null   int64  
 7   InternetService            5986 non-null   int64  
 8   OnlineSecurity             5986 non-null   int64  
 9   OnlineBackup               5986 non-null   int64  
 10  DeviceProtection           5986 non-null   int64  
 11  TechSupport                5986 non-null   int64  
 12  StreamingTV                5986 non-null   int64  
 13  StreamingMovies            5986 non-null   int64

In [51]:
df.drop(['Contract','PaymentMethod'], axis=1,inplace=True)

In [52]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,MonthlyCharges,TotalCharges,Churn,Month-to-month,One year,Two year,Bank transfer (automatic),Credit card (automatic),Electronic check,Mailed check
0,1,0,1,1,72,1,2,1,0,0,...,24.10,1734.65,0,0,0,1,0,1,0,0
1,0,0,0,0,44,1,0,0,0,1,...,88.15,3973.20,0,1,0,0,0,1,0,0
2,0,1,1,0,38,1,2,0,0,0,...,74.95,2869.85,1,1,0,0,1,0,0,0
3,1,0,0,0,4,1,0,0,0,0,...,55.90,238.50,0,1,0,0,0,0,1,0
4,1,0,0,0,2,1,0,0,1,0,...,53.45,119.50,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5981,1,0,1,0,1,1,0,0,1,0,...,95.00,95.00,1,1,0,0,0,0,1,0
5982,0,0,1,1,23,1,2,0,1,1,...,91.10,2198.30,0,0,0,1,0,1,0,0
5983,1,0,1,1,12,1,0,1,0,0,...,21.15,306.05,0,1,0,0,0,0,1,0
5984,1,1,0,0,12,1,2,0,0,0,...,99.45,1200.15,1,1,0,0,0,0,1,0


#### Preprocessing - All Clear

### Step 3. Dataset Split (Train : Test = 0.8 : 0.2)

In [53]:
point = [0.8, 0.2]

In [54]:
np.random.permutation(len(df))

array([1359, 2217, 2849, ..., 1331, 3237, 2761])

In [55]:
np.random.permutation(len(df)).max()

5985

In [56]:
seed_idx = np.random.permutation(len(df))

#### Shuffle

In [57]:
# Shuffle Test
df.set_index(seed_idx)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,MonthlyCharges,TotalCharges,Churn,Month-to-month,One year,Two year,Bank transfer (automatic),Credit card (automatic),Electronic check,Mailed check
1270,1,0,1,1,72,1,2,1,0,0,...,24.10,1734.65,0,0,0,1,0,1,0,0
804,0,0,0,0,44,1,0,0,0,1,...,88.15,3973.20,0,1,0,0,0,1,0,0
3652,0,1,1,0,38,1,2,0,0,0,...,74.95,2869.85,1,1,0,0,1,0,0,0
3855,1,0,0,0,4,1,0,0,0,0,...,55.90,238.50,0,1,0,0,0,0,1,0
3062,1,0,0,0,2,1,0,0,1,0,...,53.45,119.50,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,1,0,1,0,1,1,0,0,1,0,...,95.00,95.00,1,1,0,0,0,0,1,0
2535,0,0,1,1,23,1,2,0,1,1,...,91.10,2198.30,0,0,0,1,0,1,0,0
5930,1,0,1,1,12,1,0,1,0,0,...,21.15,306.05,0,1,0,0,0,0,1,0
5738,1,1,0,0,12,1,2,0,0,0,...,99.45,1200.15,1,1,0,0,0,0,1,0


In [58]:
train_size = round(len(df)*point[0])
valid_size = round(len(df)*point[1])
print('train_size : ', train_size)
print('test_size : ', valid_size)

train_size :  4789
test_size :  1197


In [59]:
X = df.set_index(seed_idx).drop('Churn',axis=1)
y = df.set_index(seed_idx)['Churn']

In [60]:
# split
train_x = X[:train_size]
train_y = y[:train_size]
test_x = X[train_size:]
test_y = y[train_size:]

In [61]:
print(len(train_x), len(train_y))
print(len(test_x), len(test_y))

4789 4789
1197 1197


### Step 4. Modeling : Logistic Regression

In [62]:
from sklearn.linear_model import LogisticRegression
#from sklearn.preprocessing import StandardScaler

In [63]:
model = LogisticRegression(max_iter=500)

#### Model training

In [64]:
model.fit(train_x,train_y)

LogisticRegression(max_iter=500)

#### Prediction

In [65]:
pred = model.predict(test_x)

#### Result

In [66]:
print('[Accuracy]Simple Logistic Regresssion Model : ',(pred == test_y).sum() / len(pred))

[Accuracy]Simple Logistic Regresssion Model :  0.8061821219715957
