# Prepare Stage

## Goals for this notebook:

    1. Split data
    2. Handle missing and nulls
    3. Handle outliers
    4. Encode variables
    5. Scale data
    6. Create new features

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

import prepare

In [2]:
# Acquire data from acquire.py
telco = pd.read_csv('telco_churn.csv')
# function to change the types of variables goes here, or hopefully its covered in the acquire file

In [3]:
telco.head()

Unnamed: 0.1,Unnamed: 0,contract_type_id,internet_service_type_id,payment_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,payment_type,internet_service_type,contract_type
0,0,1,1,2,0003-MKNFE,Male,0,No,No,9,...,No,No,Yes,No,59.9,542.4,No,Mailed check,DSL,Month-to-month
1,1,1,1,4,0013-MHZWF,Female,0,No,Yes,9,...,Yes,Yes,Yes,Yes,69.4,571.45,No,Credit card (automatic),DSL,Month-to-month
2,2,1,1,1,0015-UOCOJ,Female,1,No,No,7,...,No,No,No,Yes,48.2,340.35,No,Electronic check,DSL,Month-to-month
3,3,1,1,1,0023-HGHWL,Male,1,No,No,1,...,No,No,No,Yes,25.1,25.1,Yes,Electronic check,DSL,Month-to-month
4,4,1,1,3,0032-PGELS,Female,0,Yes,Yes,1,...,No,No,No,No,30.5,30.5,Yes,Bank transfer (automatic),DSL,Month-to-month


In [4]:
telco.internet_service_type.value_counts(), telco.internet_service_type_id.value_counts()

(Fiber optic    3096
 DSL            2421
 None           1526
 Name: internet_service_type, dtype: int64,
 2    3096
 1    2421
 3    1526
 Name: internet_service_type_id, dtype: int64)

### Split data

In [5]:
train, test = prepare.split_my_data(telco, 0.8)
train, valid = prepare.split_my_data(train, 0.8)

### Handle missing and null values

In [6]:
telco.isna().sum()

Unnamed: 0                   0
contract_type_id             0
internet_service_type_id     0
payment_type_id              0
customer_id                  0
gender                       0
senior_citizen               0
partner                      0
dependents                   0
tenure                       0
phone_service                0
multiple_lines               0
online_security              0
online_backup                0
device_protection            0
tech_support                 0
streaming_tv                 0
streaming_movies             0
paperless_billing            0
monthly_charges              0
total_charges               11
churn                        0
payment_type                 0
internet_service_type        0
contract_type                0
dtype: int64

In [7]:
# Look at the null values --> it looks like they come from those with tenure of 0
telco[telco.total_charges.isna() == True][['tenure', 'monthly_charges', 'total_charges', 'churn']]

Unnamed: 0,tenure,monthly_charges,total_charges,churn
5086,0,19.7,,No
5433,0,56.05,,No
5504,0,61.9,,No
5584,0,73.35,,No
5603,0,52.55,,No
5687,0,80.85,,No
6569,0,20.0,,No
6605,0,20.25,,No
6615,0,25.35,,No
6686,0,25.75,,No


In [8]:
telco[telco.tenure == 1][['tenure', 'monthly_charges', 'total_charges', 'churn']]

Unnamed: 0,tenure,monthly_charges,total_charges,churn
3,1,25.10,25.10,Yes
4,1,30.50,30.50,Yes
7,1,44.30,44.30,No
20,1,50.10,50.10,No
34,1,48.55,48.55,Yes
...,...,...,...,...
5088,1,18.80,18.80,No
5181,1,19.65,19.65,No
5217,1,20.20,20.20,No
6475,1,20.35,20.35,No


In [9]:
# These values are null because the tenure is 0, so they haven't been charged yet.
# We'll assume that they are going to be charged.
# So the monthly charge value will fill in the null values.

def handle_nulls(df):
    df.total_charges = df['total_charges'].fillna(df['monthly_charges'])
    return df

In [10]:
telco_test = handle_nulls(telco)
telco_test[telco_test.tenure == 0]

Unnamed: 0.1,Unnamed: 0,contract_type_id,internet_service_type_id,payment_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,payment_type,internet_service_type,contract_type
5086,5086,2,3,2,2923-ARZLG,Male,0,Yes,Yes,0,...,No internet service,No internet service,No internet service,Yes,19.7,19.7,No,Mailed check,,One year
5433,5433,3,1,4,1371-DWPAZ,Female,0,Yes,Yes,0,...,Yes,Yes,No,No,56.05,56.05,No,Credit card (automatic),DSL,Two year
5504,5504,3,1,3,2775-SEFEE,Male,0,No,Yes,0,...,Yes,No,No,Yes,61.9,61.9,No,Bank transfer (automatic),DSL,Two year
5584,5584,3,1,2,4075-WKNIU,Female,0,Yes,Yes,0,...,Yes,Yes,No,No,73.35,73.35,No,Mailed check,DSL,Two year
5603,5603,3,1,3,4472-LVYGI,Female,0,Yes,Yes,0,...,Yes,Yes,No,Yes,52.55,52.55,No,Bank transfer (automatic),DSL,Two year
5687,5687,3,1,2,5709-LVOEQ,Female,0,Yes,Yes,0,...,No,Yes,Yes,No,80.85,80.85,No,Mailed check,DSL,Two year
6569,6569,3,3,2,2520-SGTTA,Female,0,Yes,Yes,0,...,No internet service,No internet service,No internet service,No,20.0,20.0,No,Mailed check,,Two year
6605,6605,3,3,2,3115-CZMZD,Male,0,No,Yes,0,...,No internet service,No internet service,No internet service,No,20.25,20.25,No,Mailed check,,Two year
6615,6615,3,3,2,3213-VVOLG,Male,0,Yes,Yes,0,...,No internet service,No internet service,No internet service,No,25.35,25.35,No,Mailed check,,Two year
6686,6686,3,3,2,4367-NUYAO,Male,0,Yes,Yes,0,...,No internet service,No internet service,No internet service,No,25.75,25.75,No,Mailed check,,Two year


In [11]:
train = prepare.handle_nulls(train)
valid = prepare.handle_nulls(valid)
train = prepare.handle_nulls(test)

### Handle outliers

In [12]:
# Identify outliers -- tenure, monthly_charges, total_charges
# The outliers in this dataset are informationally valuable, so we want to keep them.
# Therefore, we'll just make sure that when we scale this dataset, we take them into account.

### Encode variables

> variables with Yes or No will be converted to 1 or 0

In [13]:
def boolean_labeler(df, col):
    le = preprocessing.LabelEncoder()
    df[f'{col}_enc'] = le.fit_transform(df[col])
    return df

In [14]:
train = prepare.boolean_labeler(train, 'gender')
valid = prepare.boolean_labeler(valid, 'gender')
test = prepare.boolean_labeler(test, 'gender')

In [15]:
# covered by new feature

train = prepare.boolean_labeler(train, 'partner')
valid = prepare.boolean_labeler(valid, 'partner')
test = prepare.boolean_labeler(test, 'partner')

In [16]:
# covered by new feature

train = prepare.boolean_labeler(train, 'dependents')
valid = prepare.boolean_labeler(valid, 'dependents')
test = prepare.boolean_labeler(test, 'dependents')

In [17]:
# covered by new feature

train = prepare.boolean_labeler(train, 'phone_service')
valid = prepare.boolean_labeler(valid, 'phone_service')
test = prepare.boolean_labeler(test, 'phone_service')

In [18]:
train = prepare.boolean_labeler(train, 'paperless_billing')
valid = prepare.boolean_labeler(valid, 'paperless_billing')
test = prepare.boolean_labeler(test, 'paperless_billing')

> Variables with Yes, No and a third option for not applicable: the third option will be converted to a 0 indicating no

In [19]:
def more_than_two_labels(df, col):
    df[f'{col}_enc'] = np.where(df[col] == 'No', '0',
                                np.where(df[col] == 'Yes', '1', '0'))
    return df

In [20]:
# covered by new feature

train = prepare.more_than_two_labels(train, 'multiple_lines')
valid = prepare.more_than_two_labels(valid, 'multiple_lines')
test = prepare.more_than_two_labels(test, 'multiple_lines')

In [21]:
# covered by new feature

train = prepare.more_than_two_labels(train, 'online_security')
valid = prepare.more_than_two_labels(valid, 'online_security')
test = prepare.more_than_two_labels(test, 'online_security')

In [22]:
# covered by new feature

train = prepare.more_than_two_labels(train, 'online_backup')
valid = prepare.more_than_two_labels(valid, 'online_backup')
test = prepare.more_than_two_labels(test, 'online_backup')

In [23]:
# covered by new feature

train = prepare.more_than_two_labels(train, 'device_protection')
valid = prepare.more_than_two_labels(valid, 'device_protection')
test = prepare.more_than_two_labels(test, 'device_protection')

In [24]:
# covered by new feature

train = prepare.more_than_two_labels(train, 'tech_support')
valid = prepare.more_than_two_labels(valid, 'tech_support')
test = prepare.more_than_two_labels(test, 'tech_support')

In [25]:
# covered by new feature

train = prepare.more_than_two_labels(train, 'streaming_tv')
valid = prepare.more_than_two_labels(valid, 'streaming_tv')
test = prepare.more_than_two_labels(test, 'streaming_tv')

In [26]:
# covered by new feature

train = prepare.more_than_two_labels(train, 'streaming_movies')
valid = prepare.more_than_two_labels(valid, 'streaming_movies')
test = prepare.more_than_two_labels(test, 'streaming_movies')

In [27]:
# Payment type needs to be encoded
telco.payment_type.value_counts()

Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: payment_type, dtype: int64

In [28]:
# Since there are 4 values, I will make one-hot columns for each, and an extra for if they pay automatically
def payment_type(df):
    df['paid_elec_check'] = np.where(df.payment_type == 'Electronic check', 1, 0)
    df['paid_mail'] = np.where(df.payment_type == 'Mailed check', 1, 0)
    df['paid_bank'] = np.where(df.payment_type == 'Bank transfer (automatic)', 1, 0)
    df['paid_cc'] = np.where(df.payment_type == 'Credit card (automatic)', 1, 0)
    df['paid_auto'] = np.where(df.payment_type == 'Credit card (automatic)', 1,
                               np.where(df.payment_type == 'Bank transfer (automatic)', 1, 0))
    return df

In [29]:
telco_test = payment_type(telco)
telco_test.head()

Unnamed: 0.1,Unnamed: 0,contract_type_id,internet_service_type_id,payment_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,...,total_charges,churn,payment_type,internet_service_type,contract_type,paid_elec_check,paid_mail,paid_bank,paid_cc,paid_auto
0,0,1,1,2,0003-MKNFE,Male,0,No,No,9,...,542.4,No,Mailed check,DSL,Month-to-month,0,1,0,0,0
1,1,1,1,4,0013-MHZWF,Female,0,No,Yes,9,...,571.45,No,Credit card (automatic),DSL,Month-to-month,0,0,0,1,1
2,2,1,1,1,0015-UOCOJ,Female,1,No,No,7,...,340.35,No,Electronic check,DSL,Month-to-month,1,0,0,0,0
3,3,1,1,1,0023-HGHWL,Male,1,No,No,1,...,25.1,Yes,Electronic check,DSL,Month-to-month,1,0,0,0,0
4,4,1,1,3,0032-PGELS,Female,0,Yes,Yes,1,...,30.5,Yes,Bank transfer (automatic),DSL,Month-to-month,0,0,1,0,1


In [30]:
train = prepare.payment_type(train)
valid = prepare.payment_type(valid)
test = prepare.payment_type(test)

### Scale data

> Columns to be scaled: tenure, monthly_charges, total_charges

In [31]:
train_scaled = train[['tenure', 'monthly_charges', 'total_charges']]
valid_scaled = valid[['tenure', 'monthly_charges', 'total_charges']]
test_scaled = test[['tenure', 'monthly_charges', 'total_charges']]

In [32]:
from sklearn import preprocessing

def uniform_scaler(train, valid, test):
    '''
    Uses the train, valid & test datasets created by the split_my_data function
    First, make new dfs including only those columns you want scaled, else this function will scale every numerical value.

    This is a non-linear transformer, and it smooths out unusual distributions.
    It spreads out the most frequent values and reduces the impact of (marginal) outliers, therefore it is robust.
    It distorts correlations and distances within and across features.

    '''
    unf_scaler = preprocessing.QuantileTransformer(n_quantiles=100, output_distribution='normal', random_state=123, copy=True).fit(train)
    train = pd.DataFrame(unf_scaler.transform(train), columns=train.columns.values).set_index([train.index.values])
    valid = pd.DataFrame(unf_scaler.transform(valid), columns=valid.columns.values).set_index([valid.index.values])
    test = pd.DataFrame(unf_scaler.transform(test), columns=test.columns.values).set_index([test.index.values])
    return unf_scaler, train, valid, test

In [33]:
unf_scaler, train_scaled, valid_scaled, test_scaled = prepare.uniform_scaler(train_scaled, valid_scaled, test_scaled)
train_scaled.head()

Unnamed: 0,tenure,monthly_charges,total_charges
5691,0.559592,-0.270565,0.485586
6232,1.073988,0.971152,1.453931
5807,0.967422,-0.036837,0.819781
2746,-1.22064,0.787436,-1.011789
3143,-0.216904,0.601211,0.19124


In [34]:
train['tenure_nml'] = train_scaled.tenure
train['monthly_charges_nml'] = train_scaled.monthly_charges
train['total_charges_nml'] = train_scaled.total_charges

valid['tenure_nml'] = valid_scaled.tenure
valid['monthly_charges_nml'] = valid_scaled.monthly_charges
valid['total_charges_nml'] = valid_scaled.total_charges

test['tenure_nml'] = test_scaled.tenure
test['monthly_charges_nml'] = test_scaled.monthly_charges
test['total_charges_nml'] = test_scaled.total_charges

### Create new features

#### Tenure converted to years

In [35]:
def months_to_years(df, col):
    df[f'{col}_months'] = (df[col] / 12).round(2)
    return df

In [36]:
train = prepare.months_to_years(train, 'tenure')
valid = prepare.months_to_years(valid, 'tenure')
test = prepare.months_to_years(test, 'tenure')

#### Single variable representing the information from phone_service and multiple_lines

In [37]:
telco.phone_service.value_counts(), telco.multiple_lines.value_counts()

(Yes    6361
 No      682
 Name: phone_service, dtype: int64,
 No                  3390
 Yes                 2971
 No phone service     682
 Name: multiple_lines, dtype: int64)

In [38]:
# If they have multiple lines, they get a 2, if they only have phone, they get a 1, else 0

def extra_lines(df):
    df['extra_lines'] = np.where(df['multiple_lines'] == 'Yes', '2',
                                np.where(df['multiple_lines'] == 'No', '1', '0'))
    return df

In [39]:
telco_test = extra_lines(telco)
telco_test.extra_lines.value_counts()

1    3390
2    2971
0     682
Name: extra_lines, dtype: int64

In [40]:
train = prepare.extra_lines(train)
valid = prepare.extra_lines(valid)
test = prepare.extra_lines(test)

#### Single variable representing info from dependents and partner

In [41]:
telco.dependents.value_counts(), telco.partner.value_counts()

(No     4933
 Yes    2110
 Name: dependents, dtype: int64,
 No     3641
 Yes    3402
 Name: partner, dtype: int64)

In [42]:
# If they have neither partner or dependents, they get 0
# If they have partner but no dependents, they get 1
# If they have partner and dependents, they get 2
# If they have only dependents, they get a 3 
## --> basically, the higher the score the harder it is for them to support their family

def family_support(df):
    df['family_support'] = np.where( (df['partner'] == 'No') & (df['dependents'] == 'Yes'), 3,
                                    np.where( (df['partner'] == 'Yes') & (df['dependents'] == 'Yes'), 2,
                                             np.where( (df['partner'] == 'Yes') & (df['dependents'] == 'No'), 1, 0)))
    return df

In [43]:
telco_test = family_support(telco)
telco_test.family_support.value_counts()

0    3280
2    1749
1    1653
3     361
Name: family_support, dtype: int64

In [44]:
train = prepare.family_support(train)
valid = prepare.family_support(valid)
test = prepare.family_support(test)

#### other ways to merge variables, such as streaming_tv & streaming_movies, online_security & online_backup

> Add new column indicating whether or not they have internet service, to cover the third option I removed

In [45]:
telco.internet_service_type.value_counts(), telco.internet_service_type_id.value_counts()

(Fiber optic    3096
 DSL            2421
 None           1526
 Name: internet_service_type, dtype: int64,
 2    3096
 1    2421
 3    1526
 Name: internet_service_type_id, dtype: int64)

In [46]:
def has_internet(df):
    df['has_internet'] = np.where(df.internet_service_type_id == 3, 0, 1)
    return df

In [47]:
telco_test = has_internet(telco)
telco_test.has_internet.value_counts()

1    5517
0    1526
Name: has_internet, dtype: int64

In [48]:
train = prepare.has_internet(train)
valid = prepare.has_internet(valid)
test = prepare.has_internet(test)

> Add new column indicating how many internet services the customer has

In [49]:
# There are 6 service options
# If they have no internet, they get 0
# If they have internet but no extra services, they get 1
# If they have x extra services, they get 2 - 7

def internet_services(df):
    df['internet_services'] = (df.has_internet + 
                               df.online_security_enc + 
                               df.online_backup_enc + 
                               df.tech_support_enc + 
                               df.streaming_tv_enc + 
                               df.streaming_movies_enc + 
                               df.device_protection_enc)
    return df

In [50]:
telco_test = prepare.more_than_two_labels(telco, 'online_security')
telco_test = prepare.more_than_two_labels(telco, 'online_backup')
telco_test = prepare.more_than_two_labels(telco, 'tech_support')
telco_test = prepare.more_than_two_labels(telco, 'streaming_tv')
telco_test = prepare.more_than_two_labels(telco, 'streaming_movies')
telco_test = prepare.more_than_two_labels(telco, 'device_protection')

telco_test = internet_services(telco)
telco_test.internet_services.value_counts()

0    1526
4    1118
3    1033
2     966
5     852
1     693
6     571
7     284
Name: internet_services, dtype: int64

In [51]:
train = prepare.internet_services(train)
valid = prepare.internet_services(valid)
test = prepare.internet_services(test)

> Create final dataframes containing only those columns which are relevant to our work

In [52]:
with pd.option_context('display.max_columns', None):
    display(train)

Unnamed: 0.1,Unnamed: 0,contract_type_id,internet_service_type_id,payment_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,payment_type,internet_service_type,contract_type,gender_enc,partner_enc,dependents_enc,phone_service_enc,paperless_billing_enc,multiple_lines_enc,online_security_enc,online_backup_enc,device_protection_enc,tech_support_enc,streaming_tv_enc,streaming_movies_enc,pay_elec_check,pay_mail,pay_bank,pay_cc,pay_auto,tenure_nml,monthly_charges_nml,total_charges_nml,tenure_months,extra_lines,family_support,has_internet,internet_services
5691,5691,3,1,3,5762-TJXGK,Female,0,No,No,52,No,No phone service,Yes,Yes,Yes,No,Yes,Yes,Yes,58.75,3038.55,No,Bank transfer (automatic),DSL,Two year,0,0,0,0,1,0,1,1,1,0,1,1,0,0,1,0,1,0.559592,-0.270565,0.485586,4.33,0,0,1,6
6232,6232,3,2,2,6242-FEGFD,Male,0,Yes,No,66,Yes,Yes,Yes,Yes,Yes,Yes,No,No,No,96.60,6424.25,No,Mailed check,Fiber optic,Two year,1,1,0,1,0,1,1,1,1,1,0,0,0,1,0,0,0,1.073988,0.971152,1.453931,5.50,2,1,1,5
5807,5807,3,1,4,7359-SSBJK,Female,1,No,No,64,Yes,No,Yes,Yes,No,Yes,Yes,No,Yes,70.20,4481.00,Yes,Credit card (automatic),DSL,Two year,0,0,0,1,1,0,1,1,0,1,1,0,0,0,0,1,1,0.967422,-0.036837,0.819781,5.33,1,0,1,5
2746,2746,1,2,1,7024-OHCCK,Female,1,No,No,2,Yes,Yes,No,No,No,No,Yes,Yes,Yes,93.85,170.85,Yes,Electronic check,Fiber optic,Month-to-month,0,0,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,-1.220640,0.787436,-1.011789,0.17,2,0,1,3
3143,3143,1,2,1,9054-FOWNV,Male,0,Yes,Yes,22,Yes,Yes,No,No,Yes,No,Yes,No,Yes,88.75,1885.15,No,Electronic check,Fiber optic,Month-to-month,1,1,1,1,1,1,0,0,1,0,1,0,1,0,0,0,0,-0.216904,0.601211,0.191240,1.83,2,2,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4056,4056,2,1,3,3229-USWAR,Female,0,No,No,34,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,79.95,2727.30,No,Bank transfer (automatic),DSL,One year,0,0,0,1,1,0,1,0,1,1,1,1,0,0,1,0,1,0.114185,0.270877,0.403571,2.83,1,0,1,6
1026,1026,1,1,2,8260-NGFNY,Female,0,No,No,1,No,No phone service,No,No,No,No,No,No,No,25.20,25.20,Yes,Mailed check,DSL,Month-to-month,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,-1.639976,-0.790280,-1.827977,0.08,0,0,1,1
2386,2386,1,2,4,5334-JLAXU,Female,0,Yes,No,60,Yes,Yes,Yes,Yes,No,No,No,Yes,Yes,94.10,5475.90,No,Credit card (automatic),Fiber optic,Month-to-month,0,1,0,1,1,1,1,1,0,0,0,1,0,0,0,1,1,0.799083,0.799083,1.070816,5.00,2,1,1,4
1776,1776,1,2,2,2674-MIAHT,Female,0,No,No,4,Yes,Yes,No,Yes,No,No,No,No,Yes,80.30,324.20,No,Mailed check,Fiber optic,Month-to-month,0,0,0,1,1,1,0,1,0,0,0,0,0,1,0,0,0,-0.967422,0.289947,-0.775553,0.33,2,0,1,2


In [53]:
train = train[['customer_id', 'churn', 'tenure', 'tenure_months', 'tenure_nml',
               'monthly_charges', 'monthly_charges_nml', 'total_charges', 'total_charges_nml',
               'senior_citizen', 'gender_enc', 'family_support', 'phone_service_enc',
               'contract_type_id', 'internet_service_type_id',
               'extra_lines', 'internet_services', 'has_internet',
               'online_security_enc', 'online_backup_enc', 'device_protection_enc', 
               'tech_support_enc', 'streaming_tv_enc', 'streaming_movies_enc',
               'paperless_billing_enc', 'pay_elec_check', 'pay_mail', 'pay_bank', 'pay_cc', 'pay_auto']]

valid = valid[['customer_id', 'churn', 'tenure', 'tenure_months', 'tenure_nml',
               'monthly_charges', 'monthly_charges_nml', 'total_charges', 'total_charges_nml',
               'senior_citizen', 'gender_enc', 'family_support', 'phone_service_enc',
               'contract_type_id', 'internet_service_type_id',
               'extra_lines', 'internet_services', 'has_internet',
               'online_security_enc', 'online_backup_enc', 'device_protection_enc', 
               'tech_support_enc', 'streaming_tv_enc', 'streaming_movies_enc',
               'paperless_billing_enc', 'pay_elec_check', 'pay_mail', 'pay_bank', 'pay_cc', 'pay_auto']]

test = test[['customer_id', 'churn', 'tenure', 'tenure_months', 'tenure_nml',
               'monthly_charges', 'monthly_charges_nml', 'total_charges', 'total_charges_nml',
               'senior_citizen', 'gender_enc', 'family_support', 'phone_service_enc',
               'contract_type_id', 'internet_service_type_id',
               'extra_lines', 'internet_services', 'has_internet',
               'online_security_enc', 'online_backup_enc', 'device_protection_enc', 
               'tech_support_enc', 'streaming_tv_enc', 'streaming_movies_enc',
               'paperless_billing_enc', 'pay_elec_check', 'pay_mail', 'pay_bank', 'pay_cc', 'pay_auto']]

In [54]:
with pd.option_context('display.max_columns', None):
    display(train.head())

Unnamed: 0,customer_id,churn,tenure,tenure_months,tenure_nml,monthly_charges,monthly_charges_nml,total_charges,total_charges_nml,senior_citizen,gender_enc,family_support,phone_service_enc,contract_type_id,internet_service_type_id,extra_lines,internet_services,has_internet,online_security_enc,online_backup_enc,device_protection_enc,tech_support_enc,streaming_tv_enc,streaming_movies_enc,paperless_billing_enc,pay_elec_check,pay_mail,pay_bank,pay_cc,pay_auto
5691,5762-TJXGK,No,52,4.33,0.559592,58.75,-0.270565,3038.55,0.485586,0,0,0,0,3,1,0,6,1,1,1,1,0,1,1,1,0,0,1,0,1
6232,6242-FEGFD,No,66,5.5,1.073988,96.6,0.971152,6424.25,1.453931,0,1,1,1,3,2,2,5,1,1,1,1,1,0,0,0,0,1,0,0,0
5807,7359-SSBJK,Yes,64,5.33,0.967422,70.2,-0.036837,4481.0,0.819781,1,0,0,1,3,1,1,5,1,1,1,0,1,1,0,1,0,0,0,1,1
2746,7024-OHCCK,Yes,2,0.17,-1.22064,93.85,0.787436,170.85,-1.011789,1,0,0,1,1,2,2,3,1,0,0,0,0,1,1,1,1,0,0,0,0
3143,9054-FOWNV,No,22,1.83,-0.216904,88.75,0.601211,1885.15,0.19124,0,1,2,1,1,2,2,3,1,0,0,1,0,1,0,1,1,0,0,0,0


In [55]:
train.isna().sum()

customer_id                 0
churn                       0
tenure                      0
tenure_months               0
tenure_nml                  0
monthly_charges             0
monthly_charges_nml         0
total_charges               0
total_charges_nml           0
senior_citizen              0
gender_enc                  0
family_support              0
phone_service_enc           0
contract_type_id            0
internet_service_type_id    0
extra_lines                 0
internet_services           0
has_internet                0
online_security_enc         0
online_backup_enc           0
device_protection_enc       0
tech_support_enc            0
streaming_tv_enc            0
streaming_movies_enc        0
paperless_billing_enc       0
pay_elec_check              0
pay_mail                    0
pay_bank                    0
pay_cc                      0
pay_auto                    0
dtype: int64

In [56]:
# Save all 3 dfs into csv files for further processing
train.to_csv('train.csv')
valid.to_csv('valid.csv')
test.to_csv('test.csv')