In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# import data
telco = pd.read_csv('../data/Churn.csv')

df = telco.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Account_Length  3333 non-null   int64  
 1   Vmail_Message   3333 non-null   int64  
 2   Day_Mins        3333 non-null   float64
 3   Eve_Mins        3333 non-null   float64
 4   Night_Mins      3333 non-null   float64
 5   Intl_Mins       3333 non-null   float64
 6   CustServ_Calls  3333 non-null   int64  
 7   Churn           3333 non-null   object 
 8   Intl_Plan       3333 non-null   object 
 9   Vmail_Plan      3333 non-null   object 
 10  Day_Calls       3333 non-null   int64  
 11  Day_Charge      3333 non-null   float64
 12  Eve_Calls       3333 non-null   int64  
 13  Eve_Charge      3333 non-null   float64
 14  Night_Calls     3333 non-null   int64  
 15  Night_Charge    3333 non-null   float64
 16  Intl_Calls      3333 non-null   int64  
 17  Intl_Charge     3333 non-null   f

In [7]:
telco['Vmail_Plan'].value_counts()

Vmail_Plan
no     2411
yes     922
Name: count, dtype: int64

In [10]:
#Replace 'no' with 0 and 'yes' with 1 in the 'Vmail_Plan' and Ã‡hurn column of telco
# Replace 'no' with 0 and 'yes' with 1 in 'Vmail_Plan'
replace_dict = {'no':0 , 'yes':1}
telco['Vmail_Plan'] = telco['Vmail_Plan'].replace(replace_dict)

# Replace 'no' with 0 and 'yes' with 1 in 'Churn'
telco['Churn'] = telco['Churn'].replace(replace_dict)

# Print the results to verify
print(telco['Vmail_Plan'].head())
print(telco['Churn'].head())

0    1
1    1
2    0
3    0
4    0
Name: Vmail_Plan, dtype: int64
0    0
1    0
2    0
3    0
4    0
Name: Churn, dtype: int64


## Hot Encoding

In [12]:
# Use the pd.get_dummies() function to apply one hot encoding on the 'State' feature of telco.

# Perform one hot encoding on 'State'
telco_state = pd.get_dummies(telco['State'], dtype=int)

# Print the head of telco_state
print(telco_state)

      AK  AL  AR  AZ  CA  CO  CT  DC  DE  FL  ...  SD  TN  TX  UT  VA  VT  WA  \
0      0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
1      0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
2      0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
3      0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
4      0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
...   ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..  ..  ..   
3328   0   0   0   1   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
3329   0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
3330   0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
3331   0   0   0   0   0   0   1   0   0   0  ...   0   0   0   0   0   0   0   
3332   0   0   0   0   0   0   0   0   0   0  ...   0   1   0   0   0   0   0   

      WI  WV  WY  
0      0

## Feature Scaling

In [18]:
df = telco[['Intl_Calls' ,'Night_Mins']]

# Scale telco using StandardScaler
telco_scaled = StandardScaler().fit_transform(df)

# Add column names back for readability
telco_scaled_df = pd.DataFrame(telco_scaled, columns=["Intl_Calls", "Night_Mins"])

# Print summary statistics
print(telco_scaled_df.describe())



         Intl_Calls    Night_Mins
count  3.333000e+03  3.333000e+03
mean  -8.527366e-18  7.887813e-17
std    1.000150e+00  1.000150e+00
min   -1.820289e+00 -3.513648e+00
25%   -6.011951e-01 -6.698545e-01
50%   -1.948306e-01  6.485803e-03
75%    6.178983e-01  6.808485e-01
max    6.307001e+00  3.839081e+00


## Feature Selection and Engineering

In [3]:
# Drop the unnecessary features
telco = telco.drop(telco[['Area_Code','Phone']], axis=1)

# Verify dropped features
print(telco.columns)

Index(['Account_Length', 'Vmail_Message', 'Day_Mins', 'Eve_Mins', 'Night_Mins',
       'Intl_Mins', 'CustServ_Calls', 'Churn', 'Intl_Plan', 'Vmail_Plan',
       'Day_Calls', 'Day_Charge', 'Eve_Calls', 'Eve_Charge', 'Night_Calls',
       'Night_Charge', 'Intl_Calls', 'Intl_Charge', 'State'],
      dtype='object')


In [4]:
#Create a new feature - 'Avg_Night_Calls' - that is the result of dividing 'Night_Mins by 'Night_Calls'.
telco['Avg_Night_Calls'] = telco['Night_Mins']/telco['Night_Calls']
#Print the first five rows of this new feature.
print(telco['Avg_Night_Calls'].head())

0    2.689011
1    2.469903
2    1.563462
3    2.212360
4    1.544628
Name: Avg_Night_Calls, dtype: float64
