In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as pplt

In [2]:
telco = pd.read_csv('telco.csv')
telco

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [3]:
telco.shape

(7043, 21)

In [4]:
telco.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [8]:
telco['TotalCharges'].dtype

dtype('O')

In [10]:
telco['TotalCharges'].astype(float)

ValueError: could not convert string to float: ' '

In [11]:
telco['TotalCharges'] = pd.to_numeric(telco['TotalCharges'], errors='coerce')

In [14]:
telco['TotalCharges'] = telco['TotalCharges'].astype(float)
telco['TotalCharges']


0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: TotalCharges, Length: 7043, dtype: float64

In [16]:
telco['TotalCharges'].isna().sum()

11

In [17]:
telco['TotalCharges'].fillna(0)

0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: TotalCharges, Length: 7043, dtype: float64

In [18]:
telco['Churn']

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object

In [19]:
telco['Churn'] = telco['Churn'].map({'Yes':1, 'No':0})

In [20]:
telco['Churn']

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X = telco.drop('Churn', axis=1)
Y = telco['Churn']

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size=0.2)

In [24]:
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']

In [25]:
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [26]:
from sklearn.preprocessing import StandardScaler

In [28]:
telco_categorical = telco[categorical]
telco_numerical = telco[numerical]

In [29]:
scaler = StandardScaler()

In [30]:
telco_numerical_scaled = scaler.fit_transform(telco_numerical)

In [31]:
telco_numerical_scaled

array([[-1.27744458, -1.16032292, -0.99419409],
       [ 0.06632742, -0.25962894, -0.17373982],
       [-1.23672422, -0.36266036, -0.95964911],
       ...,
       [-0.87024095, -1.1686319 , -0.85451414],
       [-1.15528349,  0.32033821, -0.87209546],
       [ 1.36937906,  1.35896134,  2.01234407]])

In [36]:
df_numerical = pd.DataFrame(telco_numerical_scaled, columns=numerical)
df_numerical

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.277445,-1.160323,-0.994194
1,0.066327,-0.259629,-0.173740
2,-1.236724,-0.362660,-0.959649
3,0.514251,-0.746535,-0.195248
4,-1.236724,0.197365,-0.940457
...,...,...,...
7038,-0.340876,0.665992,-0.129180
7039,1.613701,1.277533,2.241056
7040,-0.870241,-1.168632,-0.854514
7041,-1.155283,0.320338,-0.872095


In [38]:
teleco_scaled = pd.concat([telco_categorical, df_numerical ], axis=1)

In [39]:
from sklearn.preprocessing import OneHotEncoder

In [42]:
encoder = OneHotEncoder(sparse_output=False)


In [43]:
telco_categorical_encoded = encoder.fit_transform(telco_categorical)

In [45]:
column_names = encoder.get_feature_names_out(input_features=categorical)

In [47]:
encoded_df = pd.DataFrame(telco_categorical_encoded, columns=column_names)




In [48]:
X_train_combined = pd.concat([encoded_df, df_numerical], axis=1)
X_test_combined = pd.concat([encoded_df, df_numerical], axis=1)

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [52]:
y_pred = RandomForestClassifier.predict('X_test_combined')

TypeError: ForestClassifier.predict() missing 1 required positional argument: 'X'

In [56]:
rf_classifier = RandomForestClassifier(random_state=1)
rf_classifier.fit(X_train, Y_train)
rf_pred = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(Y_test, rf_pred)

ValueError: could not convert string to float: '7765-LWVVH'

In [55]:
y_pred.fit()

TypeError: BaseForest.fit() missing 2 required positional arguments: 'X' and 'y'