In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Location of the raw Telco customer-churn CSV, relative to the notebook's
# working directory.
file_path = '../data/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Data loading
# Fail fast when the file is missing: only printing a message would leave `df`
# undefined, and every later cell would then die with an unrelated NameError.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Error: No file at {file_path}")

print("The file has been found")
df = pd.read_csv(file_path)

# Summary of column names, non-null counts and dtypes of the loaded frame.
df.info()

The file has been found
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  

In [None]:
# Step 1: parse 'TotalCharges' as numbers. With errors='coerce', any entry that
# cannot be parsed (for example a blank string) becomes NaN instead of raising.
charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = charges_numeric

# Step 2: report how many values the conversion turned into NaN.
missing_count = charges_numeric.isna().sum()
print(f"Hidden missing values (spaces) converted to NaN: {missing_count} ")

# Step 3: replace the NaN entries with 0.
# NOTE(review): the rationale is that these rows are presumably brand-new
# customers (tenure=0) who have not been billed yet - confirm against the data.
df['TotalCharges'] = charges_numeric.fillna(0)

# Step 4: print the frame summary again to confirm the new dtype and null count.
print("After fix: ")
df.info()

Hidden missing values (spaces) converted to NaN: 11 
After fix
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non

In [3]:
#1. Map churn to binary values.
# Yes - 1 (positive class - customer left)
# No - 0 (negative class - customer stayed)
churn_mapping = {'Yes': 1, 'No': 0}

# Encode only while the column still holds the raw labels. Series.map turns
# every value that is not a key of the mapping into NaN, so re-running this
# cell on an already encoded column would silently wipe out the target.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    churn_encoded = df['Churn'].map(churn_mapping)
    unmapped = churn_encoded.isna()
    if unmapped.any():
        # Surface labels outside the mapping instead of letting them become NaN.
        unexpected = sorted(df.loc[unmapped, 'Churn'].astype(str).unique())
        raise ValueError(f"Unexpected 'Churn' labels: {unexpected}")
    df['Churn'] = churn_encoded.astype('int64')

#2. Check the distribution of the target variable
print("Churn distribution (0 = No, 1 = Yes)")
print(df["Churn"].value_counts())


Churn distribution (0 = No, 1 = Yes)
Churn
0    5174
1    1869
Name: count, dtype: int64


In [4]:
#1. Drop 'customerID': it is a row identifier (assumed unique per row), so it
#   carries no predictive power. errors="ignore" keeps the cell re-runnable:
#   dropping a column that is already gone is a no-op instead of a KeyError.
df = df.drop(columns=["customerID"], errors="ignore")

#2. One-hot encoding for the remaining categorical variables, 'InternetService'
#   for example. drop_first=True omits the first level of each encoded column.
df_processed = pd.get_dummies(df, drop_first=True)

#3. Check if everything is numeric.
#   DataFrame.info() prints its report itself and returns None, so it must not
#   be wrapped in print() (that would emit a stray "None" line).
print("Final Data Types:")
df_processed.info()

#4. Save the processed data to a CSV file. The path is kept in one variable so
#   the confirmation message always reports where the file was really written.
output_path = '../data/processed_churn_data.csv'
df_processed.to_csv(output_path, index=False)

print(f"Data processed and saved to '{output_path}'")


Final Data Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   tenure                                 7043 non-null   int64  
 2   MonthlyCharges                         7043 non-null   float64
 3   TotalCharges                           7043 non-null   float64
 4   Churn                                  7043 non-null   int64  
 5   gender_Male                            7043 non-null   bool   
 6   Partner_Yes                            7043 non-null   bool   
 7   Dependents_Yes                         7043 non-null   bool   
 8   PhoneService_Yes                       7043 non-null   bool   
 9   MultipleLines_No phone service         7043 non-null   bool   
 10  MultipleLines_Yes                      7043 non-null  