In [5]:
# notebooks/01_eda.ipynb

# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Configure display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)
warnings.filterwarnings('ignore')

# Set plot style: dark-grid matplotlib theme with seaborn's "husl" palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# For reproducibility: seed NumPy's global random generator
np.random.seed(42)

# Confirm the setup ran and record the library versions used for this run
print(
    "Libraries imported successfully!",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

Libraries imported successfully!
Pandas version: 2.0.3
NumPy version: 1.24.3


In [6]:
# Load the dataset
data_path = '../data/raw/telco_churn.csv'
df = pd.read_csv(data_path)

# Build the load report line by line, then emit it with a single print.
banner_rule = "=" * 70
load_report = [
    banner_rule,
    "TELCO CUSTOMER CHURN DATASET - INITIAL LOAD",
    banner_rule,
    f"\nDataset loaded successfully from: {data_path}",
    f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"\nDataset Shape: {df.shape[0]:,} rows × {df.shape[1]} columns",
    banner_rule,
]
print("\n".join(load_report))

TELCO CUSTOMER CHURN DATASET - INITIAL LOAD

Dataset loaded successfully from: ../data/raw/telco_churn.csv
Timestamp: 2026-02-15 17:26:08

Dataset Shape: 7,043 rows × 21 columns


In [7]:
# Display basic dataset information
overview_rule = "=" * 70

# Section 1: column names, non-null counts and dtypes (df.info prints directly)
print("\n" + overview_rule, "DATASET OVERVIEW", overview_rule, sep="\n")

df.info()

# Section 2: total footprint in MiB; deep=True asks pandas to include the
# contents of object columns in the per-column byte counts
total_bytes = df.memory_usage(deep=True).sum()
print("\n" + overview_rule, "MEMORY USAGE", overview_rule, sep="\n")
print(f"Total memory: {total_bytes / 1024**2:.2f} MB")


DATASET OVERVIEW
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 n

In [8]:
# Display first few rows, last few rows, and a fixed random sample
preview_rule = "=" * 70

# Each entry pairs a banner title with the slice of the frame to show under it.
# The selectors are evaluated lazily so every banner prints before its slice
# is computed, exactly as in a straight-line version of this cell.
row_previews = (
    ("FIRST 5 ROWS", lambda frame: frame.head()),
    ("LAST 5 ROWS", lambda frame: frame.tail()),
    ("RANDOM SAMPLE (5 ROWS)", lambda frame: frame.sample(5, random_state=42)),
)

for preview_title, select_rows in row_previews:
    print("\n" + preview_rule)
    print(preview_title)
    print(preview_rule)
    display(select_rows(df))


FIRST 5 ROWS


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes



LAST 5 ROWS


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,No,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No



RANDOM SAMPLE (5 ROWS)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
185,1024-GUALD,Female,0,Yes,No,1,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,24.8,24.8,Yes
2715,0484-JPBRU,Male,0,No,No,41,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Bank transfer (automatic),25.25,996.45,No
3825,3620-EHIMZ,Female,0,Yes,Yes,52,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.35,1031.7,No
1807,6910-HADCM,Female,0,No,No,1,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Electronic check,76.35,76.35,Yes
132,8587-XYZSF,Male,0,No,No,67,Yes,No,DSL,No,No,No,Yes,No,No,Two year,No,Bank transfer (automatic),50.55,3260.1,No


In [9]:
# Analyze columns: one report line per column with dtype, cardinality and nulls
heavy_rule = "=" * 70
light_rule = "-" * 70
row_total = len(df)  # loop-invariant denominator for the null percentage

print("\n" + heavy_rule, "COLUMN ANALYSIS", heavy_rule, sep="\n")

print("\nTotal Columns:", df.shape[1])
print("\nColumn Names and Types:")
print(light_rule)

for position, column_name in enumerate(df.columns, start=1):
    column_values = df[column_name]
    distinct_values = column_values.nunique()
    missing_values = column_values.isnull().sum()
    missing_share = (missing_values / row_total) * 100

    # Fixed-width fields keep the report aligned across columns.
    report_fields = [
        f"{position:2d}. {column_name:20s}",
        f"Type: {str(column_values.dtype):10s}",
        f"Unique: {distinct_values:4d}",
        f"Nulls: {missing_values:4d} ({missing_share:5.2f}%)",
    ]
    print(" | ".join(report_fields))


COLUMN ANALYSIS

Total Columns: 21

Column Names and Types:
----------------------------------------------------------------------
 1. customerID           | Type: object     | Unique: 7043 | Nulls:    0 ( 0.00%)
 2. gender               | Type: object     | Unique:    2 | Nulls:    0 ( 0.00%)
 3. SeniorCitizen        | Type: int64      | Unique:    2 | Nulls:    0 ( 0.00%)
 4. Partner              | Type: object     | Unique:    2 | Nulls:    0 ( 0.00%)
 5. Dependents           | Type: object     | Unique:    2 | Nulls:    0 ( 0.00%)
 6. tenure               | Type: int64      | Unique:   73 | Nulls:    0 ( 0.00%)
 7. PhoneService         | Type: object     | Unique:    2 | Nulls:    0 ( 0.00%)
 8. MultipleLines        | Type: object     | Unique:    3 | Nulls:    0 ( 0.00%)
 9. InternetService      | Type: object     | Unique:    3 | Nulls:    0 ( 0.00%)
10. OnlineSecurity       | Type: object     | Unique:    3 | Nulls:    0 ( 0.00%)
11. OnlineBackup         | Type: object     | Un