# Exploratory Data Analysis (EDA)
## Telecom Customer Churn Dataset

This notebook performs comprehensive exploratory data analysis on the Telco Customer Churn dataset.

**Objectives:**
- Load and understand the dataset structure
- Identify missing values and data quality issues
- Analyze the distribution of features
- Explore relationships between features and churn
- Generate visualizations for business insights

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning category for the remainder of the session
warnings.simplefilter('ignore')

# Plot appearance: seaborn grid theme, then default figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

print("✅ Libraries imported successfully!")

## 1. Load Dataset

In [None]:
# Read the raw churn CSV; the path is resolved against the current working directory
raw_csv_path = '../data/raw/telco_churn.csv'
df = pd.read_csv(raw_csv_path)

# Report the (rows, columns) shape, then show the first rows as the cell output
print(f"Dataset Shape: {df.shape}")
print("\nFirst few rows:")
df.head()

## 2. Dataset Overview

In [None]:
# Headline facts about the loaded frame: its size, then the dtype of each column
banner = "=" * 50
n_rows, n_cols = df.shape

print(banner)
print("DATASET INFORMATION")
print(banner)
print(f"\nTotal Rows: {n_rows}")
print(f"Total Columns: {n_cols}")
print("\nColumn Names and Types:")
print(df.dtypes)

In [None]:
# Check for missing values
# Two kinds of "missing" are counted per column:
#   * real nulls (NaN / None), as reported by isnull()
#   * text cells that are empty or whitespace-only (e.g. ' '); such strings are
#     not null, so isnull() alone would report the column as complete
print("\n" + "="*50)
print("MISSING VALUES")
print("="*50)


def _is_blank(value):
    """Return True when value is a string that is empty or only whitespace."""
    return isinstance(value, str) and value.strip() == ''


null_counts = df.isnull().sum()

# Element-wise scan of every column; df itself is not modified
blank_counts = pd.Series(
    {col: int(df[col].map(_is_blank).sum()) for col in df.columns},
    dtype='int64',
)

missing = null_counts + blank_counts
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
})

# Only show the table when there is something in it
affected = missing_df[missing_df['Missing Count'] > 0]
if affected.empty:
    print("\n✅ No missing values found!")
else:
    print(affected)
    blank_total = int(blank_counts.sum())
    if blank_total:
        print(f"\n⚠️ {blank_total} of these are blank strings rather than NaN")

In [None]:
# Count rows that exactly repeat an earlier row across all columns
duplicates = df.duplicated().sum()
print(f"\nDuplicate Rows: {duplicates}")

# One status line: warn when at least one repeat exists, otherwise confirm
status = "⚠️ Duplicates found!" if duplicates > 0 else "✅ No duplicates found!"
print(status)

In [None]:
# Section banner, then describe() with default arguments as the cell output
section_rule = "=" * 50
print("\n" + section_rule)
print("STATISTICAL SUMMARY")
print(section_rule)
df.describe()

## 3. Target Variable Analysis

In [None]:
# Churn distribution
# Colours are keyed by label, so 'Yes' is always red and 'No' always green
# no matter which class happens to be more frequent.
churn_palette = {'No': '#2ca02c', 'Yes': '#d62728'}
fallback_color = '#7f7f7f'  # used for any label other than 'Yes' / 'No'

print("="*50)
print("CHURN DISTRIBUTION")
print("="*50)
churn_counts = df['Churn'].value_counts()
churn_pct = df['Churn'].value_counts(normalize=True) * 100

print("\nChurn Counts:")
print(churn_counts)
print("\nChurn Percentage:")
print(churn_pct)

# Visualize
churn_labels = [str(label) for label in churn_counts.index]
churn_colors = [churn_palette.get(label, fallback_color) for label in churn_labels]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Pie chart: share of each class
axes[0].pie(churn_counts.values, labels=churn_labels, autopct='%1.1f%%',
            colors=churn_colors, startangle=90)
axes[0].set_title('Churn Distribution', fontsize=14, fontweight='bold')

# Bar chart: absolute count of each class. Plain matplotlib bars are used so
# each bar gets an explicit colour without seaborn's palette/hue handling.
axes[1].bar(churn_labels, churn_counts.values, color=churn_colors)
axes[1].set_xlabel('Churn', fontsize=12)
axes[1].set_ylabel('Count', fontsize=12)
axes[1].set_title('Churn Count', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# .get avoids a KeyError when no row is labelled 'Yes'
print(f"\n📊 Churn Rate: {churn_pct.get('Yes', 0.0):.2f}%")

## 4. Numerical Features Analysis

In [None]:
# Columns stored as int64 or float64 are the ones listed as numerical here
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
numerical_features = list(numeric_frame.columns)

print(f"Numerical Features ({len(numerical_features)}):")
print(numerical_features)

In [None]:
# One histogram per numeric measure, each with a dashed red line at its mean
hist_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, feature in zip(axes, hist_columns):
    if feature not in df.columns:
        continue  # this panel stays empty when the column is absent

    # Coerce to numbers; entries that cannot be parsed become NaN and are dropped
    values = pd.to_numeric(df[feature], errors='coerce').dropna()
    mean_value = values.mean()

    ax.hist(values, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
    ax.axvline(mean_value, color='red', linestyle='--', linewidth=2,
               label=f'Mean: {mean_value:.2f}')
    ax.set_title(f'Distribution of {feature}', fontsize=13, fontweight='bold')
    ax.set_xlabel(feature, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Numerical features by churn
# Box colours are keyed by churn label and the boxes are drawn in sorted label
# order, so every panel uses the same colour and position for a given label
# regardless of the order in which labels first appear in the data.
churn_palette = {'No': '#2ca02c', 'Yes': '#d62728'}
fallback_color = '#7f7f7f'  # used for any label other than 'Yes' / 'No'

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, feature in enumerate(['tenure', 'MonthlyCharges', 'TotalCharges']):
    if feature not in df.columns:
        continue  # this panel stays empty when the column is absent

    # Coerce to numbers; rows whose value cannot be parsed (or whose churn
    # label is missing) are dropped for this panel only
    data = df[[feature, 'Churn']].copy()
    data[feature] = pd.to_numeric(data[feature], errors='coerce')
    data = data.dropna()

    axes[idx].set_title(f'{feature} by Churn', fontsize=13, fontweight='bold')
    if data.empty:
        continue  # nothing to draw

    labels = sorted(data['Churn'].unique(), key=str)
    palette = {label: churn_palette.get(label, fallback_color) for label in labels}

    # hue is set to the x variable because newer seaborn releases deprecate
    # passing palette without hue; dodge=False keeps one full-width box per label
    sns.boxplot(x='Churn', y=feature, hue='Churn', data=data, ax=axes[idx],
                order=labels, hue_order=labels, palette=palette, dodge=False)

    # Some seaborn versions add a legend for hue; it would only repeat the x axis
    legend = axes[idx].get_legend()
    if legend is not None:
        legend.remove()

plt.tight_layout()
plt.show()

## 5. Categorical Features Analysis

In [None]:
# Identify categorical features
# Start from the text (object) columns, then set aside the ones that are not
# really categories: the target, numbers stored as text (this notebook coerces
# TotalCharges with pd.to_numeric elsewhere for that reason), and columns
# whose values are all distinct, which behave like identifiers.
object_cols = df.select_dtypes(include=['object']).columns.tolist()

categorical_features = []
excluded_features = {}
for col in object_cols:
    if col == 'Churn':
        continue  # target variable, not a feature

    values = df[col].dropna()
    # Blank / whitespace-only cells are ignored when testing for numeric text
    non_blank = values[values.astype(str).str.strip() != '']

    if len(non_blank) > 0 and pd.to_numeric(non_blank, errors='coerce').notna().all():
        excluded_features[col] = 'numeric values stored as text'
    elif len(values) > 1 and values.nunique() == len(values):
        excluded_features[col] = 'every value is distinct (identifier-like)'
    else:
        categorical_features.append(col)

print(f"Categorical Features ({len(categorical_features)}):")
print(categorical_features)

# Be explicit about what was left out and why
if excluded_features:
    print("\nText columns not treated as categorical:")
    for col, reason in excluded_features.items():
        print(f"- {col}: {reason}")

In [None]:
# Churn rate by Contract type
# Percentage of rows labelled 'Yes' within each contract type, highest first
contract_churn = (
    df['Churn'].eq('Yes')
    .groupby(df['Contract'])
    .mean()
    .mul(100)
    .sort_values(ascending=False)
)

fig, axes = plt.subplots(1, 2, figsize=(16, 5))
rate_ax, count_ax = axes

# Left panel: churn rate per contract type
rate_ax.bar(contract_churn.index, contract_churn.values, color=['#d62728', '#ff7f0e', '#2ca02c'])
rate_ax.set_title('Churn Rate by Contract Type', fontsize=14, fontweight='bold')
rate_ax.set_xlabel('Contract Type', fontsize=12)
rate_ax.set_ylabel('Churn Rate (%)', fontsize=12)
rate_ax.grid(axis='y', alpha=0.3)

# Right panel: customer counts per contract type, one bar per churn label
contract_counts = df.groupby(['Contract', 'Churn']).size().unstack('Churn')
contract_counts.plot.bar(ax=count_ax, color=['#2ca02c', '#d62728'], rot=0)
count_ax.set_title('Customer Count by Contract and Churn', fontsize=14, fontweight='bold')
count_ax.set_xlabel('Contract Type', fontsize=12)
count_ax.set_ylabel('Count', fontsize=12)
count_ax.legend(title='Churn')

plt.tight_layout()
plt.show()

print("\n📊 Churn Rate by Contract:")
print(contract_churn)

In [None]:
# Churn rate by Internet Service
# Percentage of rows labelled 'Yes' within each internet service type, highest first
internet_churn = (
    df['Churn'].eq('Yes')
    .groupby(df['InternetService'])
    .mean()
    .mul(100)
    .sort_values(ascending=False)
)

fig, axes = plt.subplots(1, 2, figsize=(16, 5))
rate_ax, count_ax = axes

# Left panel: churn rate per internet service type
rate_ax.bar(internet_churn.index, internet_churn.values, color=['#d62728', '#ff7f0e', '#2ca02c'])
rate_ax.set_title('Churn Rate by Internet Service', fontsize=14, fontweight='bold')
rate_ax.set_xlabel('Internet Service', fontsize=12)
rate_ax.set_ylabel('Churn Rate (%)', fontsize=12)
rate_ax.grid(axis='y', alpha=0.3)

# Right panel: customer counts per service type, one bar per churn label
internet_counts = df.groupby(['InternetService', 'Churn']).size().unstack('Churn')
internet_counts.plot.bar(ax=count_ax, color=['#2ca02c', '#d62728'], rot=0)
count_ax.set_title('Customer Count by Internet Service and Churn', fontsize=14, fontweight='bold')
count_ax.set_xlabel('Internet Service', fontsize=12)
count_ax.set_ylabel('Count', fontsize=12)
count_ax.legend(title='Churn')

plt.tight_layout()
plt.show()

print("\n📊 Churn Rate by Internet Service:")
print(internet_churn)

In [None]:
# Churn rate by Payment Method
# Percentage of rows labelled 'Yes' within each payment method, highest first
payment_churn = (
    df['Churn'].eq('Yes')
    .groupby(df['PaymentMethod'])
    .mean()
    .mul(100)
    .sort_values(ascending=False)
)

fig, axes = plt.subplots(1, 2, figsize=(16, 5))
rate_ax, count_ax = axes

# Left panel: horizontal bars, since the method names are long
rate_ax.barh(payment_churn.index, payment_churn.values, color=['#d62728', '#ff7f0e', '#ffbb78', '#2ca02c'])
rate_ax.set_title('Churn Rate by Payment Method', fontsize=14, fontweight='bold')
rate_ax.set_xlabel('Churn Rate (%)', fontsize=12)
rate_ax.set_ylabel('Payment Method', fontsize=12)
rate_ax.grid(axis='x', alpha=0.3)

# Right panel: customer counts per payment method, one bar per churn label
payment_counts = df.groupby(['PaymentMethod', 'Churn']).size().unstack('Churn')
payment_counts.plot.barh(ax=count_ax, color=['#2ca02c', '#d62728'])
count_ax.set_title('Customer Count by Payment Method and Churn', fontsize=14, fontweight='bold')
count_ax.set_xlabel('Count', fontsize=12)
count_ax.set_ylabel('Payment Method', fontsize=12)
count_ax.legend(title='Churn')

plt.tight_layout()
plt.show()

print("\n📊 Churn Rate by Payment Method:")
print(payment_churn)

## 6. Correlation Analysis

In [None]:
# Build a numeric version of the data and correlate its numeric columns
# Two-valued text columns become 0/1; any other value in them maps to NaN
binary_codes = {'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0}
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']

encoded = {
    col: df[col].map(binary_codes)
    for col in binary_cols
    if col in df.columns
}

# TotalCharges is coerced to numbers; unparseable entries become NaN
if 'TotalCharges' in df.columns:
    encoded['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# assign() returns a new frame, so df itself keeps its original values
df_corr = df.assign(**encoded)

# Pairwise correlation across every numeric column of the encoded frame
numeric_cols = df_corr.select_dtypes(include=[np.number]).columns.tolist()
correlation_matrix = df_corr[numeric_cols].corr()

# Annotated heatmap with the colour scale centred on zero
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Top correlations with Churn
if 'Churn' in correlation_matrix.columns:
    churn_corr = correlation_matrix['Churn'].sort_values(ascending=False)
    feature_corr = churn_corr[churn_corr.index != 'Churn']

    print("="*50)
    print("CORRELATION WITH CHURN")
    print("="*50)
    print(feature_corr)

    # Rank by absolute value so strong negative correlations are kept instead
    # of being pushed out by weak positive ones; NaN coefficients are skipped.
    strength_order = feature_corr.dropna().abs().sort_values(ascending=False).index
    top_features = feature_corr.loc[strength_order[:10]]

    # Visualize: red = positive correlation with churn, green = zero or negative
    plt.figure(figsize=(10, 6))
    colors = ['#d62728' if x > 0 else '#2ca02c' for x in top_features.values]
    plt.barh(top_features.index, top_features.values, color=colors)
    plt.gca().invert_yaxis()  # strongest correlation at the top
    plt.xlabel('Correlation with Churn', fontsize=12)
    plt.title('Top 10 Features Correlated with Churn', fontsize=14, fontweight='bold')
    plt.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()

## 7. Key Insights Summary

Based on the exploratory data analysis, we can derive the following insights:

### 📊 Churn Characteristics:
1. **Contract Type**: Month-to-month contracts have significantly higher churn rates
2. **Internet Service**: Fiber optic customers show higher churn tendency
3. **Payment Method**: Electronic check users have elevated churn rates
4. **Tenure**: Newer customers (lower tenure) are more likely to churn
5. **Charges**: Higher monthly charges correlate with increased churn

### 💡 Business Recommendations:
- Focus retention efforts on month-to-month contract customers
- Investigate fiber optic service quality issues
- Encourage automatic payment methods
- Implement early engagement programs for new customers
- Review pricing strategies for high-charge customers

### 🎯 Next Steps:
- Feature engineering to create more predictive features
- Build and evaluate machine learning models
- Implement churn prediction system

In [None]:
# Closing recap: row count, share of rows labelled 'Yes', and column count
total_customers = len(df)
churn_rate_pct = df['Churn'].eq('Yes').mean() * 100
n_columns = len(df.columns)

print("✅ Exploratory Data Analysis Complete!")
print("\nKey Findings:")
print(f"- Total Customers: {total_customers:,}")
print(f"- Churn Rate: {churn_rate_pct:.2f}%")
print(f"- Features Analyzed: {n_columns}")
print("\n📌 Proceed to: 02_Preprocessing_and_Features.ipynb")