# Exploratory Data Analysis - Cancer Patient Dataset

This notebook performs a comprehensive EDA on the cancer patient dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Suppress every warning category for the rest of the session so that
# library notices do not clutter cell output.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn "whitegrid" style and a 12x8 inch default figure.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

print("Libraries imported successfully!")

In [None]:
# Load the data
# The path is relative to the directory the notebook kernel runs in.
df = pd.read_csv('../cancer patient data sets.csv')

# Overview banner followed by the frame's shape and full column listing.
print("=" * 60, "DATASET OVERVIEW", "=" * 60, sep="\n")
n_rows_cols = df.shape
column_names = df.columns.tolist()
print(f"Shape: {n_rows_cols} (rows, columns)")
print(f"\nColumn Names ({len(column_names)} total):")
print(column_names)

# Trailing expression: the notebook renders the first rows as a table.
df.head()

In [None]:
# Data Types and Missing Values
print("=" * 60, "DATA TYPES AND MISSING VALUES", "=" * 60, sep="\n")

# Column dtypes as inferred by pandas when the CSV was read.
print("\nData Types:")
print(df.dtypes)

# Null count per column; only columns with at least one null are listed.
print("\nMissing Values:")
missing = df.isnull().sum()
total_missing = missing.sum()
print(missing[missing > 0] if total_missing > 0 else "No missing values found!")
print(f"\nTotal missing values: {total_missing}")

In [None]:
# Summary Statistics
print("=" * 60, "SUMMARY STATISTICS", "=" * 60, sep="\n")

# Left as the trailing expression so the notebook renders the describe()
# table with its rich (HTML) representation.
df.describe()

In [None]:
# Target Variable Analysis
# Prints absolute and relative frequencies of the 'Level' column and draws
# them as a bar chart. Skipped entirely when the column is absent.
print("=" * 60)
print("TARGET VARIABLE ANALYSIS (Level)")
print("=" * 60)
if 'Level' in df.columns:
    level_counts = df['Level'].value_counts()

    print("\nCount by Level:")
    print(level_counts)
    print("\nPercentage distribution:")
    print(df['Level'].value_counts(normalize=True) * 100)

    # Colours are keyed by label rather than by position: value_counts()
    # sorts by frequency, so positional colours would paint whichever level
    # happens to be most common in green.
    level_colors = {'Low': '#2ecc71', 'Medium': '#f39c12', 'High': '#e74c3c'}
    fallback_color = '#95a5a6'  # used for any label outside the three above

    # Bars run Low -> Medium -> High; unexpected labels are appended after
    # them in their original frequency order.
    ordered_levels = [lvl for lvl in level_colors if lvl in level_counts.index]
    ordered_levels += [lvl for lvl in level_counts.index if lvl not in level_colors]
    plot_counts = level_counts.loc[ordered_levels]
    bar_colors = [level_colors.get(lvl, fallback_color) for lvl in plot_counts.index]

    # Visualization
    fig, ax = plt.subplots(figsize=(8, 5))
    plot_counts.plot(kind='bar', color=bar_colors, ax=ax)
    ax.set_title('Distribution of Cancer Risk Levels', fontsize=14, fontweight='bold')
    ax.set_xlabel('Risk Level')
    ax.set_ylabel('Count')
    plt.xticks(rotation=0)
    fig.tight_layout()
    plt.show()

In [None]:
# Age Analysis
print("=" * 60, "AGE STATISTICS", "=" * 60, sep="\n")
if 'Age' in df.columns:
    age = df['Age']

    print("\nAge Statistics:")
    print(age.describe())
    print(f"\nAge Range: {age.min()} - {age.max()}")

    # Two panels side by side: overall histogram (left) and, when the
    # 'Level' column exists, a per-level boxplot (right).
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    hist_ax, box_ax = axes

    # Left panel: histogram of all ages.
    age.hist(bins=30, color='skyblue', edgecolor='black', ax=hist_ax)
    hist_ax.set_title('Age Distribution', fontsize=12, fontweight='bold')
    hist_ax.set(xlabel='Age', ylabel='Frequency')

    # Right panel: age grouped by risk level.
    if 'Level' in df.columns:
        df.boxplot(column='Age', by='Level', ax=box_ax)
        box_ax.set_title('Age Distribution by Risk Level', fontsize=12, fontweight='bold')
        box_ax.set(xlabel='Risk Level', ylabel='Age')
        # Blank out the figure-level title after the grouped boxplot call.
        plt.suptitle('')

    plt.tight_layout()
    plt.show()

In [None]:
# Gender Analysis
print("=" * 60, "GENDER DISTRIBUTION", "=" * 60, sep="\n")
if 'Gender' in df.columns:
    # Counted once and reused for both the printed table and the chart.
    gender_counts = df['Gender'].value_counts()
    print(gender_counts)
    print("\nNote: Gender appears to be encoded (1 or 2)")

    # Bar chart of the same counts, drawn on the figure created just above it.
    plt.figure(figsize=(8, 5))
    gender_ax = gender_counts.plot(kind='bar', color=['#3498db', '#e74c3c'])
    gender_ax.set_title('Gender Distribution', fontsize=14, fontweight='bold')
    gender_ax.set_xlabel('Gender (1 or 2)')
    gender_ax.set_ylabel('Count')
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()

In [None]:
# Risk Factors Analysis
# Defines the list of expected score columns, keeps the ones actually present
# in `df` (as `existing_risk_factors`, which later cells read), and prints
# summary statistics for them.
print("=" * 60)
print("RISK FACTORS OVERVIEW")
print("=" * 60)

# Expected column names. Matching below is exact and case-sensitive, so the
# irregular capitalisation is presumably how the CSV header spells them --
# TODO confirm against the file.
risk_factors = ['Air Pollution', 'Alcohol use', 'Dust Allergy', 'OccuPational Hazards',
                'Genetic Risk', 'chronic Lung Disease', 'Obesity', 'Smoking',
                'Passive Smoker', 'Chest Pain', 'Coughing of Blood', 'Fatigue',
                'Weight Loss', 'Shortness of Breath', 'Wheezing', 'Swallowing Difficulty',
                'Clubbing of Finger Nails', 'Frequent Cold', 'Dry Cough', 'Snoring']

# Check which risk factors exist in the dataset
existing_risk_factors = [col for col in risk_factors if col in df.columns]
missing_risk_factors = [col for col in risk_factors if col not in df.columns]
print(f"Found {len(existing_risk_factors)} risk factor columns")
if missing_risk_factors:
    # Surface header mismatches instead of silently dropping the columns.
    print(f"Not found in dataset: {missing_risk_factors}")

if existing_risk_factors:
    print("\nRisk Factor Statistics:")
    print(df[existing_risk_factors].describe())
else:
    # describe() raises ValueError on a frame with no columns, so skip it.
    print("\nNo risk factor columns available; skipping statistics.")

In [None]:
# Correlation Heatmap
# Plots the pairwise correlation matrix of up to 15 numeric columns of `df`.
print("=" * 60)
print("CORRELATION ANALYSIS")
print("=" * 60)

# Identifier columns are excluded from the correlation matrix. A non-numeric
# 'Patient Id' is already dropped by select_dtypes; listing it here also
# covers the case where pandas parsed it as a number.
id_columns = {'index', 'Patient Id'}
numeric_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in id_columns
]

# Cap the matrix at 15 columns so the cells stay legible. Slicing is a no-op
# when fewer columns are available. These are the first columns in frame
# order, not a ranking by importance.
cols_to_plot = numeric_cols[:15]

if cols_to_plot:
    corr_matrix = df[cols_to_plot].corr()

    fig, ax = plt.subplots(figsize=(14, 10))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0,
                square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax)
    # The title reports the number of columns actually plotted.
    ax.set_title(f'Correlation Heatmap (First {len(cols_to_plot)} Numeric Features)',
                 fontsize=14, fontweight='bold')
    fig.tight_layout()
    plt.show()
else:
    print("No numeric columns available for correlation analysis.")

In [None]:
# Risk Factors Mean by Level
# Prints and plots the per-level mean of the first ten available risk factor
# columns. Requires `existing_risk_factors` from the risk factor overview cell.
if 'Level' in df.columns and len(existing_risk_factors) > 0:
    print("=" * 60)
    print("AVERAGE RISK FACTOR SCORES BY LEVEL")
    print("=" * 60)

    # Only the first ten columns are used to keep the grouped bars readable.
    level_means = df.groupby('Level')[existing_risk_factors[:10]].mean()
    print(level_means.round(2))

    # Visualization
    # DataFrame.plot opens a new figure unless it is given an Axes, which would
    # leave a separately created figure empty and ignore its size. Create the
    # Axes explicitly and pass it in.
    fig, ax = plt.subplots(figsize=(14, 6))
    level_means.T.plot(kind='bar', width=0.8, ax=ax)
    ax.set_title('Average Risk Factor Scores by Level', fontsize=14, fontweight='bold')
    ax.set_xlabel('Risk Factors')
    ax.set_ylabel('Average Score')
    plt.xticks(rotation=45, ha='right')
    # Legend is anchored outside the plotting area, to the right.
    ax.legend(title='Risk Level', bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    plt.show()

In [None]:
# Key Risk Factors Distribution
# Draws one histogram per column for the first six available risk factor
# columns on a 2x3 grid. Requires `existing_risk_factors` from the risk factor
# overview cell.
if len(existing_risk_factors) > 0:
    print("=" * 60)
    print("KEY RISK FACTORS DISTRIBUTION")
    print("=" * 60)

    # First six columns in list order (not a ranking).
    key_factors = existing_risk_factors[:6]

    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.flatten()

    for idx, factor in enumerate(key_factors):
        # Spread the panels evenly across the viridis colormap.
        df[factor].hist(bins=20, color=plt.cm.viridis(idx / len(key_factors)),
                        edgecolor='black', ax=axes[idx])
        axes[idx].set_title(f'{factor}', fontsize=11, fontweight='bold')
        axes[idx].set_xlabel('Score')
        axes[idx].set_ylabel('Frequency')

    # With fewer than six columns the remaining grid slots would show as
    # empty axes, so hide them.
    for unused_ax in axes[len(key_factors):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Smoking Analysis
# Cross-tabulates 'Level' against 'Smoking' and shows the table as a stacked
# bar chart. Skipped when either column is absent.
if 'Smoking' in df.columns and 'Level' in df.columns:
    print("=" * 60)
    print("SMOKING ANALYSIS BY RISK LEVEL")
    print("=" * 60)

    # Rows: risk level; columns: distinct 'Smoking' values; cells: row counts.
    smoking_by_level = pd.crosstab(df['Level'], df['Smoking'])
    print(smoking_by_level)

    # Visualization
    # DataFrame.plot opens a new figure unless it is given an Axes, which would
    # leave a separately created figure empty and ignore its size. Create the
    # Axes explicitly and pass it in.
    fig, ax = plt.subplots(figsize=(12, 6))
    smoking_by_level.plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
    ax.set_title('Smoking Levels by Risk Category', fontsize=14, fontweight='bold')
    ax.set_xlabel('Risk Level')
    ax.set_ylabel('Count')
    plt.xticks(rotation=0)
    # Legend is anchored outside the plotting area, to the right.
    ax.legend(title='Smoking Score', bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    plt.show()

In [None]:
# Boxplot of Risk Factors by Level
# Requires `existing_risk_factors` from the risk factor overview cell.
if 'Level' in df.columns and len(existing_risk_factors) > 0:
    print("=" * 60, "RISK FACTOR DISTRIBUTIONS BY LEVEL", "=" * 60, sep="\n")

    # Reshape the first five risk factor columns to long form: one row per
    # (patient row, risk factor) pair, with 'Level' carried along as the id.
    key_factors = existing_risk_factors[:5]
    df_melted = pd.melt(
        df,
        id_vars=['Level'],
        value_vars=key_factors,
        var_name='Risk Factor',
        value_name='Score',
    )

    # seaborn draws onto the current axes of the figure opened here.
    plt.figure(figsize=(14, 6))
    box_ax = sns.boxplot(data=df_melted, x='Risk Factor', y='Score', hue='Level')
    box_ax.set_title('Risk Factor Scores Distribution by Level', fontsize=14, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    # Legend is anchored outside the plotting area, to the right.
    plt.legend(title='Risk Level', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

In [None]:
# Summary Insights
# Prints three closing summaries keyed on the 'Level' column: mean age per
# level, the size and share of each level, and the five highest-scoring risk
# factor columns within the 'High' group. Requires `existing_risk_factors`
# from the risk factor overview cell.
print("=" * 60)
print("KEY INSIGHTS SUMMARY")
print("=" * 60)

if 'Level' in df.columns:
    # Same column guard the age analysis cell uses, so a dataset without
    # 'Age' skips this item instead of raising KeyError.
    if 'Age' in df.columns:
        print("\n1. Average Age by Risk Level:")
        age_by_level = df.groupby('Level')['Age'].mean()
        print(age_by_level.round(2))

    print("\n2. Risk Level Distribution:")
    # Count every level in one pass instead of filtering the frame repeatedly.
    level_counts = df['Level'].value_counts()
    total_rows = len(df)
    for level in ('Low', 'Medium', 'High'):
        count = int(level_counts.get(level, 0))
        # An empty frame reports 0.0% rather than dividing by zero.
        share = count / total_rows * 100 if total_rows else 0.0
        print(f"   {level}: {count} ({share:.1f}%)")

    if len(existing_risk_factors) > 0:
        print("\n3. Top 5 Risk Factors with Highest Average Scores (High Risk Group):")
        high_risk_rows = df.loc[df['Level'] == 'High', existing_risk_factors]
        high_risk_means = high_risk_rows.mean().sort_values(ascending=False)
        # head() defaults to five rows, matching the "Top 5" heading.
        print(high_risk_means.head().round(2))

print("\n" + "=" * 60)
print("EDA COMPLETE!")
print("=" * 60)