# Lab Results Analysis

This notebook generates a synthetic lab-results dataset and produces a distribution report for each of its fields: Patient ID, Lab Type, Lab Value, and Lab Date.


## 1. Import Libraries and Create Sample Data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Ignore all warnings so warning messages do not appear in cell output
warnings.filterwarnings('ignore')

# Plot defaults: seaborn "whitegrid" theme and a default figure size of (12, 6)
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})


In [None]:
# Generate a synthetic lab-results dataset: one row per lab test with the
# columns 'Patient ID', 'Lab Type', 'Lab Value' and 'Lab Date'.
# The seed fixes every random draw below. Only the calendar anchor for
# 'Lab Date' depends on the day the notebook is run.
np.random.seed(42)

# Lab types and the probability of drawing each one (same order)
lab_types = ['Complete Blood Count', 'Lipid Panel', 'Liver Function', 'Kidney Function',
             'Thyroid Panel', 'Hemoglobin A1C', 'Vitamin D', 'Cholesterol']
lab_type_probs = [0.2, 0.15, 0.15, 0.15, 0.1, 0.1, 0.1, 0.05]

# Generate 500 sample records
n_records = 500

# Patient IDs (100 unique patients): PAT00001 .. PAT00100
patient_ids = [f'PAT{i:05d}' for i in range(1, 101)]

# Value range (min, max) used to draw a uniform 'Lab Value' for each lab type
lab_value_ranges = {
    'Complete Blood Count': (4.5, 11.0),
    'Lipid Panel': (120, 200),
    'Liver Function': (10, 40),
    'Kidney Function': (0.6, 1.2),
    'Thyroid Panel': (0.5, 5.0),
    'Hemoglobin A1C': (4.0, 6.5),
    'Vitamin D': (20, 50),
    'Cholesterol': (150, 250)
}

# The normal-distributed 'Lab Value' drawn below is only a placeholder; it is
# correct to discard it only if every lab type has a range to overwrite it with.
missing_ranges = set(lab_types) - set(lab_value_ranges)
assert not missing_ranges, f"No value range defined for: {sorted(missing_ranges)}"

# Capture "today" (at midnight) once so that every record is dated relative to
# the same day, instead of re-reading the clock for each of the records.
reference_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

# Generate data. The order of the random draws is kept as-is so the seeded
# values do not change: patient IDs, lab types, placeholder values, day offsets.
data = {
    'Patient ID': np.random.choice(patient_ids, n_records),
    'Lab Type': np.random.choice(lab_types, n_records, p=lab_type_probs),
    'Lab Value': np.round(np.random.normal(100, 30, n_records), 2),
    # Each test is dated 0-364 days before the reference date
    'Lab Date': [reference_date - timedelta(days=int(np.random.randint(0, 365)))
                 for _ in range(n_records)]
}

# Create DataFrame
df = pd.DataFrame(data)

# Replace the placeholder values with a uniform draw from each lab type's range
for lab_type, (min_val, max_val) in lab_value_ranges.items():
    mask = df['Lab Type'] == lab_type
    df.loc[mask, 'Lab Value'] = np.round(np.random.uniform(min_val, max_val, mask.sum()), 2)

# Ensure Lab Date is a datetime column
df['Lab Date'] = pd.to_datetime(df['Lab Date'])

print(f"Dataset created with {len(df)} records")
print("\nFirst few records:")
df.head(10)


## 2. Data Overview


In [None]:
# Headline counts, date coverage and column dtypes for the dataset
n_patients = df['Patient ID'].nunique()
n_lab_types = df['Lab Type'].nunique()
first_lab_date = df['Lab Date'].min().date()
last_lab_date = df['Lab Date'].max().date()

print("Dataset Info:")
print(f"Total Records: {len(df)}")
print(f"Total Patients: {n_patients}")
print(f"Total Lab Types: {n_lab_types}")
print(f"\nDate Range: {first_lab_date} to {last_lab_date}")
print("\nData Types:")
print(df.dtypes)
print("\nBasic Statistics:")

# Last expression: summary statistics table shown as the cell output
df.describe()


## 3. Distribution Report: Patient ID


In [None]:
# Number of lab results recorded for each patient, largest count first
patient_counts = df['Patient ID'].value_counts().sort_values(ascending=False)
top_patients = patient_counts.head(15)

# --- Text report ---
print("=== Patient ID Distribution Report ===")
print(f"\nTotal unique patients: {df['Patient ID'].nunique()}")
print("\nTop 10 patients by number of lab results:")
print(patient_counts.head(10))
print("\nStatistics:")
print(f"  Mean tests per patient: {patient_counts.mean():.2f}")
print(f"  Median tests per patient: {patient_counts.median():.2f}")
print(f"  Min tests per patient: {patient_counts.min()}")
print(f"  Max tests per patient: {patient_counts.max()}")

# --- Figure: histogram of per-patient counts (left), top 15 patients (right) ---
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax_hist, ax_top = axes

# Left panel: how many patients have a given number of tests
ax_hist.hist(patient_counts.values, bins=20, edgecolor='black', alpha=0.7, color='skyblue')
ax_hist.set_title('Distribution of Lab Tests per Patient', fontsize=14, fontweight='bold')
ax_hist.set_xlabel('Number of Lab Tests', fontsize=12)
ax_hist.set_ylabel('Number of Patients', fontsize=12)
ax_hist.grid(True, alpha=0.3)

# Right panel: one horizontal bar per patient, labelled with the patient ID
bar_positions = range(len(top_patients))
ax_top.barh(bar_positions, top_patients.values, color='coral')
ax_top.set_yticks(bar_positions)
ax_top.set_yticklabels(top_patients.index, fontsize=9)
ax_top.set_title('Top 15 Patients by Lab Test Count', fontsize=14, fontweight='bold')
ax_top.set_xlabel('Number of Lab Tests', fontsize=12)
ax_top.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()


## 4. Distribution Report: Lab Type


In [None]:
# Number of tests recorded for each lab type (most frequent first)
lab_type_counts = df['Lab Type'].value_counts()
total_tests = len(df)

# --- Text report: count and share of all tests for each lab type ---
print("=== Lab Type Distribution Report ===")
print(f"\nTotal unique lab types: {df['Lab Type'].nunique()}")
print("\nLab type frequency:")
for lab_type, count in lab_type_counts.items():
    percentage = (count / total_tests) * 100
    print(f"  {lab_type}: {count} ({percentage:.1f}%)")

# --- Figure: counts as horizontal bars (left), shares as a pie (right) ---
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_bar, ax_pie = axes

# Left panel: one bar per lab type
bar_positions = range(len(lab_type_counts))
ax_bar.barh(bar_positions, lab_type_counts.values, color='lightgreen')
ax_bar.set_yticks(bar_positions)
ax_bar.set_yticklabels(lab_type_counts.index, fontsize=10)
ax_bar.set_title('Lab Type Distribution (Count)', fontsize=14, fontweight='bold')
ax_bar.set_xlabel('Number of Tests', fontsize=12)
ax_bar.grid(True, alpha=0.3, axis='x')

# Right panel: percentage of tests per lab type
ax_pie.pie(
    lab_type_counts.values,
    labels=lab_type_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 9},
)
ax_pie.set_title('Lab Type Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()


## 5. Distribution Report: Lab Value


In [None]:
# Lab Value distribution report: summary statistics plus a 2x2 figure
# (histogram, box plot, density plot, and box plot per lab type).
# The first three panels pool the values of all lab types together; the
# bottom-right panel separates them by lab type.
lab_values = df['Lab Value']
mean_value = lab_values.mean()
median_value = lab_values.median()

print("=== Lab Value Distribution Report ===")
print("\nBasic Statistics:")
print(lab_values.describe())
print(f"\nSkewness: {lab_values.skew():.3f}")
print(f"Kurtosis: {lab_values.kurtosis():.3f}")

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Histogram with mean and median marked as vertical lines
axes[0, 0].hist(lab_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
axes[0, 0].set_xlabel('Lab Value', fontsize=12)
axes[0, 0].set_ylabel('Frequency', fontsize=12)
axes[0, 0].set_title('Lab Value Distribution (Histogram)', fontsize=14, fontweight='bold')
axes[0, 0].axvline(mean_value, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_value:.2f}')
axes[0, 0].axvline(median_value, color='green', linestyle='--', linewidth=2, label=f'Median: {median_value:.2f}')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Box plot (vertical is matplotlib's default orientation, so it is not passed explicitly)
axes[0, 1].boxplot(lab_values, patch_artist=True,
                   boxprops=dict(facecolor='lightblue', alpha=0.7))
axes[0, 1].set_ylabel('Lab Value', fontsize=12)
axes[0, 1].set_title('Lab Value Distribution (Box Plot)', fontsize=14, fontweight='bold')
axes[0, 1].grid(True, alpha=0.3, axis='y')

# Density plot
lab_values.plot.density(ax=axes[1, 0], color='purple', linewidth=2)
axes[1, 0].set_xlabel('Lab Value', fontsize=12)
axes[1, 0].set_ylabel('Density', fontsize=12)
axes[1, 0].set_title('Lab Value Distribution (Density Plot)', fontsize=14, fontweight='bold')
axes[1, 0].grid(True, alpha=0.3)

# Lab values by lab type
df.boxplot(column='Lab Value', by='Lab Type', ax=axes[1, 1])
# A grouped pandas boxplot adds a figure-level "Boxplot grouped by ..." title,
# which would sit above (and mislabel) the whole 2x2 figure; clear it.
fig.suptitle('')
axes[1, 1].set_xlabel('Lab Type', fontsize=10)
axes[1, 1].set_ylabel('Lab Value', fontsize=12)
axes[1, 1].set_title('Lab Value Distribution by Lab Type', fontsize=14, fontweight='bold')
# Rotate and right-align the lab type labels so they do not overlap
plt.setp(axes[1, 1].xaxis.get_majorticklabels(), rotation=45, ha='right')
axes[1, 1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


## 6. Distribution Report: Lab Date


In [None]:
# Derive calendar components from Lab Date (adds three columns to df)
lab_dates = df['Lab Date']
df['Year'] = lab_dates.dt.year
df['Month'] = lab_dates.dt.month
df['DayOfWeek'] = lab_dates.dt.day_name()

# Lookup tables and aggregates shared by the text report and the charts
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
month_counts = df['Month'].value_counts().sort_index()
first_date = lab_dates.min()
last_date = lab_dates.max()

# --- Text report ---
print("=== Lab Date Distribution Report ===")
print(f"\nDate Range: {first_date.date()} to {last_date.date()}")
print(f"\nTotal days covered: {(last_date - first_date).days} days")
print("\nTests by Year:")
print(df['Year'].value_counts().sort_index())
print("\nTests by Month:")
for month, count in month_counts.items():
    # Month numbers are 1-based, month_names is 0-based
    print(f"  {month_names[month-1]}: {count}")
print("\nTests by Day of Week:")
print(df['DayOfWeek'].value_counts())

# --- Figure: timeline, by month, by weekday, date histogram ---
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_timeline, ax_month), (ax_weekday, ax_hist) = axes

# Top-left: number of tests on each calendar day
daily_counts = df.groupby(df['Lab Date'].dt.date).size()
ax_timeline.plot(daily_counts.index, daily_counts.values, marker='o', markersize=3, linewidth=1, color='darkblue')
ax_timeline.set_title('Lab Tests Over Time', fontsize=14, fontweight='bold')
ax_timeline.set_xlabel('Date', fontsize=12)
ax_timeline.set_ylabel('Number of Tests', fontsize=12)
ax_timeline.tick_params(axis='x', rotation=45)
ax_timeline.grid(True, alpha=0.3)

# Top-right: tests per month; months with no tests are drawn with height 0
month_numbers = range(1, 13)
monthly_totals = [month_counts.get(m, 0) for m in month_numbers]
ax_month.bar(month_numbers, monthly_totals, color='orange', alpha=0.7)
ax_month.set_xticks(month_numbers)
ax_month.set_xticklabels(month_names, rotation=45, ha='right')
ax_month.set_title('Lab Tests by Month', fontsize=14, fontweight='bold')
ax_month.set_xlabel('Month', fontsize=12)
ax_month.set_ylabel('Number of Tests', fontsize=12)
ax_month.grid(True, alpha=0.3, axis='y')

# Bottom-left: tests per weekday in Monday..Sunday order, missing days as 0
day_counts = df['DayOfWeek'].value_counts().reindex(day_order, fill_value=0)
weekday_positions = range(len(day_order))
ax_weekday.bar(weekday_positions, day_counts.values, color='teal', alpha=0.7)
ax_weekday.set_xticks(weekday_positions)
ax_weekday.set_xticklabels(day_order, rotation=45, ha='right')
ax_weekday.set_title('Lab Tests by Day of Week', fontsize=14, fontweight='bold')
ax_weekday.set_xlabel('Day of Week', fontsize=12)
ax_weekday.set_ylabel('Number of Tests', fontsize=12)
ax_weekday.grid(True, alpha=0.3, axis='y')

# Bottom-right: histogram of test dates in 30 bins
ax_hist.hist(df['Lab Date'], bins=30, edgecolor='black', alpha=0.7, color='crimson')
ax_hist.set_title('Lab Date Distribution (Histogram)', fontsize=14, fontweight='bold')
ax_hist.set_xlabel('Date', fontsize=12)
ax_hist.set_ylabel('Frequency', fontsize=12)
ax_hist.tick_params(axis='x', rotation=45)
ax_hist.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 7. Summary Statistics


In [None]:
# One-screen recap of the four fields: patients, lab types, values and dates
separator = '=' * 60

# Patient ID
n_patients = df['Patient ID'].nunique()
mean_tests_per_patient = df.groupby('Patient ID').size().mean()

# Lab Type
n_lab_types = df['Lab Type'].nunique()
most_common_type = df['Lab Type'].mode()[0]
most_common_count = df['Lab Type'].value_counts().max()

# Lab Value
values = df['Lab Value']

# Lab Date
earliest = df['Lab Date'].min()
latest = df['Lab Date'].max()
span_days = (latest - earliest).days

print("=== SUMMARY STATISTICS ===")
print(f"\n{separator}")
print("PATIENT ID:")
print(f"  Total unique patients: {n_patients}")
print(f"  Average tests per patient: {mean_tests_per_patient:.2f}")
print(f"\n{separator}")
print("LAB TYPE:")
print(f"  Total unique lab types: {n_lab_types}")
print(f"  Most common: {most_common_type} ({most_common_count} tests)")
print(f"\n{separator}")
print("LAB VALUE:")
print(f"  Mean: {values.mean():.2f}")
print(f"  Median: {values.median():.2f}")
print(f"  Std Dev: {values.std():.2f}")
print(f"  Min: {values.min():.2f}")
print(f"  Max: {values.max():.2f}")
print(f"\n{separator}")
print("LAB DATE:")
print(f"  Date range: {earliest.date()} to {latest.date()}")
print(f"  Total days: {span_days}")
# The +1 counts both the first and the last day of the range
print(f"  Average tests per day: {len(df) / (span_days + 1):.2f}")
print(f"\n{separator}")
