# Drug Interaction Checker - Exploratory Data Analysis

This notebook demonstrates the EDA process for the Drug Interaction Checker educational ML project.

**⚠️ Educational Disclaimer**: This project is for educational purposes only and should not be used for real medical decisions.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src to path
# Extends the module search path so the two project imports below
# (data_processing, evaluation) can be resolved.
# NOTE(review): '../src' is relative to the kernel's working directory —
# presumably the notebook is launched from a folder that sits next to src/; confirm.
sys.path.append('../src')

from data_processing import DrugDataProcessor
from evaluation import ModelEvaluator

# Set style
# Selects matplotlib's 'default' style sheet and seaborn's "husl" palette for
# the plots drawn later in the notebook.
plt.style.use('default')
sns.set_palette("husl")

# Suppress warnings
# NOTE(review): the 'ignore' filter is installed without a category or module,
# so it applies to every warning raised after this point in the session — this
# also hides deprecation notices from the plotting/data libraries.
import warnings
warnings.filterwarnings('ignore')

## 1. Data Loading and Initial Exploration

In [None]:
# Build the two project helpers. `processor` is the data source for this
# notebook; `evaluator` is instantiated alongside it.
processor, evaluator = DrugDataProcessor(), ModelEvaluator()

# Read the sample dataset through the processor
df = processor.load_data()

# Report the frame's dimensions, then its column names on their own line
print("Dataset Shape:", df.shape)
print("\nColumn Names:", df.columns.tolist(), sep="\n")

In [None]:
# Display first few rows
# The caption is printed as plain text; the bare `df.head()` must remain the
# final expression of the cell so Jupyter renders it as the cell's output.
print("First 5 rows of the dataset:")
df.head()

In [None]:
# Basic statistics
print("Dataset Info:")
# Called without print(): info() emits its report itself rather than returning it.
df.info()

print("\nBasic Statistics:")
# Left as the final bare expression so the summary table becomes the cell output.
# NOTE(review): describe() is called with default arguments — presumably this
# summarises only the numeric columns of a mixed-type frame; confirm whether
# the categorical columns should be included.
df.describe()

## 2. Data Quality Assessment

In [None]:
# Null count for every column
print("Missing Values:")
missing_values = df.isna().sum()
print(missing_values)

# Number of rows flagged as repeats of an earlier row
duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")

# For each categorical column: how many distinct values it holds and a
# preview limited to the first 10 of them
categorical_cols = ['drug1', 'drug2', 'severity', 'drug1_class', 'drug2_class']
for col in categorical_cols:
    column = df[col]
    distinct_count = column.nunique()
    preview = column.unique()[:10]
    print(f"\nUnique values in {col}: {distinct_count}")
    print(f"Values: {preview}...")

## 3. Target Variable Analysis

In [None]:
# Analyze the target variable in three side-by-side panels:
#   1. share of interacting vs. non-interacting pairs,
#   2. how often each severity level occurs,
#   3. severity levels stacked within each interaction class.
fig, (ax_interaction, ax_severity, ax_cross) = plt.subplots(1, 3, figsize=(15, 5))

# Interaction distribution
# value_counts() orders classes by frequency, not by class value, so a fixed
# ['No Interaction', 'Interaction'] label list would be attached to the wrong
# slices whenever class 1 is the majority, and would have the wrong length if
# only one class is present. Labels and colors are therefore looked up from
# the class values that were actually counted.
interaction_labels = {0: 'No Interaction', 1: 'Interaction'}
interaction_colors = {0: 'lightblue', 1: 'lightcoral'}
interaction_counts = df['interaction'].value_counts().sort_index()
ax_interaction.pie(
    interaction_counts.values,
    labels=[interaction_labels.get(value, str(value)) for value in interaction_counts.index],
    colors=[interaction_colors.get(value, 'gray') for value in interaction_counts.index],
    autopct='%1.1f%%',
)
ax_interaction.set_title('Drug Interaction Distribution')

# Severity distribution
# Bars are colored per severity level; levels missing from the mapping fall
# back to gray instead of raising.
severity_counts = df['severity'].value_counts()
colors = {'None': 'lightblue', 'Low': 'lightgreen', 'Moderate': 'orange', 'High': 'red'}
bar_colors = [colors.get(level, 'gray') for level in severity_counts.index]
ax_severity.bar(severity_counts.index, severity_counts.values, color=bar_colors)
ax_severity.set_title('Severity Level Distribution')
ax_severity.tick_params(axis='x', labelrotation=45)

# Interaction by severity
interaction_severity = pd.crosstab(df['interaction'], df['severity'])
interaction_severity.plot(kind='bar', stacked=True, ax=ax_cross)
ax_cross.set_title('Interaction by Severity')
ax_cross.set_xlabel('Interaction (0=No, 1=Yes)')
ax_cross.tick_params(axis='x', labelrotation=0)
# Legend is anchored outside the axes so it does not cover the bars.
ax_cross.legend(title='Severity', bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
plt.show()

print(f"Interaction Rate: {df['interaction'].mean():.2%}")
print(f"High Severity Rate: {(df['severity'] == 'High').mean():.2%}")

## 4. Drug Analysis

In [None]:
# Four-panel overview: the most frequent drugs in each position, the overall
# drug-class mix, and the mean interaction rate for every class pairing.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_drug1, ax_drug2), (ax_classes, ax_rates) = axes

# Ten most frequent drugs in each position, drawn as horizontal bars
top_drug1 = df['drug1'].value_counts().head(10)
top_drug2 = df['drug2'].value_counts().head(10)
bar_panels = (
    (ax_drug1, top_drug1, 'Top 10 Drugs (Position 1)'),
    (ax_drug2, top_drug2, 'Top 10 Drugs (Position 2)'),
)
for panel, top_counts, panel_title in bar_panels:
    slots = range(len(top_counts))
    panel.barh(slots, top_counts.values)
    panel.set_yticks(slots)
    panel.set_yticklabels(top_counts.index)
    panel.set_title(panel_title)
    panel.set_xlabel('Frequency')

# Drug class distribution, pooling both positions into one tally
drug_class_counts = pd.concat([df['drug1_class'], df['drug2_class']]).value_counts()
ax_classes.pie(drug_class_counts.values, labels=drug_class_counts.index, autopct='%1.1f%%')
ax_classes.set_title('Drug Class Distribution')

# Mean of the interaction flag for every (drug1_class, drug2_class) pair,
# reshaped into a class-by-class grid for the heatmap
class_interaction = (
    df.groupby(['drug1_class', 'drug2_class'])['interaction']
    .mean()
    .reset_index()
)
pivot_table = class_interaction.pivot(
    index='drug1_class', columns='drug2_class', values='interaction'
)
sns.heatmap(pivot_table, annot=True, cmap='Reds', fmt='.2f', ax=ax_rates)
ax_rates.set_title('Interaction Rate by Drug Class Combination')
ax_rates.set_xlabel('Drug 2 Class')
ax_rates.set_ylabel('Drug 1 Class')

fig.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Ask the processor for the encoded frame used in the correlation analysis
df_encoded = processor.encode_features()

# Keep the target column plus every column whose name ends in "_encoded",
# in the order they appear in the encoded frame
numeric_cols = [
    name for name in df_encoded.columns
    if name == 'interaction' or name.endswith('_encoded')
]
correlation_data = df_encoded[numeric_cols]
correlation_matrix = correlation_data.corr()

# Annotated heatmap of the pairwise correlations, color scale centered on zero
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    ax=ax,
)
ax.set_title('Correlation Matrix of Encoded Features')
fig.tight_layout()
plt.show()

# Rank columns by the magnitude of their correlation with the target
# (the target's own entry is part of this listing)
target_corr = correlation_matrix['interaction'].sort_values(
    key=lambda values: values.abs(), ascending=False
)
print("Feature correlation with interaction target:")
print(target_corr)

## 6. High-Risk Drug Combinations

In [None]:
# Rows whose severity label is 'High'
high_risk = df[df['severity'] == 'High']


def _high_share(levels):
    """Return the fraction of entries in `levels` equal to 'High'."""
    return (levels == 'High').mean()


if high_risk.empty:
    print("No high-risk combinations found in the dataset.")
else:
    print("High-Risk Drug Combinations:")
    print("=" * 40)

    # Pair frequencies among the high-severity rows, most frequent first
    high_risk_pairs = high_risk.groupby(['drug1', 'drug2']).size().sort_values(ascending=False)
    print("\nTop High-Risk Drug Pairs:")
    print(high_risk_pairs.head(10))

    # How often each drug appears (in either position) in a high-severity row
    high_risk_drugs = pd.concat([high_risk['drug1'], high_risk['drug2']]).value_counts()

    fig, (ax_drugs, ax_class_rates) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: the ten drugs appearing most often in high-severity rows
    high_risk_drugs.head(10).plot(kind='bar', ax=ax_drugs)
    ax_drugs.set_title('Drugs Most Involved in High-Risk Interactions')
    ax_drugs.set_xlabel('Drug Name')
    ax_drugs.set_ylabel('Frequency')
    ax_drugs.tick_params(axis='x', labelrotation=45)

    # Right panel: share of 'High' rows for every class pairing, computed over
    # the full dataset rather than only the high-severity subset
    severity_by_class = (
        df.groupby(['drug1_class', 'drug2_class'])['severity']
        .apply(_high_share)
        .reset_index()
    )
    pivot_severity = severity_by_class.pivot(
        index='drug1_class', columns='drug2_class', values='severity'
    )
    sns.heatmap(pivot_severity, annot=True, cmap='Reds', fmt='.2f', ax=ax_class_rates)
    ax_class_rates.set_title('High-Risk Rate by Drug Class Combination')

    fig.tight_layout()
    plt.show()

## 7. Data Insights Summary

In [None]:
# Generate summary insights
# Prints a plain-text digest of the dataset: size, interaction/severity rates,
# most frequent drugs and classes, high-risk highlights, data quality, and a
# rough readiness check for modelling.
# Relies on `df`, `high_risk`, and (only when high-risk rows exist)
# `high_risk_drugs` created by earlier cells.
print("DATA INSIGHTS SUMMARY")
print("=" * 50)

# Aggregates reused by several sections are computed once up front.
drug_names = pd.concat([df['drug1'], df['drug2']])
class_names = pd.concat([df['drug1_class'], df['drug2_class']])
all_drugs = drug_names.value_counts()
all_classes = class_names.value_counts()
interaction_rate = df['interaction'].mean()
severity_levels = df['severity']
missing_cells = df.isnull().sum().sum()
duplicate_rows = df.duplicated().sum()
total_cells = df.shape[0] * df.shape[1]
# An empty frame has no cells to be complete or incomplete; report NaN rather
# than dividing by zero.
completeness = (
    (total_cells - missing_cells) / total_cells * 100 if total_cells else float('nan')
)

print("📊 Dataset Overview:")
print(f"   • Total drug combinations: {len(df):,}")
print(f"   • Unique drugs: {drug_names.nunique()}")
print(f"   • Drug classes: {class_names.nunique()}")

print("\n⚠️  Interaction Statistics:")
print(f"   • Overall interaction rate: {interaction_rate:.1%}")
print(f"   • High severity rate: {(severity_levels == 'High').mean():.1%}")
print(f"   • Moderate severity rate: {(severity_levels == 'Moderate').mean():.1%}")
print(f"   • Low severity rate: {(severity_levels == 'Low').mean():.1%}")

print("\n🏥 Most Common Drugs:")
for rank, (drug, count) in enumerate(all_drugs.head(5).items(), start=1):
    print(f"   {rank}. {drug}: {count} combinations")

print("\n💊 Drug Class Insights:")
for drug_class, count in all_classes.items():
    print(f"   • {drug_class}: {count} occurrences")

# `high_risk_drugs` only exists when the earlier cell found high-risk rows, so
# this section is guarded by the same condition.
if len(high_risk) > 0:
    print("\n🚨 High-Risk Combinations:")
    print(f"   • Total high-risk pairs: {len(high_risk)}")
    most_dangerous = high_risk_drugs.head(3)
    for drug, count in most_dangerous.items():
        print(f"   • {drug}: involved in {count} high-risk interactions")

print("\n📈 Data Quality:")
print(f"   • Missing values: {missing_cells}")
print(f"   • Duplicate rows: {duplicate_rows}")
print(f"   • Data completeness: {completeness:.1f}%")

# Heuristic thresholds: target share within 30-70%, at least 500 rows, and more
# than 10 distinct values summed across the object-typed columns.
print("\n✅ Ready for Machine Learning:")
print(f"   • Balanced dataset: {'Yes' if 0.3 <= interaction_rate <= 0.7 else 'No (may need balancing)'}")
print(f"   • Sufficient samples: {'Yes' if len(df) >= 500 else 'No (consider more data)'}")
print(f"   • Feature diversity: {'Good' if df.select_dtypes(include=['object']).nunique().sum() > 10 else 'Limited'}")

## 8. Recommendations for Model Development

In [None]:
# Print modelling recommendations: one data-driven note about target balance,
# followed by fixed advice grouped into titled sections.
print("MODEL DEVELOPMENT RECOMMENDATIONS")
print("=" * 50)

interaction_rate = df['interaction'].mean()

print("🎯 Target Variable Analysis:")
if pd.isna(interaction_rate):
    # The mean is NaN when there are no usable target values; every comparison
    # below would then be False and the data would be reported as well-balanced.
    print("   • Interaction rate unavailable - dataset is empty or target is missing")
elif interaction_rate < 0.3:
    print("   • Class imbalance detected - consider SMOTE or class weighting")
elif interaction_rate > 0.7:
    print("   • High interaction rate - validate data quality")
else:
    print("   • Well-balanced target variable")

# Static advice, kept as (heading, bullet list) pairs so headings and bullets
# are formatted in one place.
recommendation_sections = [
    ("🔧 Feature Engineering Suggestions:", [
        "Create drug pair combinations as features",
        "Consider drug class interaction features",
        "Add drug frequency features",
        "Create severity ordinal encoding",
    ]),
    ("🤖 Model Selection Recommendations:", [
        "Logistic Regression: Good baseline, interpretable",
        "Random Forest: Handle feature interactions well",
        "Decision Tree: Interpretable rules for medical context",
        "Naive Bayes: Good for categorical features",
    ]),
    ("📊 Evaluation Strategy:", [
        "Use stratified train-test split",
        "Focus on Precision and Recall for medical context",
        "Consider F1-score for balanced evaluation",
        "Analyze confusion matrix for error patterns",
    ]),
    ("⚠️  Important Considerations:", [
        "This is educational data - not for clinical use",
        "Validate model interpretability",
        "Consider feature importance analysis",
        "Document all assumptions and limitations",
    ]),
]

for heading, bullets in recommendation_sections:
    # Each section is separated from the previous one by a blank line.
    print(f"\n{heading}")
    for bullet in bullets:
        print(f"   • {bullet}")