# 02 - Molecular Descriptor Calculation

**TB Drug Discovery ML Pipeline - Phase 1**

This notebook covers:
1. Loading cleaned compound data
2. Calculating RDKit molecular descriptors
3. Lipinski Rule of 5 analysis
4. Feature statistics and visualization
5. Saving the descriptor dataset to `data/processed/descriptors.csv`

**Descriptors calculated:**
- Lipinski: MolWt, LogP, TPSA, HBD, HBA
- Topological: RotatableBonds, RingCount, AromaticRings, etc.
- Extended: LabuteASA, BalabanJ, Chi values, etc.

In [None]:
# Imports
import sys
from pathlib import Path

# Put the sibling `src` directory at the front of the module search path.
# The path is derived from the current working directory, so this only works
# when the kernel is started from a folder that sits next to `src`.
sys.path.insert(0, str(Path.cwd().parent / "src"))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Project-local module; presumably resolved through the sys.path entry above.
from data.descriptor_calculator import DescriptorCalculator

# NOTE(review): the 'seaborn-v0_8-*' style names are only shipped with newer
# matplotlib releases (3.6+) — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
print("Imports successful!")

## 1. Load Cleaned Data

In [None]:
# Load preprocessed data
data_path = Path.cwd().parent / "data" / "processed" / "cleaned_chembl_inhA.csv"

# Fail fast: every later cell needs `df`, so a missing input file should stop
# the run here with a clear message instead of surfacing as a NameError later.
if not data_path.exists():
    raise FileNotFoundError(
        f"Cleaned dataset not found at {data_path}. "
        "Run the data preprocessing step first."
    )

df = pd.read_csv(data_path)
print(f"Loaded {len(df)} compounds")

# The preview must be the last top-level expression of the cell: Jupyter only
# renders that one, so a `df.head()` nested inside an `if` is never displayed.
df.head()

## 2. Calculate Descriptors

In [None]:
# Build the calculator with all three descriptor groups switched on.
calculator = DescriptorCalculator(lipinski=True, topological=True, extended=True)

# Report how many descriptors will be produced, followed by their names.
print(
    f"Calculating {len(calculator.descriptor_names)} descriptors:",
    calculator.descriptor_names,
    sep="\n",
)

In [None]:
# Run the calculator over the whole table, reading structures from the
# 'smiles' column.
# NOTE(review): presumably returns `df` with one extra column per descriptor —
# confirm against DescriptorCalculator.calculate_from_dataframe.
df_with_desc = calculator.calculate_from_dataframe(
    df,
    smiles_col="smiles",
)

print(f"\nDataset shape: {df_with_desc.shape}")

# Preview of the first rows (rendered as the cell output).
df_with_desc.head()

## 3. Descriptor Statistics

In [None]:
# Descriptor column names, reused by the cells below.
desc_cols = calculator.descriptor_names

# Summary statistics, transposed so each descriptor is a row.
df_with_desc[desc_cols].describe().transpose()

In [None]:
# Count NaN entries in every descriptor column.
missing = df_with_desc[desc_cols].isna().sum()

print("Missing values per descriptor:")
if missing.sum() > 0:
    # Only list the descriptors that actually have gaps.
    print(missing[missing > 0])
else:
    print("No missing values!")

## 4. Lipinski Rule of 5 Analysis

In [None]:
# Count Lipinski Rule-of-5 violations per compound.
# A rule is violated when the descriptor is strictly greater than its cutoff.
lipinski_cutoffs = {'MolWt': 500, 'LogP': 5, 'HBD': 5, 'HBA': 10}

# Comparisons against NaN evaluate to False, so a compound with a missing
# descriptor is counted as having no violation for that rule.
lipinski_violations = sum(
    (df_with_desc[column] > cutoff).astype(int)
    for column, cutoff in lipinski_cutoffs.items()
)

# Stored on the frame, so this column is also written out when the frame is saved.
df_with_desc['lipinski_violations'] = lipinski_violations

print("Lipinski violations distribution:")
print(lipinski_violations.value_counts().sort_index())

# A compound is treated as drug-like when it breaks at most one rule.
drug_like = (lipinski_violations <= 1).sum()
n_compounds = len(df_with_desc)

if n_compounds:
    print(f"\nDrug-like compounds (≤1 violation): {drug_like}/{n_compounds} ({100*drug_like/n_compounds:.1f}%)")
else:
    # Avoid a ZeroDivisionError when the dataset is empty.
    print("\nDrug-like compounds (≤1 violation): 0/0 (dataset is empty)")

## 5. Visualization

In [None]:
# Lipinski property distributions: one histogram per property, with the
# Rule-of-5 cutoff drawn as a dashed red line.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

properties = ['MolWt', 'LogP', 'HBD', 'HBA']
thresholds = [500, 5, 5, 10]
titles = ['Molecular Weight', 'LogP', 'H-Bond Donors', 'H-Bond Acceptors']

for ax, prop, thresh, title in zip(axes.flat, properties, thresholds, titles):
    ax.hist(df_with_desc[prop], bins=30, edgecolor='black', alpha=0.7)
    ax.axvline(x=thresh, color='red', linestyle='--', label=f'Lipinski cutoff ({thresh})')
    ax.set_xlabel(prop)
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.legend()

fig.tight_layout()

# savefig does not create missing folders; make sure the output directory
# exists so the cell also works on a fresh checkout.
figures_dir = Path.cwd().parent / 'results' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(figures_dir / 'lipinski_distributions.png', dpi=150)
plt.show()

In [None]:
# Correlation heatmap
fig, ax = plt.subplots(figsize=(14, 12))

# Pairwise correlation between the descriptor columns (pandas default method).
corr_matrix = df_with_desc[desc_cols].corr()

sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, ax=ax)
ax.set_title('Descriptor Correlation Matrix')
fig.tight_layout()

# savefig does not create missing folders; make sure the output directory
# exists so the cell also works on a fresh checkout.
figures_dir = Path.cwd().parent / 'results' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(figures_dir / 'descriptor_correlation.png', dpi=150)
plt.show()

In [None]:
# Descriptor vs Activity (pIC50): one scatter panel per selected descriptor.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# NOTE(review): these names must match columns produced by the calculator
# (see calculator.descriptor_names); a mismatch raises KeyError below — confirm.
key_descriptors = ['MolWt', 'LogP', 'TPSA', 'NumRotatableBonds', 'NumAromaticRings', 'HBA']

for ax, desc in zip(axes.flat, key_descriptors):
    ax.scatter(df_with_desc[desc], df_with_desc['pIC50'], alpha=0.5, s=20)
    ax.set_xlabel(desc)
    ax.set_ylabel('pIC50')
    ax.set_title(f'{desc} vs pIC50')

fig.tight_layout()

# savefig does not create missing folders; make sure the output directory
# exists so the cell also works on a fresh checkout.
figures_dir = Path.cwd().parent / 'results' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(figures_dir / 'descriptor_vs_activity.png', dpi=150)
plt.show()

## 6. Save Descriptors

In [None]:
# Write the full table (original columns plus everything added in this
# notebook) to CSV without the index.
output_path = Path.cwd().parent.joinpath("data", "processed", "descriptors.csv")
df_with_desc.to_csv(output_path, index=False)

# Confirm where the file went and how large the table is.
print(
    f"Saved descriptors to: {output_path}",
    f"Shape: {df_with_desc.shape}",
    sep="\n",
)

## Summary

### Results:
- **Descriptors calculated:** see the descriptor count printed in Section 2 (Calculate Descriptors)
- **Drug-like compounds:** see the percentage printed in Section 4 (Lipinski Rule of 5 Analysis)

### Observations:
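- **TODO:** record any strong correlations between descriptors (see the correlation heatmap in Section 5)
- **TODO:** record any relationships between descriptors and activity (see the descriptor vs. pIC50 scatter plots in Section 5)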

### Next Steps:
→ Proceed to **03_qsar_training.ipynb** for QSAR model training