In [1]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import matplotlib.pyplot as plt
import scanpy as sc
import seaborn as sns
adata = sc.read("/mnt/c/Users/sneha/BrainCellAtlas/data/human_mtg_brain_atlas_final.h5ad")

In [4]:
import scanpy as sc
import numpy as np

# Load your data
#adata = sc.read_h5ad('/mnt/c/Users/sneha/BrainCellAtlas/data/processed/human_mtg_brain_atlas.h5ad')

# Check for NaN values
print("Checking for NaN values...")
print(f"NaN in X: {np.isnan(adata.X.data).any()}")
print(f"NaN in PCA: {np.isnan(adata.obsm['X_pca']).any()}")
print(f"Number of NaN in PCA: {np.isnan(adata.obsm['X_pca']).sum()}")

# Check what preprocessing was done
print("\nChecking adata layers:")
print(adata.layers.keys())

print("\nChecking adata.uns (preprocessing info):")
print(adata.uns.keys())

Checking for NaN values...
NaN in X: False
NaN in PCA: False
Number of NaN in PCA: 0

Checking adata layers:
KeysView(Layers with keys: )

Checking adata.uns (preprocessing info):
dict_keys(['cell_type_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'rank_genes_groups', 'umap'])


In [None]:
# Check for NaN in labels
print("Checking y (labels):")
print(f"NaN in y: {y.isna().any()}")
print(f"Number of NaN in y: {y.isna().sum()}")
print(f"Total samples: {len(y)}")
print(f"Unique cell types: {y.unique()}")

# Remove rows with NaN labels
mask = ~y.isna()
x_clean = x[mask]
y_clean = y[mask]

print(f"\nAfter removing NaN:")
print(f"Samples remaining: {len(y_clean)}")

# Now do train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x_clean, y_clean, 
    test_size=0.2, 
    stratify=y_clean, 
    random_state=42
)

In [2]:
x = adata.obsm["X_pca"][:,:50]  # Use first 50 PCAs
y = adata.obs["cell_type"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

ValueError: Input contains NaN

In [None]:
print(adata.obs.columns)


In [None]:
clf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))


In [None]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=clf.classes_, yticklabels=clf.classes_)

plt.title('Cell type Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.savefig('../results/figures/cell_type_confusion_matrix.pdf')

In [None]:
#importance of the features and which principal components related to which genes are more important'
importances = clf.feature_importances_
plt.figure(figsize=(10, 6))
plt.bar(range(len(importances)), importances)
plt.xlabel('Principal Component')
plt.ylabel('Importance')
plt.title('Feature Importance of Principal Components in Cell Type Classification')
plt.tight_layout()
plt.savefig('../results/figures/pc_feature_importance.pdf')
joblib.dump(clf, '../results/models/cell_type_classifier.pkl')  