In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE



In [2]:
# Load the 50-gene table from disk and echo its first rows as a quick sanity check.
df = pd.read_csv('50_genes.csv')
print("DataFrame from 50_genes.csv:", df.head(), sep="\n")



DataFrame from 50_genes.csv:
           gene_id    transcript_id  transcript_position sequence  label  \
0  ENSG00000004059  ENST00000000233                  244  AAGACCA      0   
1  ENSG00000004059  ENST00000000233                  244  AAGACCA      0   
2  ENSG00000004059  ENST00000000233                  244  AAGACCA      0   
3  ENSG00000004059  ENST00000000233                  244  AAGACCA      0   
4  ENSG00000004059  ENST00000000233                  244  AAGACCA      0   

   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0    0.00299       2.06      125.0    0.01770      10.40      122.0   
1    0.00631       2.53      125.0    0.00844       4.67      126.0   
2    0.00465       3.92      109.0    0.01360      12.00      124.0   
3    0.00398       2.06      125.0    0.00830       5.01      130.0   
4    0.00664       2.92      120.0    0.00266       3.94      129.0   

   feature_7  feature_8  feature_9  
0    0.00930      10.90       84.1  
1    0.01030 

In [11]:
# Prepare X and y for the random forest classifier.
# Keep only rows that carry a label; rows without one are excluded from X and y.
df_labeled = df[df['label'].notnull()]

# Print initial class distribution (labeled rows, before de-duplication)
print("\nInitial class distribution:")
print(df_labeled['label'].value_counts(normalize=True))

# Keep a single row per (transcript_id, transcript_position): the first one encountered.
# NOTE(review): the preview of df shows several rows sharing one
# (transcript_id, transcript_position) with different feature values; keep='first'
# discards all but one of them -- confirm this is intended rather than aggregating.
df_valid = df_labeled.drop_duplicates(subset=['transcript_id', 'transcript_position'], keep='first')

# Count against the labeled frame rather than the raw one, so rows that were dropped
# for a missing label are not reported as removed duplicates.
print(f"\nNumber of duplicate (transcript_id, transcript_position) pairs removed: {len(df_labeled) - len(df_valid)}")

# Feature matrix (feature_1 .. feature_9) and target vector
X = df_valid[[f'feature_{i+1}' for i in range(9)]]
y = df_valid['label']

# Size the training partition: cap it at 10000 rows, and fall back to 80% of the
# de-duplicated frame when that is the smaller number.
total_rows = df_valid.shape[0]
train_size = min(int(0.8 * total_rows), 10000)

# Stratified split on y with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_size,
    stratify=y,
    random_state=42,
)

# Remember which df_valid rows landed in the test partition.
test_indices = X_test.index

# Oversample the training partition with SMOTE when it holds at least two
# label == 1 samples; otherwise pass the partition through unchanged.
minority_count_train = int((y_train == 1).sum())
if minority_count_train >= 2:
    # k_neighbors is capped at (minority samples - 1), with an upper bound of 5.
    smote = SMOTE(random_state=42, k_neighbors=min(5, minority_count_train - 1))
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
else:
    print("Warning: Too few minority class samples for SMOTE in training set. Using original training data.")
    X_train_balanced, y_train_balanced = X_train, y_train

# Same treatment for the test partition.
# NOTE(review): resampling the test partition adds synthetic rows to the data the
# model is evaluated on; metrics computed on it may not reflect real-world performance.
minority_count_test = int((y_test == 1).sum())
if minority_count_test >= 2:
    smote = SMOTE(random_state=42, k_neighbors=min(5, minority_count_test - 1))
    X_test_balanced, y_test_balanced = smote.fit_resample(X_test, y_test)
else:
    print("Warning: Too few minority class samples for SMOTE in test set. Using original test data.")
    X_test_balanced, y_test_balanced = X_test, y_test

# Report the resulting label proportions for both partitions.
for header, balanced_labels in (
    ("\nClass distribution in training set (after SMOTE):", y_train_balanced),
    ("Class distribution in test set (after SMOTE):", y_test_balanced),
):
    print(header)
    print(pd.Series(balanced_labels).value_counts(normalize=True))

# Create and train the random forest classifier on the (possibly SMOTE-balanced) training data
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_balanced, y_train_balanced)

# Locate the predict_proba column that holds the probability of label 1.
# predict_proba orders its columns like clf.classes_, so look the label up there;
# if the model only ever saw a single class, fall back to that one column (index 0).
# Defining class_index here also removes the dependency on a variable that was
# never assigned in this notebook (NameError on a fresh kernel).
class_labels = list(clf.classes_)
class_index = class_labels.index(1) if 1 in class_labels else 0

# Scores on the balanced test set (may include synthetic SMOTE rows)
scores_balanced = clf.predict_proba(X_test_balanced)[:, class_index]

# For output_scores.csv, use only original test samples to avoid synthetic data.
# X_test and df_valid.loc[test_indices] share the same index order, so the score
# array lines up row-for-row with output_df.
scores_original = clf.predict_proba(X_test)[:, class_index]
output_df = df_valid.loc[test_indices, ['transcript_id', 'transcript_position']].copy()
output_df['score'] = scores_original

# Verify no duplicates in output
duplicates = output_df[['transcript_id', 'transcript_position']].duplicated().sum()
print(f"\nNumber of duplicate (transcript_id, transcript_position) pairs in output: {duplicates}")

# Save to CSV
output_df.to_csv('output_scores.csv', index=False)
print("\nOutput saved to 'output_scores.csv'")
print("\nOutput DataFrame:")
print(output_df)

# Evaluate ROC AUC and PR AUC on balanced test set.
# Both the fitted model and the evaluation labels must contain two classes;
# roc_auc_score raises when the true labels hold only one.
model_has_two_classes = len(clf.classes_) > 1
if model_has_two_classes and pd.Series(y_test_balanced).nunique() > 1:
    roc_auc = roc_auc_score(y_test_balanced, scores_balanced)
    precision, recall, _ = precision_recall_curve(y_test_balanced, scores_balanced)
    pr_auc = auc(recall, precision)
    print(f"\nEvaluation Metrics (on balanced test set):")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"PR AUC: {pr_auc:.4f}")
    print("\nNote: Metrics are computed on the balanced test set, which includes synthetic samples and may not reflect real-world performance.")
else:
    print("\nEvaluation Metrics not computed: only one class present")

# Also report the same metrics on the untouched test partition, which contains
# no synthetic rows and keeps the original class imbalance.
if model_has_two_classes and pd.Series(y_test).nunique() > 1:
    roc_auc_original = roc_auc_score(y_test, scores_original)
    precision_original, recall_original, _ = precision_recall_curve(y_test, scores_original)
    pr_auc_original = auc(recall_original, precision_original)
    print("\nEvaluation Metrics (on original test set, no synthetic samples):")
    print(f"ROC AUC: {roc_auc_original:.4f}")
    print(f"PR AUC: {pr_auc_original:.4f}")
else:
    print("\nEvaluation Metrics on original test set not computed: only one class present")

# Sanity report: echo the cleaned frame, the partition shapes, and the fitted model.
print("\nDataFrame for training (first 5 rows):", df_valid.head(), sep="\n")

# Partition shapes
print(f"\nTraining set shape: {X_train_balanced.shape}")
print(f"Test set shape (original): {X_test.shape}")
print(f"Test set shape (balanced): {X_test_balanced.shape}")

# Dataset facts
print(f"Number of unique gene_ids: {df_valid['gene_id'].nunique()}")
print(f"Feature names: {list(X.columns)}")

# Fitted model summary
print("\nRandom Forest Classifier:", clf, sep="\n")
print(f"Number of training samples: {X_train_balanced.shape[0]}")
print(f"Number of features: {X_train_balanced.shape[1]}")
print(f"Classes: {clf.classes_}")


Initial class distribution:
label
0    0.970948
1    0.029052
Name: proportion, dtype: float64

Number of duplicate (transcript_id, transcript_position) pairs removed: 126941

Class distribution in training set (after SMOTE):
label
0    0.5
1    0.5
Name: proportion, dtype: float64
Class distribution in test set (after SMOTE):
label
0    0.5
1    0.5
Name: proportion, dtype: float64

Number of duplicate (transcript_id, transcript_position) pairs in output: 0

Output saved to 'output_scores.csv'

Output DataFrame:
          transcript_id  transcript_position  score
118881  ENST00000175506                 1381   0.19
15345   ENST00000005257                  873   0.29
1497    ENST00000000233                  471   0.01
57386   ENST00000005286                 3462   0.02
116639  ENST00000170447                 2409   0.06
...                 ...                  ...    ...
115846  ENST00000169551                  574   0.50
127684  ENST00000177742                  755   0.05
17464   ENST