In [1]:
!pip install xgboost==1.7.6




In [2]:
# Install dependencies in Colab


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
import xgboost as xgb

# === Load dataset ===
# Replace with correct path to your file (upload to Colab first)
df = pd.read_csv("50_genes.csv")

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
print("m6A rate:", df['label'].mean())

# === Features & labels ===
# Exclude metadata columns
X = df.drop(columns=['gene_id','transcript_id','transcript_position','sequence','label'])
y = df['label']

# === Train/val/test split ===
# You can adjust test_size; here 20% test, 20% validation
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test   = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)

# === Handle imbalance ===
# Compute scale_pos_weight for XGBoost (negative/positive ratio)
pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = neg / pos
print("scale_pos_weight:", scale_pos_weight)



Dataset shape: (128461, 14)
Columns: ['gene_id', 'transcript_id', 'transcript_position', 'sequence', 'label', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9']
m6A rate: 0.029051618779240392
Train: (77076, 9) Val: (25692, 9) Test: (25693, 9)
scale_pos_weight: 33.42429656096471


In [5]:
import xgboost as xgb

model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    use_label_encoder=False,   # deprecated but harmless
    scale_pos_weight=scale_pos_weight,
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# Train with early stopping
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=20,
    verbose=True
)

print("✅ Training finished. Model ready.")




[0]	validation_0-auc:0.72164
[1]	validation_0-auc:0.73440
[2]	validation_0-auc:0.75903
[3]	validation_0-auc:0.76106
[4]	validation_0-auc:0.77069
[5]	validation_0-auc:0.77703
[6]	validation_0-auc:0.77480
[7]	validation_0-auc:0.77578
[8]	validation_0-auc:0.78151
[9]	validation_0-auc:0.78221
[10]	validation_0-auc:0.78306
[11]	validation_0-auc:0.78408
[12]	validation_0-auc:0.78533
[13]	validation_0-auc:0.78610
[14]	validation_0-auc:0.78606
[15]	validation_0-auc:0.78725
[16]	validation_0-auc:0.78772
[17]	validation_0-auc:0.78915
[18]	validation_0-auc:0.79045
[19]	validation_0-auc:0.79047
[20]	validation_0-auc:0.79016
[21]	validation_0-auc:0.79013
[22]	validation_0-auc:0.79001
[23]	validation_0-auc:0.78999
[24]	validation_0-auc:0.79140
[25]	validation_0-auc:0.79236
[26]	validation_0-auc:0.79272
[27]	validation_0-auc:0.79332
[28]	validation_0-auc:0.79332
[29]	validation_0-auc:0.79275
[30]	validation_0-auc:0.79266
[31]	validation_0-auc:0.79224
[32]	validation_0-auc:0.79270
[33]	validation_0-au

In [6]:
# === Evaluate model ===
y_pred_prob = model.predict_proba(X_test)[:,1]

roc = roc_auc_score(y_test, y_pred_prob)
pr  = average_precision_score(y_test, y_pred_prob)

print("Test ROC AUC:", roc)
print("Test PR AUC:", pr)

# === Save predictions in required format ===
output = df.loc[y_test.index, ['transcript_id','transcript_position']].copy()
output['score'] = y_pred_prob

output.to_csv("xgb_predictions.csv", index=False)
print("Saved predictions to xgb_predictions.csv")

output.head()

Test ROC AUC: 0.7948975141057026
Test PR AUC: 0.1383150697027917
Saved predictions to xgb_predictions.csv


Unnamed: 0,transcript_id,transcript_position,score
111150,ENST00000601697,1098,0.334869
94907,ENST00000549920,403,0.483256
28339,ENST00000009589,324,0.150741
71625,ENST00000044462,816,0.327024
61357,ENST00000012443,889,0.359495
