In [3]:
!pip install xgboost==1.7.6





[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
import xgboost as xgb

# === Load dataset ===
df = pd.read_csv("C:/Users/user10/Desktop/DSA4262/Project data/50_genes.csv")

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
print("m6A rate:", df['label'].mean())

# === Features & labels ===
X = df.drop(columns=['gene_id','transcript_id','transcript_position','sequence','label'])
y = df['label']

# === Train/val/test split ===
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test   = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)

# === Handle imbalance ===
pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = neg / pos
print("scale_pos_weight:", scale_pos_weight)



Dataset shape: (128461, 14)
Columns: ['gene_id', 'transcript_id', 'transcript_position', 'sequence', 'label', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9']
m6A rate: 0.029051618779240392
Train: (77076, 9) Val: (25692, 9) Test: (25693, 9)
scale_pos_weight: 33.42429656096471


In [6]:
import xgboost as xgb

model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    use_label_encoder=False,   
    scale_pos_weight=scale_pos_weight,
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# Train with early stopping
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=20,
    verbose=True
)

print("Training finished.")




[0]	validation_0-auc:0.75744
[1]	validation_0-auc:0.76976
[2]	validation_0-auc:0.77738
[3]	validation_0-auc:0.78396
[4]	validation_0-auc:0.78418
[5]	validation_0-auc:0.78403
[6]	validation_0-auc:0.78383
[7]	validation_0-auc:0.78474
[8]	validation_0-auc:0.78683
[9]	validation_0-auc:0.78637
[10]	validation_0-auc:0.78647
[11]	validation_0-auc:0.78618
[12]	validation_0-auc:0.78633
[13]	validation_0-auc:0.78719
[14]	validation_0-auc:0.78616
[15]	validation_0-auc:0.78669
[16]	validation_0-auc:0.78639
[17]	validation_0-auc:0.78917
[18]	validation_0-auc:0.78897
[19]	validation_0-auc:0.78887
[20]	validation_0-auc:0.78933
[21]	validation_0-auc:0.78973
[22]	validation_0-auc:0.79139
[23]	validation_0-auc:0.79201
[24]	validation_0-auc:0.79248
[25]	validation_0-auc:0.79248
[26]	validation_0-auc:0.79287
[27]	validation_0-auc:0.79325
[28]	validation_0-auc:0.79496
[29]	validation_0-auc:0.79493
[30]	validation_0-auc:0.79455
[31]	validation_0-auc:0.79392
[32]	validation_0-auc:0.79435
[33]	validation_0-au

In [7]:
# === Evaluate model ===
y_pred_prob = model.predict_proba(X_test)[:,1]

roc = roc_auc_score(y_test, y_pred_prob)
pr  = average_precision_score(y_test, y_pred_prob)

print("Test ROC AUC:", roc)
print("Test PR AUC:", pr)

# Save the results
output = df.loc[y_test.index, ['transcript_id','transcript_position']].copy()
output['score'] = y_pred_prob

output.to_csv("xgb_predictions.csv", index=False)
print("Saved predictions to xgb_predictions.csv")

output.head()

Test ROC AUC: 0.7903206132743379
Test PR AUC: 0.13412537775568553
Saved predictions to xgb_predictions.csv


Unnamed: 0,transcript_id,transcript_position,score
111150,ENST00000601697,1098,0.397032
94907,ENST00000549920,403,0.578367
28339,ENST00000009589,324,0.137155
71625,ENST00000044462,816,0.522233
61357,ENST00000012443,889,0.297478
