In [182]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import pandas as pd
import numpy as np


import warnings
warnings.filterwarnings('ignore')

In [268]:
# Read the three input tables in one pass: training cells, test cells,
# and the per-contact records that features are later derived from.
train, test, contact = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_data_2000.csv',
        'data/test_data_2000.csv',
        'data/contact_data_2000.csv',
    )
)

In [269]:
# Preview the first rows of the training table (cell id plus the label columns).
train.head()

Unnamed: 0,cellid,phase,order_within_phase,order
0,SCG0088_CTATGAGGTACCGGAT-1,G1,0,0
1,SCG0088_GCTAAGCGTATTGGTG-1,G1,0,0
2,SCG0089_TCCATTGTCTGTAAGC-1,G1,0,0
3,SCG0092_GTTTATCTCATGCTAA-1,G1,0,0
4,SCG0092_AACCGCTCAGCTCATA-1,G1,0,0


In [270]:
# Distinct values of the global `order` label in train.
# NOTE(review): the output shown for this cell skips 20 and 31 — confirm that is expected.
train["order"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43])

In [271]:
# Distinct values of the `order_within_phase` label in train (0-19 in the output shown).
train["order_within_phase"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [272]:
# Preview the test table; in the rows displayed the label columns
# (phase, order_within_phase, order) are empty.
test.head()

Unnamed: 0,cellid,phase,order_within_phase,order
0,SCG0089_CTTATGTTCCCGCCTA-1,,,
1,SCG0093_AGAACCGCACAGCCAT-1,,,
2,SCG0090_CCAGGATGTGCTCACC-1,,,
3,SCG0089_TACCTTTAGCACTTGG-1,,,
4,SCG0092_TGATCAGGTTTGAGGC-1,,,


In [273]:
# Preview the contact table: each row holds two intervals
# (chr1/start1/end1 and chr2/start2/end2) and the cellid it belongs to.
contact.head()

Unnamed: 0,chr1,start1,end1,chr2,start2,end2,cellid
0,chr13-M,74316813,74316959,chr13-M,72727004,72727154,SCG0088_TTTAACCTCAGCCAAT-1
1,chr1-M,79322530,79322563,chr12-M,4538118,4538268,SCG0088_TATAGGTGTCCCGGAA-1
2,chr2-M,75633331,75633491,chr8-P,125695812,125695962,SCG0088_CGTTAACAGTACCGCA-1
3,chr7-P,136324163,136324313,chr7-P,136352442,136352592,SCG0088_TTTAACCTCAGCCAAT-1
4,chr6-M,49253365,49253515,chr6-M,49323546,49323680,SCG0088_CGTTAACAGTACCGCA-1


In [274]:
# Keep the row count of the contact table around and show its (rows, columns) shape.
total_contacts = len(contact)
print(contact.shape)

(5679074, 7)


## Feature Engineering

In [None]:
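# BIG MISTAKE -- everything below in this cell is intentionally left disabled.
# I can't group by `order`, `phase` or `order_within_phase`: these are the label
# columns (they are empty in `test`), so features aggregated from them are unusable.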


# # order
# agg_stats_order = train.groupby("phase")["order"].agg(
#     phase_order_te_mean="mean",
#     phase_order_te_median="median",
#     phase_order_te_min="min",
#     phase_order_te_max="max",
#     phase_order_te_std="std",
#     phase_order_te_count="size"
# ).reset_index()

# # order_within_phase
# agg_stats_order_wp = train.groupby("phase")["order_within_phase"].agg(
#     phase_order_wp_te_mean="mean",
#     phase_order_wp_te_median="median",
#     phase_order_wp_te_min="min",
#     phase_order_wp_te_max="max",
#     phase_order_wp_te_std="std",
#     phase_order_wp_te_count="size"
# ).reset_index()

# agg_stats_order = train.groupby("cellid")["order"].agg(
#     cellid_order_mean="mean",
#     cellid_order_median="median",
#     cellid_order_min="min",
#     cellid_order_max="max",
#     cellid_order_std="std",
#     cellid_order_count="size"
# ).reset_index()

# agg_stats_order_wp = train.groupby("cellid")["order_within_phase"].agg(
#     cellid_order_wp_mean="mean",
#     cellid_order_wp_median="median",
#     cellid_order_wp_min="min",
#     cellid_order_wp_max="max",
#     cellid_order_wp_std="std",
#     cellid_order_wp_count="size"
# ).reset_index()

In [None]:
# train = train.merge(agg_stats_order, on="cellid", how="left")
# train = train.merge(agg_stats_order_wp, on="cellid", how="left")

In [275]:
# Signed span of each interval, computed as start minus end for both sides of the contact.
# NOTE(review): the medians displayed later are negative, so start < end in this data;
# if a positive length is intended the operands should be swapped — confirm.
contact["distance_1"] = contact["start1"].sub(contact["end1"])
contact["distance_2"] = contact["start2"].sub(contact["end2"])

In [276]:
# Per-cell contact summary, indexed by `cellid`:
#   total_contacts        number of contact rows with a non-null chr1
#   median_distance_1/2   median of the distance columns built in the previous cell
#   intra_contacts        rows where chr1 == chr2
#   inter_contacts        rows where chr1 != chr2
#   ratio_intra/inter     the two counts divided by total_contacts
#
# The intra/inter flags are computed once for the whole table and aggregated with
# the built-in "sum", instead of a Python lambda that re-indexes `contact` with
# `.loc` for every cell group. The resulting columns, order and values are unchanged.
contact_features = (
    contact[["cellid", "chr1", "chr2", "distance_1", "distance_2"]]
    .assign(
        # 1 when both ends fall on the same chromosome label, else 0.
        is_intra=lambda df: (df["chr1"] == df["chr2"]).astype("int64"),
        # Exact complement, matching the element-wise `!=` comparison.
        is_inter=lambda df: 1 - df["is_intra"],
    )
    .groupby("cellid")
    .agg(
        total_contacts=("chr1", "count"),
        median_distance_1=("distance_1", "median"),
        median_distance_2=("distance_2", "median"),
        intra_contacts=("is_intra", "sum"),
        inter_contacts=("is_inter", "sum"),
    )
    .assign(
        ratio_intra=lambda df: df["intra_contacts"] / df["total_contacts"],
        ratio_inter=lambda df: df["inter_contacts"] / df["total_contacts"],
    )
    # Any missing aggregate (e.g. a 0/0 ratio) is reported as 0.
    .fillna(0)
)

In [277]:
# Attach the per-cell contact features to the training rows (left join on cellid).
# Feature columns left over from a previous execution of this cell are dropped first,
# so re-running it does not produce duplicated `_x` / `_y` columns; on a first run
# the drop is a no-op.
train = (
    train
    .drop(columns=contact_features.columns, errors="ignore")
    .merge(contact_features, on="cellid", how="left")
)

In [278]:
# Check that the contact features were joined onto the training rows.
train.head()

Unnamed: 0,cellid,phase,order_within_phase,order,total_contacts,median_distance_1,median_distance_2,intra_contacts,inter_contacts,ratio_intra,ratio_inter
0,SCG0088_CTATGAGGTACCGGAT-1,G1,0,0,2544,-85.0,-150.0,2272,272,0.893082,0.106918
1,SCG0088_GCTAAGCGTATTGGTG-1,G1,0,0,2245,-55.0,-149.0,1981,264,0.882405,0.117595
2,SCG0089_TCCATTGTCTGTAAGC-1,G1,0,0,2038,-59.0,-148.0,1798,240,0.882237,0.117763
3,SCG0092_GTTTATCTCATGCTAA-1,G1,0,0,4164,-63.0,-150.0,3733,431,0.896494,0.103506
4,SCG0092_AACCGCTCAGCTCATA-1,G1,0,0,2451,-60.0,-147.0,2086,365,0.851081,0.148919


## Model

In [279]:
# Feature matrix: everything except the cell identifier and the three label columns.
X = train.drop(columns=["cellid", "phase", "order", "order_within_phase"])
# Target: the phase label of each cell.
y = train["phase"]

In [280]:
# Fit the encoder on the phase labels, then map them to integer class ids.
# `le` is kept so the ids can be mapped back with `le.inverse_transform`.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [281]:
# Quick sanity check: first few encoded targets and the feature columns fed to the model.
feature_names = list(X.columns)
print("Encoded y:", y_encoded[:5])
print("X features:", feature_names)

Encoded y: [0 0 0 0 0]
X features: ['total_contacts', 'median_distance_1', 'median_distance_2', 'intra_contacts', 'inter_contacts', 'ratio_intra', 'ratio_inter']


In [288]:
# LightGBM settings shared by every fold.
params = dict(
    objective='multiclass',
    num_class=3,             # 3 phases
    metric='multi_logloss',  # multiclass-compatible metric
    boosting_type='gbdt',
    max_depth=4,
    n_jobs=-1,
    verbose=-1,
)

In [283]:
# Distinct phase labels in the target; the output shows three, matching `num_class` in `params`.
y.unique()

array(['G1', 'S', 'G2M'], dtype=object)

In [284]:
# 5-fold stratified splitter; shuffling with a fixed seed keeps the folds reproducible.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [285]:
# Collects one validation accuracy per cross-validation fold.
accuracy_scores = []

In [None]:
# Stratified cross-validation of the LightGBM classifier.
#
# The score list is reset here so that re-running this cell does not append a
# second set of fold scores to the previous ones and skew the mean computed later.
accuracy_scores = []

for train_idx, val_idx in folds.split(X, y_encoded):
    # X is a DataFrame (positional .iloc); y_encoded is indexed directly by position.
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    # Train LightGBM model; boosting stops early once the validation log-loss
    # has not improved for 50 rounds.
    # NOTE(review): the same fold drives early stopping and is then scored below,
    # so the reported accuracy may be slightly optimistic.
    model = lgb.LGBMClassifier(**params, n_estimators=100)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_logloss',
        callbacks=[
            lgb.early_stopping(stopping_rounds=50,
                               verbose=False),
        ]
    )

    # Predict on validation set using the best iteration found by early stopping
    y_pred = model.predict(X_val, num_iteration=model.best_iteration_)

    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_pred)
    accuracy_scores.append(accuracy)
    print(f"Fold accuracy: {accuracy:.4f}")

Fold accuracy: 0.5781
Fold accuracy: 0.5900
Fold accuracy: 0.5933
Fold accuracy: 0.5867
Fold accuracy: 0.6067


In [None]:
# Average accuracy across folds
mean_accuracy = np.mean(accuracy_scores)
print(f"Mean cross-validated accuracy: {mean_accuracy:.4f}")

Mean cross-validated accuracy: 0.5909
