# Classical Baseline: SVM with k-mer Features

## Load HG38 Dataset

In [38]:
import os
from google.colab import drive
drive.mount('/content/drive')
PROJECT_DIR = "/content/drive/MyDrive/bioproj01"
DATA_DIR = os.path.join(PROJECT_DIR, "data")
print(f"Data directory found: {PROJECT_DIR}\nContents: {os.listdir(PROJECT_DIR)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data directory found: /content/drive/MyDrive/bioproj01
Contents: ['data', 'results']


In [39]:
import pandas as pd

In [40]:
hg38_df = pd.read_csv(
    f"{DATA_DIR}/hg38/human_promoter_vs_nonpromoter_10k_400bp.csv"
)

hg38_df = hg38_df[["sequence", "label"]]
hg38_df.head()

Unnamed: 0,sequence,label
0,TGAACCCCGGGAGGCAAGGGCTGCCATGGCAGGGGTGGGGTTTCAT...,0
1,GGCCCAGCTCTGACGCCAGGCTGTCTTGCCTCTGCTCACCTGCAGC...,1
2,TCATGCCTGGCCAGCAAAATTGTTTTTTAAAAGTTTATGCTACTAA...,1
3,AAGTTAAATAAATCAGGGTTTTCACCTGGTTCTTTAAGATCTGTTG...,0
4,AATGGAAGAAGCCAAAATTTTGCAGAACAAGAGAATATGCAAGAGA...,0


## k-mer Feature Extraction

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
def kmer_tokenizer(sequence, k):
    return [sequence[i:i+k] for i in range(len(sequence) - k + 1)]

k = 6

vectorizer = CountVectorizer(
    tokenizer=lambda x: kmer_tokenizer(x, k),
    token_pattern=None,
    lowercase=False
)

X = vectorizer.fit_transform(hg38_df["sequence"])
y = hg38_df["label"].values

print("Feature matrix shape:", X.shape)

Feature matrix shape: (19994, 4096)


## Explicit Train/Test Split

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print(X_train.shape, X_test.shape)

(15995, 4096) (3999, 4096)


## Train SVM (Linear Kernel)

In [45]:
from sklearn.svm import LinearSVC

In [46]:
svm_model = LinearSVC(
    C=1.0,
    class_weight="balanced",
    random_state=42
)

## Cross-Validation Evaluation

In [47]:
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold, cross_validate

In [48]:
scoring = {
    "accuracy": "accuracy",
    "f1": "f1",
    "roc_auc": "roc_auc",
    "mcc": make_scorer(matthews_corrcoef)
}

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

cv_results = cross_validate(
    svm_model,
    X,
    y,
    cv=cv,
    scoring=scoring,
    n_jobs=-1
)

## Cross Validation Results

In [49]:
import numpy as np

In [50]:
for metric in scoring.keys():
    scores = cv_results[f"test_{metric}"]
    print(f"{metric.upper():10s}: {scores.mean():.4f} ± {scores.std():.4f}")

ACCURACY  : 0.7118 ± 0.0057
F1        : 0.7114 ± 0.0058
ROC_AUC   : 0.7909 ± 0.0046
MCC       : 0.4235 ± 0.0113


## Inspect Top k-mers

In [51]:
svm_model.fit(X, y)

feature_names = vectorizer.get_feature_names_out()
weights = svm_model.coef_[0]

top_pos = np.argsort(weights)[-20:]
top_neg = np.argsort(weights)[:20]

print("Top promoter-associated k-mers:")
print(feature_names[top_pos])

print("\nTop non-promoter-associated k-mers:")
print(feature_names[top_neg])

Top promoter-associated k-mers:
['CCCGAC' 'ATACGA' 'GATCGT' 'CCGTTT' 'CGGTTG' 'ATCGGT' 'CCCCGG' 'TAGTCG'
 'CGAGCG' 'TCTACC' 'GGCCGA' 'CCTACC' 'CACGGC' 'CACGGG' 'GCGCAT' 'CGACTA'
 'ATACGT' 'TCCGTA' 'TCCGTG' 'CGACTG']

Top non-promoter-associated k-mers:
['CTCCGT' 'CTACCA' 'TTCCGT' 'CCGACT' 'ATCGTT' 'CTACCT' 'TACGTG' 'CGCATA'
 'GTCATT' 'TCCCCG' 'GATGCC' 'TGTAGC' 'TATACG' 'TCCACG' 'ATCCGT' 'CCCACG'
 'ACCGTT' 'TTTCGG' 'GAGCGA' 'ACCGCG']


## Train SVM on Training Data

In [52]:
svm_final = LinearSVC(
    C=1.0,
    class_weight="balanced",
    random_state=42
)

svm_final.fit(X_train, y_train)

## Test-Set Predictions

In [53]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef
)

In [54]:
y_pred = svm_final.predict(X_test)
y_score = svm_final.decision_function(X_test)

acc = accuracy_score(y_test, y_pred)
f1  = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_score)
mcc = matthews_corrcoef(y_test, y_pred)

acc, f1, auc, mcc

(0.7126781695423856,
 0.7125344008006005,
 np.float64(0.7915155077538769),
 np.float64(0.4253562844813251))

## Save Reports to Drive

### Create Output Directory

In [55]:
OUT_DIR = os.path.join(PROJECT_DIR, "results/svm")
os.makedirs(OUT_DIR, exist_ok=True)

### Save Performance Metrics

In [56]:
svm_results = pd.DataFrame([{
    "model": "SVM_kmer",
    "accuracy": acc,
    "f1": f1,
    "auroc": auc,
    "mcc": mcc,
    "kmer_size": k,
    "feature_dim": X.shape[1],
    "evaluation": "80/20 holdout"
}])

svm_results.to_csv(
    os.path.join(OUT_DIR, "svm_performance.csv"),
    index=False
)

svm_results

Unnamed: 0,model,accuracy,f1,auroc,mcc,kmer_size,feature_dim,evaluation
0,SVM_kmer,0.712678,0.712534,0.791516,0.425356,6,4096,80/20 holdout


### Save Top k-mers

In [57]:
kmer_df = pd.DataFrame({
    "top_promoter_kmers": feature_names[top_pos],
    "top_non_promoter_kmers": feature_names[top_neg]
})

kmer_df.to_csv(
    os.path.join(OUT_DIR, "svm_top_kmers.csv"),
    index=False
)

kmer_df.head()

Unnamed: 0,top_promoter_kmers,top_non_promoter_kmers
0,CCCGAC,CTCCGT
1,ATACGA,CTACCA
2,GATCGT,TTCCGT
3,CCGTTT,CCGACT
4,CGGTTG,ATCGTT


### Save ROC Probabilities

In [58]:
y_score = svm.decision_function(X_test)

roc_df = pd.DataFrame({
    "y_true": y_test,
    "y_score": y_score
})

roc_df.to_csv(
    os.path.join(OUT_DIR, "svm_roc_data.csv"),
    index=False
)

roc_df.head()

Unnamed: 0,y_true,y_score
0,1,0.807878
1,1,0.257551
2,1,0.402727
3,0,-0.474846
4,0,-0.316546
