# Classical Baseline: SVM with k-mer Features

## Load HG38 Dataset

In [1]:
import os
from google.colab import drive
drive.mount('/content/drive')
PROJECT_DIR = "/content/drive/MyDrive/bioproj01"
DATA_DIR = os.path.join(PROJECT_DIR, "data")
print(f"Data directory found: {PROJECT_DIR}\nContents: {os.listdir(PROJECT_DIR)}")

Mounted at /content/drive
Data directory found: /content/drive/MyDrive/bioproj01
Contents: ['data']


In [2]:
import pandas as pd

In [3]:
hg38_df = pd.read_csv(
    f"{DATA_DIR}/hg38/human_promoter_vs_nonpromoter_10k_400bp.csv"
)

hg38_df = hg38_df[["sequence", "label"]]
hg38_df.head()

Unnamed: 0,sequence,label
0,TGCATATTATTTTATATGCATCTATTTTGAATCTTCATAAATGTAA...,0
1,GGCCCAGCTCTGACGCCAGGCTGTCTTGCCTCTGCTCACCTGCAGC...,1
2,TCATGCCTGGCCAGCAAAATTGTTTTTTAAAAGTTTATGCTACTAA...,1
3,TGCCTGGTTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCAT...,0
4,AATAATTGAAATAAGCTTAATAAATGGGCTCAAAAGAATGAAAGAG...,0


## k-mer Feature Extraction

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
def kmer_tokenizer(sequence, k):
    return [sequence[i:i+k] for i in range(len(sequence) - k + 1)]

k = 6

vectorizer = CountVectorizer(
    tokenizer=lambda x: kmer_tokenizer(x, k),
    token_pattern=None,
    lowercase=False
)

X = vectorizer.fit_transform(hg38_df["sequence"])
y = hg38_df["label"].values

print("Feature matrix shape:", X.shape)

Feature matrix shape: (19994, 4096)


## Train SVM (Linear Kernel)

In [6]:
from sklearn.svm import LinearSVC

In [7]:
svm = LinearSVC(
    C=1.0,
    class_weight="balanced",
    max_iter=10000
)

## Cross-Validation Evaluation

In [8]:
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold, cross_validate

In [9]:
scoring = {
    "accuracy": "accuracy",
    "f1": "f1",
    "roc_auc": "roc_auc",
    "mcc": make_scorer(matthews_corrcoef)
}

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

cv_results = cross_validate(
    svm,
    X,
    y,
    cv=cv,
    scoring=scoring,
    n_jobs=-1
)

## Summarize Results

In [10]:
import numpy as np

In [11]:
for metric in scoring.keys():
    scores = cv_results[f"test_{metric}"]
    print(f"{metric.upper():10s}: {scores.mean():.4f} ± {scores.std():.4f}")

ACCURACY  : 0.7160 ± 0.0043
F1        : 0.7127 ± 0.0040
ROC_AUC   : 0.7944 ± 0.0035
MCC       : 0.4321 ± 0.0087


## Inspect Top k-mers

In [12]:
svm.fit(X, y)

feature_names = vectorizer.get_feature_names_out()
weights = svm.coef_[0]

top_pos = np.argsort(weights)[-20:]
top_neg = np.argsort(weights)[:20]

print("Top promoter-associated k-mers:")
print(feature_names[top_pos])

print("\nTop non-promoter-associated k-mers:")
print(feature_names[top_neg])

Top promoter-associated k-mers:
['GCCAAC' 'TCTATG' 'TTAGGG' 'CGACAA' 'CGCTGC' 'TGATGC' 'AGTTAC' 'TGACAG'
 'CCGCGT' 'ACTCCG' 'CAGATG' 'CGCAAA' 'CGAGCG' 'TCTCCG' 'GTACGT' 'ACGATC'
 'CTTACT' 'CGCGGG' 'TTAGGA' 'CGACAG']

Top non-promoter-associated k-mers:
['CTCCGC' 'CTCCGT' 'GAGCGA' 'GCCGCG' 'TCGCGG' 'ATATTG' 'TTGATA' 'CTCCGA'
 'ATCCAG' 'GATGCT' 'TATCGC' 'TTTAGG' 'TTTCGT' 'TTACTA' 'ACAATC' 'CCGACA'
 'TATCCG' 'ATTCCG' 'GTAACT' 'CGATCC']
