In [1]:
!git clone https://github.com/chaitjo/geometric-rna-design.git
%cd geometric-rna-design


Cloning into 'geometric-rna-design'...
remote: Enumerating objects: 482, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 482 (delta 26), reused 21 (delta 21), pack-reused 444 (from 1)[K
Receiving objects: 100% (482/482), 312.55 MiB | 25.77 MiB/s, done.
Resolving deltas: 100% (220/220), done.
Updating files: 100% (123/123), done.
/content/geometric-rna-design


In [4]:
%cd /content/geometric-rna-design
!ls


/content/geometric-rna-design
checkpoints  data	LICENSE  notebooks  src    tutorial
configs      gRNAde.py	main.py  README.md  tools


In [5]:
# 基本科學運算 & 生物資訊套件
!pip install numpy scipy biopython pandas

# 安裝 PyTorch Geometric 家族（注意是 torch_geometric）
import torch, sys, subprocess

print("Torch version:", torch.__version__)

# 先試最簡單版本（有時候現在 Colab 直接支援 CPU 版）
subprocess.check_call([sys.executable, "-m", "pip", "install", "torch_geometric"])


Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86
Torch version: 2.9.0+cu126


0

In [7]:
# 看看有什麼 notebook 或教學腳本
!ls notebooks
!ls tutorial


data_stats.ipynb  design.ipynb	split_das.ipynb  split_structsim_v2.ipynb
demo_data  fig	outputs  README.md  tutorial.ipynb


In [10]:
import pandas as pd

# 讀 repo 裡現成的 CSV 檔（選一個就好）
df = pd.read_csv('/content/geometric-rna-design/data/processed_df.csv')

# 看前幾列長什麼樣子
print(df.head())
print(df.columns)


                                             id_list  \
0  ['6WD5_1_2', '6WD1_1_2', '6WD9_1_2', '6OGI_1_2...   
1                                   ['3B58_1_B-C-A']   
2  ['7M57_1_n-Y', '7M57_1_qq-bb', '4OQ9_1_S-h', '...   
3                                     ['6DTI_1_X-W']   
4                                       ['3KTW_1_C']   

                                           rfam_list  \
0  ['5S_rRNA', '5S_rRNA', '5S_rRNA', '5S_rRNA', '...   
1                                        ['unknown']   
2  ['unknown', 'unknown', 'unknown', 'unknown', '...   
3                              ['SSU_rRNA_bacteria']   
4                                        ['unknown']   

                                       eq_class_list  \
0  ['10157', '10157', '10157', '10157', '10157', ...   
1                                          ['02086']   
2  ['20373', '20373', '24132', '20373', '20373', ...   
3                                          ['63494']   
4                                          ['3

In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# 1. 讀資料
df = pd.read_csv('/content/geometric-rna-design/data/processed_df.csv')

# 2. 基本清理：關鍵欄位不能是 NaN
base_cols = [
    'sequence',
    'length',
    'mean_rmsd',
    'median_rmsd',
    'num_structures',
    'cluster_seqid0.8',
    'cluster_structsim0.45',
    'type_list'
]
df = df.dropna(subset=base_cols).copy()

print("原始資料筆數：", len(df))

# 3. 把 type_list 清成比較簡單的類別（label cleaning）
#    例如：['Protein-RNA Complex', 'Protein-RNA Complex'] -> 'Protein-RNA Complex'
def normalize_type(t):
    s = str(t)
    if 'Protein-RNA Complex' in s:
        return 'Protein-RNA Complex'
    elif 'Solo RNA' in s:
        return 'Solo RNA'
    elif 'unknown' in s:
        return 'unknown'
    else:
        # 其他類別就原樣保留
        return s

df['type_simple'] = df['type_list'].apply(normalize_type)

print("前幾個 type_simple 類別：")
print(df['type_simple'].value_counts().head())

# 4. 從 RNA 序列多做幾個簡單特徵（長度 / A U G C 比例 / GC 內容）
def seq_features(seq):
    seq = str(seq).upper()
    L = len(seq)
    counts = {b: seq.count(b) for b in "AUGC"}
    if L == 0:
        L = 1  # 避免除以 0
    return pd.Series({
        'len_seq': L,
        'frac_A': counts['A'] / L,
        'frac_U': counts['U'] / L,
        'frac_G': counts['G'] / L,
        'frac_C': counts['C'] / L,
        'GC_content': (counts['G'] + counts['C']) / L,
        'AU_content': (counts['A'] + counts['U']) / L,
    })

seq_feat = df['sequence'].apply(seq_features)
df_feat = pd.concat([df, seq_feat], axis=1)

# 5. 只保留「樣本數夠多」的類別，例如每類至少 30 筆
min_count = 30
vc = df_feat['type_simple'].value_counts()
keep_classes = vc[vc >= min_count].index
df_clf = df_feat[df_feat['type_simple'].isin(keep_classes)].copy()

print("篩完後筆數：", len(df_clf))
print("保留的類別數：", len(keep_classes))
print(df_clf['type_simple'].value_counts())

# 6. 選特徵欄位
feature_cols = [
    'length',
    'mean_rmsd',
    'median_rmsd',
    'num_structures',
    'cluster_seqid0.8',
    'cluster_structsim0.45',
    'len_seq',
    'frac_A', 'frac_U', 'frac_G', 'frac_C',
    'GC_content', 'AU_content'
]

X = df_clf[feature_cols]

# label 編碼
le = LabelEncoder()
y = le.fit_transform(df_clf['type_simple'])

print("X 形狀：", X.shape)
print("y 類別數：", len(le.classes_))

# 7. 切訓練 / 測試集（這裡類別已經都 ≥ 30，可以 stratify）
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 8. 建一個稍微調過參數的 Random


原始資料筆數： 4223
前幾個 type_simple 類別：
type_simple
Protein-RNA Complex                     3274
Solo RNA                                 737
unknown                                  182
['DNA-RNA Hybrid']                        18
['DNA-RNA Hybrid', 'DNA-RNA Hybrid']       6
Name: count, dtype: int64
篩完後筆數： 4193
保留的類別數： 3
type_simple
Protein-RNA Complex    3274
Solo RNA                737
unknown                 182
Name: count, dtype: int64
X 形狀： (4193, 13)
y 類別數： 3


In [16]:
import pandas as pd

df = pd.read_csv('/content/geometric-rna-design/data/processed_df.csv')
print(df.head())
print(df.columns)


                                             id_list  \
0  ['6WD5_1_2', '6WD1_1_2', '6WD9_1_2', '6OGI_1_2...   
1                                   ['3B58_1_B-C-A']   
2  ['7M57_1_n-Y', '7M57_1_qq-bb', '4OQ9_1_S-h', '...   
3                                     ['6DTI_1_X-W']   
4                                       ['3KTW_1_C']   

                                           rfam_list  \
0  ['5S_rRNA', '5S_rRNA', '5S_rRNA', '5S_rRNA', '...   
1                                        ['unknown']   
2  ['unknown', 'unknown', 'unknown', 'unknown', '...   
3                              ['SSU_rRNA_bacteria']   
4                                        ['unknown']   

                                       eq_class_list  \
0  ['10157', '10157', '10157', '10157', '10157', ...   
1                                          ['02086']   
2  ['20373', '20373', '24132', '20373', '20373', ...   
3                                          ['63494']   
4                                          ['3

In [17]:
def extra_seq_features(seq):
    seq = str(seq).upper()
    # 最長連續相同字元
    max_run = 1
    cur_run = 1
    for i in range(1, len(seq)):
        if seq[i] == seq[i-1]:
            cur_run += 1
            max_run = max(max_run, cur_run)
        else:
            cur_run = 1
    return pd.Series({
        'max_run_len': max_run,
    })

extra = df['sequence'].apply(extra_seq_features)
df_feat = pd.concat([df_feat, extra], axis=1)

feature_cols += ['max_run_len']


In [18]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [200, 400, 600],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}

rf = RandomForestClassifier(
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=42
)

search = RandomizedSearchCV(
    rf,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

search.fit(X_train, y_train)
best_clf = search.best_estimator_
y_pred = best_clf.predict(X_test)
print("Tuned Test Accuracy:", accuracy_score(y_test, y_pred))


Tuned Test Accuracy: 0.8367103694874851
