In [29]:
from pathlib import Path

from catboost import CatBoostClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
SEED = 1234
TEST_SIZE = 0.2
PROJECT_DIR = Path("./").resolve().parent

In [3]:
df = pd.read_csv(PROJECT_DIR / 'data/interim/smd_features/train.csv', index_col=0)

In [5]:
model = CatBoostClassifier(
    auto_class_weights="SqrtBalanced",
    iterations=1000,
    eval_metric="F1",
)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("Active", axis=1),
    df["Active"].astype(int),
    test_size=TEST_SIZE,
    random_state=SEED,
)

In [None]:
model.fit(
    X_train,
    y_train,
    use_best_model=True,
    eval_set=(X_test, y_test),
)

In [45]:
def get_impotante_features(df: pd.DataFrame):
    X = df.drop("Active", axis=1).__array__()
    y = df["Active"].astype(int).__array__()

    kf = KFold(n_splits=5,random_state=SEED, shuffle=True)
    print(kf)
    
    feature_importance = list()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model = CatBoostClassifier(
            auto_class_weights="SqrtBalanced",
            iterations=500,
            eval_metric="F1",
        )
        
        model = model.fit(
            X_train,
            y_train,
            use_best_model=True,
            eval_set=(X_test, y_test),
        )

        feature_importance.append(model.get_feature_importance(prettified=True))
    
    return feature_importance

In [46]:
feature_importance_list = get_impotante_features(df)

KFold(n_splits=5, random_state=1234, shuffle=True)


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.061924
0:	learn: 0.0455581	test: 0.0000000	best: 0.0000000 (0)	total: 123ms	remaining: 1m 1s
1:	learn: 0.0344432	test: 0.0000000	best: 0.0000000 (0)	total: 233ms	remaining: 58.1s
2:	learn: 0.0568182	test: 0.0555556	best: 0.0555556 (2)	total: 348ms	remaining: 57.6s
3:	learn: 0.0344828	test: 0.0000000	best: 0.0555556 (2)	total: 426ms	remaining: 52.8s
4:	learn: 0.0231214	test: 0.0000000	best: 0.0555556 (2)	total: 544ms	remaining: 53.8s
5:	learn: 0.0231214	test: 0.0000000	best: 0.0555556 (2)	total: 642ms	remaining: 52.9s
6:	learn: 0.0116279	test: 0.0000000	best: 0.0555556 (2)	total: 730ms	remaining: 51.4s
7:	learn: 0.0000000	test: 0.0000000	best: 0.0555556 (2)	total: 828ms	remaining: 51s
8:	learn: 0.0000000	test: 0.0000000	best: 0.0555556 (2)	total: 946ms	remaining: 51.6s
9:	learn: 0.0000000	test: 0.0000000	best: 0.0555556 (2)	total: 1.03s	remaining: 50.5s
10:	learn: 0.0000000	test: 0.0000000	best: 0.0555556 (2)	total: 1.14s	remaining: 50.7s
11:	learn: 0.0000000	test

In [79]:
important_features = set()
for feature_importance in feature_importance_list[4:5]:
    for feature_id, importance in  feature_importance.values:
        if importance > 0:
            important_features.add(str(int(feature_id)+1))
        # print(feature_id, importance)
    # break

In [80]:
len(important_features)

6

In [81]:
important_features

{'114', '165', '299', '630', '746', '82'}

In [86]:
df[list(important_features)]

Unnamed: 0_level_0,746,165,630,299,82,114
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.850,0.000,4.680,0.068,2.233,2.242
1,2.994,0.000,7.438,0.056,2.679,2.686
2,3.906,0.000,9.647,0.086,2.901,2.892
3,3.789,0.000,13.347,0.087,2.972,2.974
4,3.534,0.000,8.792,0.064,2.835,2.840
...,...,...,...,...,...,...
5552,4.265,0.000,12.082,0.100,2.964,2.949
5553,3.306,0.333,6.438,0.142,2.186,2.156
5554,2.411,0.000,3.164,0.089,1.501,1.512
5555,4.408,0.000,12.250,0.077,2.962,2.958


In [96]:
model = CatBoostClassifier(
    auto_class_weights="SqrtBalanced",
    iterations=3000,
    eval_metric="F1",
)
X_train, X_test, y_train, y_test = train_test_split(
    df[list(important_features)],
    df["Active"].astype(int),
    test_size=TEST_SIZE,
    random_state=SEED,
)

In [97]:
model.fit(
    X_train,
    y_train,
    use_best_model=True,
    eval_set=(X_test, y_test),
)

Learning rate set to 0.028403
0:	learn: 0.1478345	test: 0.1030917	best: 0.1030917 (0)	total: 2.56ms	remaining: 7.68s
1:	learn: 0.1577279	test: 0.1030917	best: 0.1030917 (0)	total: 7.75ms	remaining: 11.6s
2:	learn: 0.0885934	test: 0.0000000	best: 0.1030917 (0)	total: 12.7ms	remaining: 12.7s
3:	learn: 0.0456100	test: 0.0000000	best: 0.1030917 (0)	total: 16ms	remaining: 12s
4:	learn: 0.0230947	test: 0.0000000	best: 0.1030917 (0)	total: 20.3ms	remaining: 12.2s
5:	learn: 0.0231214	test: 0.0000000	best: 0.1030917 (0)	total: 22.8ms	remaining: 11.4s
6:	learn: 0.0231214	test: 0.0000000	best: 0.1030917 (0)	total: 25.8ms	remaining: 11s
7:	learn: 0.0116279	test: 0.0000000	best: 0.1030917 (0)	total: 29.2ms	remaining: 10.9s
8:	learn: 0.0231214	test: 0.0000000	best: 0.1030917 (0)	total: 32.6ms	remaining: 10.8s
9:	learn: 0.0231214	test: 0.0000000	best: 0.1030917 (0)	total: 36.6ms	remaining: 11s
10:	learn: 0.0231214	test: 0.0000000	best: 0.1030917 (0)	total: 43.3ms	remaining: 11.8s
11:	learn: 0.0231214

<catboost.core.CatBoostClassifier at 0x7fa429ee9580>

In [5]:
from biopandas.pdb import PandasPdb

In [9]:
ppdb = PandasPdb()
ppdb = ppdb.read_pdb(str(PROJECT_DIR / 'data/raw/target.pdb'))

In [10]:
print('PDB Code: %s' % ppdb.code)
print('PDB Header Line: %s' % ppdb.header)
print('\nRaw PDB file contents:\n\n%s\n...' % ppdb.pdb_text[:1000])

PDB Code: 7kr0
PDB Header Line:     VIRAL PROTEIN                           18-NOV-20   7KR0

Raw PDB file contents:

HEADER    VIRAL PROTEIN                           18-NOV-20   7KR0              
TITLE     CRYSTAL STRUCTURE OF SARS-COV-2 NSP3 MACRODOMAIN (C2 CRYSTAL FORM, 100
TITLE    2 K)                                                                   
CAVEAT     7KR0    RESIDUES LEU A 169 AND GLU A 170 THAT ARE NEXT TO EACH OTHER 
CAVEAT   2 7KR0    IN THE SAMPLE SEQUENCE ARE NOT PROPERLY LINKED IN THE B      
CAVEAT   3 7KR0    CONFORMER.                                                   
COMPND    MOL_ID: 1;                                                            
COMPND   2 MOLECULE: NON-STRUCTURAL PROTEIN 3;                                  
COMPND   3 CHAIN: A;                                                            
COMPND   4 FRAGMENT: MACRODOMAIN (UNP RESIDUES 1024-1192);                      
COMPND   5 SYNONYM: NSP3,PL2-PRO,PAPAIN-LIKE PROTEASE,PAPAIN-LIKE PROTEI

In [13]:
ppdb.df.keys()

dict_keys(['ATOM', 'HETATM', 'ANISOU', 'OTHERS'])

In [14]:
ppdb.df['ATOM']#.head(3)

Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,x_coord,y_coord,z_coord,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx
0,ATOM,1,,N,A,GLU,,A,2,,...,1.413,3.054,16.418,0.58,24.52,,,N,,412
1,ATOM,2,,N,B,GLU,,A,2,,...,8.380,3.465,17.791,0.42,23.12,,,N,,414
2,ATOM,3,,CA,A,GLU,,A,2,,...,2.443,2.107,16.011,0.58,24.04,,,C,,416
3,ATOM,4,,CA,B,GLU,,A,2,,...,8.918,2.812,16.605,0.42,22.97,,,C,,418
4,ATOM,5,,C,A,GLU,,A,2,,...,3.667,2.847,15.500,0.58,22.02,,,C,,420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3979,ATOM,3980,,HB3,B,GLU,,A,170,,...,11.126,16.033,-17.321,0.16,26.02,,,H,,6385
3980,ATOM,3981,,HG2,A,GLU,,A,170,,...,9.832,17.381,-16.154,0.65,22.05,,,H,,6386
3981,ATOM,3982,,HG2,B,GLU,,A,170,,...,10.093,17.690,-16.274,0.16,27.09,,,H,,6387
3982,ATOM,3983,,HG3,A,GLU,,A,170,,...,9.923,17.010,-14.612,0.65,22.05,,,H,,6388


In [16]:
ppdb.df['HETATM']

Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,x_coord,y_coord,z_coord,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx
0,HETATM,3986,,O,,HOH,,A,201,,...,36.693,21.765,14.383,0.58,11.17,,,O,,6391
1,HETATM,3987,,O,,HOH,,A,202,,...,23.676,22.427,25.958,1.00,34.56,,,O,,6393
2,HETATM,3988,,O,,HOH,,A,203,,...,27.139,20.610,20.666,0.49,8.77,,,O,,6395
3,HETATM,3989,,O,,HOH,,A,204,,...,14.780,16.222,-8.434,1.00,13.21,,,O,,6397
4,HETATM,3990,,O,,HOH,,A,205,,...,34.198,26.105,3.501,1.00,15.61,,,O,,6399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,HETATM,4297,,O,,HOH,,A,508,,...,38.541,19.208,8.546,1.00,31.15,,,O,,7013
312,HETATM,4298,,O,,HOH,,A,509,,...,1.920,10.997,-12.891,1.00,49.70,,,O,,7015
313,HETATM,4299,,O,,HOH,,A,510,,...,37.731,18.662,4.166,1.00,31.47,,,O,,7017
314,HETATM,4300,,O,,HOH,,A,511,,...,19.152,31.584,21.760,1.00,30.92,,,O,,7019
