# HLA class I supertype classification via nearest neighbor method
Classify HLA class I alleles into supertypes based on the structural similarity of their binding grooves.
Clustering uses the nearest neighbor method for improved speed.

Author: Shawn Shen\
Created on: July 2022

In [1]:
from src.SupertypeTools import CGAnchorMat, NearestNeighbor_cluster

## 1. Specify anchor alleles and represented supertypes / sub-supertypes

In [2]:
# Anchor allele -> supertype it represents.
# The distance matrix built from these anchors is later also clustered with
# subtype_anchors, which uses exactly the same set of anchor alleles as keys.
supertype_anchors = {
    "A01_01": "A01-A03-A66",
    "A02_01": "A02",
    "A02_03": "A02",
    "A02_06": "A02",
    "A02_07": "A02",
    "A03_01": "A01-A03-A66",
    "A11_01": "A01-A03-A66",
    "A24_02": "A24",
    "A30_01": "A01-A03-A66",
    "A68_01": "A02",
    "B07_02": "B07-B35",
    "B08_01": "B08-B18-B39",
    "B14_02": "B14",
    "B15_01": "B15-B40",
    "B18_01": "B08-B18-B39",
    "B27_05": "B27",
    "B35_01": "B07-B35",
    "B39_01": "B08-B18-B39",
    "B40_01": "B15-B40",
    "B40_02": "B15-B40",
    "B42_01": "B07-B35",
    "B44_02": "B44",
    "B44_03": "B44",
    "B46_01": "C01-C02",
    "B51_01": "B51-B58",
    "B57_01": "B51-B58",
    "B58_01": "B51-B58",
    "C04_01": "C01-C02",
    "C05_01": "C01-C02",
    "C06_02": "C01-C02",
    "C08_02": "C01-C02",
    "A26_01": "A01-A03-A66",
    "C07_01": "C07",
}

# Anchor allele -> sub-supertype it represents.
# Uses the same anchor alleles (keys) as supertype_anchors, only with
# finer-grained cluster labels as values.
subtype_anchors = {
    "A01_01": "A01",
    "A02_01": "A02",
    "A02_03": "A02",
    "A02_06": "A02",
    "A02_07": "A02",
    "A03_01": "A03",
    "A11_01": "A03",
    "A24_02": "A24",
    "A30_01": "A03",
    "A68_01": "A02",
    "B07_02": "B07",
    "B08_01": "B08",
    "B14_02": "B14",
    "B15_01": "B15",
    "B18_01": "B18",
    "B27_05": "B27",
    "B35_01": "B35",
    "B39_01": "B39",
    "B40_01": "B40",
    "B40_02": "B15",
    "B42_01": "B07",
    "B44_02": "B44",
    "B44_03": "B44",
    "B46_01": "C02",
    "B51_01": "B51",
    "B57_01": "B58",
    "B58_01": "B58",
    "C04_01": "C01",
    "C05_01": "C01",
    "C06_02": "C02",
    "C08_02": "C01",
    "A26_01": "A66",
    "C07_01": "C07",
}

## 2. Nearest neighbor clustering

In [3]:
# Structure distances between target alleles and anchor alleles.
# Inputs: the "HLA1_models/CG_DAT" directory, the anchor alleles taken from
# supertype_anchors, and an allele list file (presumably the target alleles
# to classify -- confirm against CGAnchorMat in src.SupertypeTools).
Mat = CGAnchorMat(
    "HLA1_models/CG_DAT",
    supertype_anchors,
    AlleleListFile="Dataset_split/unpopular.list",
)

In [4]:
# Supertype level clustering result
# Clusters the distance matrix Mat using the supertype_anchors labels. The
# returned table is displayed as the cell output; it lists a Nearest_anchor
# and a Cluster for each target allele (presumably Cluster is the label that
# supertype_anchors assigns to the nearest anchor -- confirm in
# src.SupertypeTools).
NearestNeighbor_cluster(Mat, supertype_anchors)

Unnamed: 0,Nearest_anchor,Cluster
B08_02,B08_01,B08-B18-B39
B27_09,B27_05,B27


In [5]:
# Sub-supertype level clustering result
# Reuses the same distance matrix Mat (built from the supertype_anchors keys)
# but labels clusters with subtype_anchors; both dicts share the same anchor
# alleles as keys. The returned table is displayed as the cell output, with a
# Nearest_anchor and a Cluster for each target allele.
NearestNeighbor_cluster(Mat, subtype_anchors)

Unnamed: 0,Nearest_anchor,Cluster
B08_02,B08_01,B08
B27_09,B27_05,B27
