In [358]:
import pandas as pd
# from src.preprocess import merge_lsa, summerize2allel, add_id
from src.leuven.idsvspath import idvspath_csv
from src.leuven.preprocess import (
    aggregate_lsa_per_patients,
    erratic_missing_values,
    clean_index2int, 
    select_cols, 
    strip_column, 
    merge_lsa, 
    summerize2allel, 
    add_id
)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Information on Leuven Data
You should discard the SAB data for 2790? as this patient is from the remaining cohort with missing complete HR HLA data, which was excluded from our JASN paper.

We also do not have the files for #Tx 3529 and 3546.

There are two files wit postTx name

## Join All files with multiple Transplants

## CSV Files

### Batch Files

In [269]:


multi = [
    'LSA1 2801-3188.csv', 'LSA2 2974-3188.csv', 'LSA1-2860,72 2963 3003,06,75 3104.csv', 
    'LSA1-170131-IWT1.csv', 'LSA1-170214-IWT2.csv', 'LSA1-170221-IWT.csv', 'LSA1-170509-IWT6.csv',
]

single = ['LSA2-3703.csv', '3179.csv', 'LSA1 2767.csv', 'LSA1-2817.csv', 'LSA2-3703.csv', '3288.csv', '2797 LSA2.csv']
paths = ['~/UMCUtrecht/Leuven/Original/lsa_per_multi_tx/' + file for file in multi] + \
        ['~/UMCUtrecht/Leuven/Original/lsa_per_tx/' + file for file in single] 
        

cols = [
    'sample ID', 'patient Name', 'patient DOB', 'hla Type', 'draw Dt', 'donor Number', 
    'donor Center Number', 'bead', 'allele', 'raw Value', 'BCM', 'BCR', 'AD-BCR', 
    'assignment', 'Probe 77', 'CON1', 'CON2', 'CON3'
]

dfs = [pd.read_csv(path, names=cols, header=0).set_index('sample ID') for path in paths]
df_batch = (
    pd.concat(dfs)
    .reset_index()
    .rename(columns={'bead': 'Bead', 'raw Value': 'Raw Value', 'sample ID': 'tx_id'})
    .pipe(clean_index2int, 'tx_id')
    .pipe(select_cols, 'tx_id', 'Bead', 'Raw Value', 'BCM', 'BCR', 'AD-BCR', 'assignment', 'allele')
    .pipe(strip_column, 'assignment')
    .sort_values(by='tx_id', ascending=True).reset_index(drop=True)
    # .drop_duplicates()
)

df_batch.to_csv('/Users/Danial/UMCUtrecht/Leuven/Processed/batch_lsa.csv')

df_batch

Unnamed: 0,tx_id,Bead,Raw Value,BCM,BCR,AD-BCR,assignment,allele
0,2767,262,1562.0,1344.0,14.00,15.12,Positive,B*44:02
1,2767,289,182.5,-166.5,-1.21,-1.47,Negative,C*06:02
2,2767,290,260.0,-129.0,-0.93,-1.49,Negative,C*07:01
3,2767,291,207.5,-114.5,-0.83,-1.21,Negative,C*07:02
4,2767,292,138.0,-134.0,-0.97,-1.22,Negative,C*08:01
...,...,...,...,...,...,...,...,...
7239,3703,136,130.0,-16.0,-0.16,-0.18,Negative,DQA1*02:01
7240,3703,135,101.0,-34.0,-0.35,-0.40,Negative,DQB1*03:01
7241,3703,135,101.0,-34.0,-0.35,-0.40,Negative,DQA1*05:01
7242,3703,140,153.0,-5.0,-0.05,-0.06,Negative,DQB1*04:01


### Single Files:

In [348]:

# df_single_csv[df_single_csv.tx_id == 3463]
d = (
    merge_lsa(*idvspath_csv[3456])
    .pipe(summerize2allel)
    .dropna(how='all', axis=0)
    .pipe(erratic_missing_values, 'Assignment') 
    .pipe(add_id, 3463)
    # .pipe(strip_column, 'Assignment')
)

# df.Assignment.isna().sum()

    Bead  Raw Value  BCM  BCR         AD-BCR Assignment allele
95   NaN        NaN  NaN  NaN  Patient Name:        NaN    NaN


In [375]:
df_single_csv = (
    aggregate_lsa_per_patients(idvspath_csv)
    .rename(columns={'Assignment': 'assignment'})
)

    Bead  Raw Value  BCM  BCR         AD-BCR Assignment allele
95   NaN        NaN  NaN  NaN  Patient Name:        NaN    NaN
     Bead  Raw Value  BCM  BCR  AD-BCR Assignment allele
177   NaN        NaN  NaN  NaN     NaN        NaN    NaN
    Bead  Raw Value  BCM  BCR  AD-BCR Assignment allele
93   NaN        NaN  NaN  NaN     NaN        NaN    NaN
94   NaN        NaN  NaN  NaN     NaN        NaN    NaN
95   NaN        NaN  NaN  NaN     NaN        NaN    NaN


# Join Batch Files & Single Files

In [428]:
df_total = pd.concat([df_single_csv, df_batch]).reset_index(drop=True)
df_total.to_csv('/Users/Danial/UMCUtrecht/Leuven/Processed/lsa_joined.csv')


In [378]:
lsa_ids = df_total.tx_id.unique()
len(lsa_ids)

85

In [427]:
df_total

Unnamed: 0,Bead,Raw Value,BCM,BCR,AD-BCR,assignment,allele,tx_id
0,127.0,659.0,477.0,12.23,12.04,Positive,A*66:01,2790
1,187.0,656.0,411.0,3.73,4.53,Positive,C*04:03,2790
2,183.0,623.0,373.0,3.39,5.4,Positive,C*02:02,2790
3,128.0,579.0,415.0,10.63,9.86,Positive,A*66:02,2790
4,129.0,421.0,248.0,6.36,6.76,Positive,A*68:01,2790
...,...,...,...,...,...,...,...,...
12728,136.0,130.0,-16.0,-0.16,-0.18,Negative,DQA1*02:01,3703
12729,135.0,101.0,-34.0,-0.35,-0.4,Negative,DQB1*03:01,3703
12730,135.0,101.0,-34.0,-0.35,-0.4,Negative,DQA1*05:01,3703
12731,140.0,153.0,-5.0,-0.05,-0.06,Negative,DQB1*04:01,3703


### Excel Files

In [379]:
excel_files = ['LSA2 2838.xlsx', 'LSA2-2801.xlsx', 'LSA2-2835.xlsx', 'LSA2-2896.xlsx', 'LSA2-2907.xlsx']
excel_paths = ['~/UMCUtrecht/Leuven/Original/lsa_per_tx/' + file for file in excel_files] 

In [380]:
# dfs = [pd.read_excel(path, skiprows=2) for path in excel_paths]
# pd.concat(dfs)

# pd.read_excel(excel_paths[0], skiprows=2) 


# Single Files

In [5]:
class_I = {2867, 2885, 2974, 2990, 3002, 3026, 3042, 3105, 3118, 3135, 3140, 3152, 3188}
class_II = {2974, 2990, 3026, 3042, 3118, 3135, 3140, 3188}
all_new = class_I.union(class_II).union({2767, 2838, 2801, 2835, 2896, 2907, 3703})

In [122]:
# import os
# from collections import Counter
# files = os.listdir('/Users/Danial/UMCUtrecht/Leuven/lsa_per_tx')
# csv_ids = []
# for file in files:
#     id = file.split('.')[0].split(' ')[0]
#     if id.isdigit():
#         csv_ids.append(int(id))

In [414]:
@dataclass
class HLA:
    """ Class for all High Resolution HLA's """
    string: str = field(repr=True)
    gene: str = field(init=False, repr=False)
    allele: str = field(init=False, repr=False)
    protein: str = field(init=False, repr=False)
    _class: str = field(init=False, repr=False)
    locus: str = field(init=False, repr=False)
    # low_res: str = field(init=False, repr=True)

    def __post_init__(self):
        self.gene, specificity = self.string.split('*')
        _spec = specificity.replace('G','').split(':')
        # filter higher resolutions with 2 or more ':'
        self.allele, self.protein = _spec[0], _spec[1]
        self.string = self.gene + '*' + self.allele + ':' + self.protein
        self._class = 'I' if self.gene in ['A', 'B', 'C'] else 'II'
        self.locus = {'Cw': 'C', 'Bw': 'B'}.get(self.gene[0:2], self.gene[0:2])
        self.locus = self.gene[0:2]
        # self.low_res = self.gene + str(int(self.allel))

In [415]:
from typing import Set
@dataclass
class Luminex():
    """ Class contatining Luminex data """
    _specificity: InitVar[set] = field(repr=False)
    assignment: str = field(repr=True)
    specificity: Set[HLA] = field(init=False, default_factory=list, repr=True)

    def __post_init__(self, _specificity):
        if isinstance(_specificity, str):
            self.specificity  = [HLA(hla) for hla in _specificity.split(',')]
        if isinstance(_specificity, set):
            self.specificity  = [HLA(hla) for hla in _specificity]

In [416]:
Luminex

__main__.Luminex

In [417]:
for id in df_total.tx_id.unique():
   df_id = df_total[df_total.tx_id == id]
   df_id.iterrows
   Luminex(df_id.allele, df_id.assignment)

In [426]:
df_id.allele.values

array(['DQB1*03:03', 'DQA1*03:02', 'DQB1*03:02', 'DQB1*02:01',
       'DQB1*03:01', 'DQA1*06:01', 'DQA1*05:01', 'DQA1*03:01',
       'DQA1*03:01', 'DQB1*04:02', 'DRB1*16:02', 'DQB1*06:01',
       'DQA1*01:02', 'DQB1*06:02', 'DQA1*01:02', 'DQB1*06:04',
       'DRB1*04:03', 'DRB1*14:04', 'DRB5*02:02', 'DRB1*03:02',
       'DRB1*11:04', 'DQA1*01:03', 'DRB5*01:01', 'DRB1*04:01',
       'DRB3*03:01', 'DRB1*01:03', 'DRB1*01:02', 'DRB1*01:01',
       'DQB1*06:03', 'DRB1*04:02', 'DRB1*04:05', 'DRB1*07:01',
       'DRB1*08:01', 'DRB1*09:01', 'DRB1*10:01', 'DRB1*11:01',
       'DRB1*12:01', 'DRB1*13:01', 'DRB1*14:01', 'DRB1*15:01',
       'DRB1*15:02', 'DRB1*16:01', 'DRB1*03:01', 'DRB1*03:03',
       'DRB3*01:01', 'DRB3*02:02', 'DRB4*01:01', 'DRB1*13:03',
       'DQA1*05:01', 'DRB1*11:04', 'DPA1*02:02', 'DPB1*01:01',
       'DPA1*03:01', 'DPB1*01:01', 'DPA1*01:03', 'DPB1*02:01',
       'DPA1*01:03', 'DPB1*03:01', 'DPA1*01:03', 'DPB1*04:01',
       'DPA1*02:01', 'DPB1*04:01', 'DPA1*02:02', 'DPB1*

In [419]:
lsas = []
for row in df_id.iterrows():
    *_, assignment, allele, _= row[1]
    lsas.append(Luminex(_specificity=allele, assignment=assignment))

In [425]:
# lsas[0].specificity[0]
lsas

[Luminex(assignment='Negative', specificity=[HLA(string='DQB1*03:03')]),
 Luminex(assignment='Negative', specificity=[HLA(string='DQA1*03:02')]),
 Luminex(assignment='Positive', specificity=[HLA(string='DQB1*03:02')]),
 Luminex(assignment='Positive', specificity=[HLA(string='DQB1*02:01')]),
 Luminex(assignment='Negative', specificity=[HLA(string='DQB1*03:01')]),
 Luminex(assignment='Negative', specificity=[HLA(string='DQA1*06:01')]),
 Luminex(assignment='Positive', specificity=[HLA(string='DQA1*05:01')]),
 Luminex(assignment='Negative', specificity=[HLA(string='DQA1*03:01')]),
 Luminex(assignment='Positive', specificity=[HLA(string='DQA1*03:01')]),
 Luminex(assignment='Negative', specificity=[HLA(string='DQB1*04:02')]),
 Luminex(assignment='Negative', specificity=[HLA(string='DRB1*16:02')]),
 Luminex(assignment='Negative', specificity=[HLA(string='DQB1*06:01')]),
 Luminex(assignment='Negative', specificity=[HLA(string='DQA1*01:02')]),
 Luminex(assignment='Negative', specificity=[HLA(st

In [421]:
print(lsas[1].specificity[0].string)
print(lsas[1].specificity[0].locus)
print(lsas[1].specificity[0].gene)
print(lsas[1].specificity[0].allele)
print(lsas[1].specificity[0].protein)
print(lsas[1].specificity[0]._class)

DQA1*03:02
DQ
DQA1
03
02
II
