In [144]:
import pandas as pd

from src.leuven.idsvspath import idvspath_csv
from src.leuven.preprocess import (
    add_id,
    aggregate_lsa_excel,
    aggregate_lsa_per_patients,
    clean_index2int,
    drop_samples_with_na,
    flag_missing_values,
    merge_lsa,
    select_cols,
    strip_column,
    summerize2allel,
)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Information on Leuven Data
The SAB data for #Tx 2790 should be discarded (?): this patient belongs to the remaining cohort that lacks complete high-resolution (HR) HLA data and was excluded from our JASN paper.

We also do not have the files for #Tx 3529 and 3546.

There are two files with "postTx" in their name.

## Join All files with multiple Transplants

## CSV Files

### Batch Files

In [145]:
# File names read from the `lsa_per_multi_tx` directory.
multi = [
    'LSA1 2801-3188.csv', 'LSA2 2974-3188.csv', 'LSA1-2860,72 2963 3003,06,75 3104.csv',
    'LSA1-170131-IWT1.csv', 'LSA1-170214-IWT2.csv', 'LSA1-170221-IWT.csv', 'LSA1-170509-IWT6.csv',
]

# File names read from the `lsa_per_tx` directory.
# NOTE(review): 'LSA2-3703.csv' used to be listed twice; the second entry only
# re-read the same file (its rows were removed again by drop_duplicates), so it
# is dropped here. Confirm that no other file was meant instead.
single = ['LSA2-3703.csv', '3179.csv', 'LSA1 2767.csv', 'LSA1-2817.csv', '3288.csv', '2797 LSA2.csv']

paths = (
    ['~/UMCUtrecht/Leuven/Original/lsa_per_multi_tx/' + file for file in multi]
    + ['~/UMCUtrecht/Leuven/Original/lsa_per_tx/' + file for file in single]
)

# Column names imposed on every file; `header=0` below makes pandas discard the
# header row found in the file and use these names instead.
cols = [
    'sample ID', 'patient Name', 'patient DOB', 'hla Type', 'draw Dt', 'donor Number',
    'donor Center Number', 'bead', 'allele', 'raw Value', 'BCM', 'BCR', 'AD-BCR',
    'assignment', 'Probe 77', 'CON1', 'CON2', 'CON3'
]

# Keep 'sample ID' as a regular column: DataFrame.drop_duplicates ignores the
# index, so indexing by 'sample ID' before de-duplicating would collapse rows
# of *different* samples whenever all their other columns happen to be equal.
dfs = [pd.read_csv(path, names=cols, header=0) for path in paths]
df_batch = (
    pd.concat(dfs, ignore_index=True)
    .drop_duplicates()  # the same sample can appear in more than one file
    .rename(columns={'bead': 'Bead', 'raw Value': 'Raw Value', 'sample ID': 'tx_id'})
    .pipe(clean_index2int, 'tx_id')
    .pipe(select_cols, 'tx_id', 'Bead', 'Raw Value', 'BCM', 'BCR', 'AD-BCR', 'assignment', 'allele')
    .pipe(strip_column, 'assignment')
    .sort_values(by='tx_id', ascending=True)
    .reset_index(drop=True)
)

# NOTE(review): machine-specific absolute path, while the inputs above are
# addressed relative to the home directory ('~'); consider a shared data-dir
# constant.
df_batch.to_csv('/Users/Danial/UMCUtrecht/Leuven/Processed/batch_lsa.csv')

df_batch

Unnamed: 0,tx_id,Bead,Raw Value,BCM,BCR,AD-BCR,assignment,allele
0,2767,298,223.0,-155.0,-1.12,-1.61,Negative,C*17:01
1,2767,273,160.0,-67.0,-0.70,-0.77,Negative,B*54:01
2,2767,272,145.0,-64.0,-0.67,-0.72,Negative,B*53:01
3,2767,271,166.0,-71.0,-0.74,-0.84,Negative,B*52:01
4,2767,270,155.0,-84.0,-0.88,-0.96,Negative,B*51:01
...,...,...,...,...,...,...,...,...
5016,3703,124,454.5,310.5,3.79,4.15,Weak,DPA1*02:01
5017,3703,123,363.5,214.5,2.62,2.75,Negative,DPB1*05:01
5018,3703,123,363.5,214.5,2.62,2.75,Negative,DPA1*03:01
5019,3703,126,1541.0,1376.0,16.78,19.63,Positive,DPB1*14:01


### Single Files:

#### CSV Files:

In [184]:
# Build the single-transplant CSV table from `idvspath_csv` and rename the
# 'Assignment' column to 'assignment', the spelling used by `df_batch`.
df_single_csv = aggregate_lsa_per_patients(idvspath_csv).rename(
    columns={'Assignment': 'assignment'}
)
df_single_csv

Indexes with mostly NaN #:1, index:Int64Index([95], dtype='int64')
Indexes with mostly NaN #:1, index:Int64Index([177], dtype='int64')
Indexes with mostly NaN #:3, index:Int64Index([93, 94, 95], dtype='int64')


Unnamed: 0,Bead,Raw Value,BCM,BCR,AD-BCR,assignment,allele,tx_id
0,127.0,659.0,477.0,12.23,12.04,Positive,A*66:01,2790
1,187.0,656.0,411.0,3.73,4.53,Positive,C*04:03,2790
2,183.0,623.0,373.0,3.39,5.4,Positive,C*02:02,2790
3,128.0,579.0,415.0,10.63,9.86,Positive,A*66:02,2790
4,129.0,421.0,248.0,6.36,6.76,Positive,A*68:01,2790
...,...,...,...,...,...,...,...,...
75,152.0,283.0,-142.0,-0.60,-0.79,Negative,DRB1*01:02,3861
76,118.0,268.0,-233.0,-0.87,-1.64,Negative,"DPB1*04:01, DPA1*04:01",3861
77,138.0,262.0,-77.0,-0.30,-0.41,Negative,"DQA1*04:01, DQB1*03:03",3861
78,179.0,257.0,-109.0,-0.42,-0.56,Negative,"DQB1*03:02, DQA1*03:01",3861


#### Excel Files:

In [185]:
# Build the single-transplant table from the Excel exports.
# `aggregate_lsa_excel` is imported in the notebook's top import cell together
# with the other `src.leuven.preprocess` helpers.
df_single_excel = aggregate_lsa_excel()
df_single_excel

Unnamed: 0,tx_id,Bead,Raw Value,BCM,BCR,AD-BCR,assignment,allele
0,2838,132,16469,16323,76.10,94.53,Positive,"DQB1*02:02, DQA1*03:02"
1,2838,131,16660,16502,76.90,92.58,Positive,"DQA1*02:01, DQB1*02:02"
2,2838,133,15428,15253,71.10,92.23,Positive,"DQB1*02:02, DQA1*05:01"
3,2838,177,15381,15213,70.90,91.87,Positive,"DQB1*02:01, DQA1*05:01"
4,2838,136,16813,16667,77.70,84.64,Positive,"DQB1*03:02, DQA1*02:01"
...,...,...,...,...,...,...,...,...
77,2907,143,83,-92,-2.54,-3.58,Negative,"DQA1*01:01, DQB1*05:01"
78,2907,150,49,-96,-2.67,-4.10,Negative,"DQA1*01:03, DQB1*06:03"
79,2907,146,36,-144,-4.00,-4.54,Negative,"DQA1*02:01, DQB1*04:01"
80,2907,118,42,-182,-4.73,-4.80,Negative,"DPB1*04:01, DPA1*04:01"


#### PDF Files:

In [12]:
# Transplant ids listed for the PDF files section.
# NOTE(review): presumably these are the transplants whose LSA reports are
# PDFs; confirm.
pdf_ids = {
    3279, 3371, 3453, 3466, 3478,
    3502, 3524, 3565, 3608, 3656,
}
# 3453 classI & II

# Join Batch Files & Single Files

In [186]:
# Stack the three sources into one table with a fresh 0..n-1 index and persist it.
# NOTE(review): the notes at the top of the notebook say the SAB data for tx
# 2790 should be discarded, but 2790 is still present in `df_single_csv`;
# confirm it is filtered out downstream.
df_total = pd.concat(
    [df_single_csv, df_single_excel, df_batch],
    ignore_index=True,
)
df_total.to_csv('/Users/Danial/UMCUtrecht/Leuven/Processed/lsa_joined.csv')


In [187]:
# Distinct transplant ids present in the joined table.
lsa_ids = df_total['tx_id'].unique()
# Ad-hoc checks kept for reference:
# print(set(lsa_ids))
# {2801, 2835, 2838, 2896, 2907} & set(lsa_ids)

In [225]:
# All joined rows that belong to transplant 3478.
df_total.loc[df_total['tx_id'] == 3478]

Unnamed: 0,Bead,Raw Value,BCM,BCR,AD-BCR,assignment,allele,tx_id
2487,281.0,2744.0,2538.0,2538.0,2619.2,Positive,B*82:02,3478
2488,264.0,2657.0,2463.0,2463.0,2742.76,Positive,B*45:01,3478
2489,245.0,1600.0,1405.0,1405.0,1478.95,Positive,B*15:12,3478
2490,263.0,1259.0,980.0,979.5,1440.44,Positive,B*44:03,3478
2491,211.0,1173.0,869.0,869.0,1016.37,Positive,A*02:05,3478
...,...,...,...,...,...,...,...,...
2575,250.0,0.0,-400.0,-400.0,-643.09,Negative,B*27:03,3478
2576,252.0,0.0,-181.0,-181.0,-193.38,Negative,B*27:08,3478
2577,277.0,0.0,-265.0,-265.0,-320.82,Negative,B*58:01,3478
2578,288.0,0.0,-312.0,-312.0,-337.3,Negative,C*05:01,3478


In [202]:
# Distinct alleles of transplant 3026 whose assignment is 'Bead Failure'.
df_total.loc[
    (df_total['tx_id'] == 3026) & df_total['assignment'].eq('Bead Failure'),
    'allele',
].unique()

array(['DQA1*06:01', 'DQB1*03:01', 'DQA1*03:01', 'DQB1*03:02',
       'DQA1*03:02', 'DQB1*03:03', 'DQB1*04:02', 'DQA1*01:03',
       'DQB1*06:01', 'DQA1*01:02', 'DQB1*06:02', 'DQB1*06:04',
       'DRB1*04:03', 'DRB1*16:02', 'DRB1*03:02', 'DRB1*11:04',
       'DQB1*02:01', 'DQA1*05:01', 'DPA1*03:01', 'DPB1*05:01',
       'DPA1*02:01', 'DPB1*13:01', 'DPA1*04:01', 'DPB1*14:01',
       'DPB1*17:01', 'DPB1*19:01', 'DPA1*02:02', 'DPB1*28:01',
       'DQA1*02:01', 'DQB1*02:02', 'DPB1*04:02', 'DPA1*01:03',
       'DPB1*01:01', 'DPB1*03:01', 'DPB1*04:01', 'DRB1*01:01',
       'DRB1*01:02', 'DRB1*01:03', 'DRB1*04:01', 'DRB1*04:02',
       'DRB1*04:05', 'DRB1*07:01', 'DRB1*08:01', 'DRB1*09:01',
       'DRB1*10:01', 'DQB1*06:03', 'DRB1*13:03', 'DRB1*15:02',
       'DRB1*16:01', 'DRB3*01:01', 'DRB3*02:02', 'DRB4*01:01',
       'DQB1*04:01', 'DQA1*04:01', 'DQA1*01:01', 'DQB1*05:01',
       'DQA1*01:04', 'DQB1*05:03'], dtype=object)

In [None]:
# NOTE(review): `res` is not defined in the visible cells of this notebook;
# it has to exist in the kernel before this cell can run.
# Alternative view: res.LuminexBeads.values
res.iloc[94]

Epitope_Mismatch    {'30H[DQ]': 'DQB1*06:03', '156WA': 'C*12:03', ...
Tx_id                                                            3861
Donor_HLA           Participant(hlas=[HLA(high_res='DQB1*06:03', l...
Recipient_HLA       Participant(hlas=[HLA(high_res='C*07:01', low_...
LuminexBeads        {'Positive': {'HighResolution': {'DQA1*01:04',...
DESA_Status                                                   No DESA
DSA_Status                                                        DSA
DESA_info                                                          ()
DSA_info                                                  [DQ6, DR52]
Name: 94, dtype: object

In [208]:
# Rows of transplant 3026 whose allele locus (the text before '*') is not
# A, B or C and whose assignment is not 'Bead Failure'.
# The vectorised `.str` accessor replaces a row-wise `apply(lambda ...)`, which
# raised AttributeError on a missing allele; such rows are now kept (a missing
# locus is never in the A/B/C list).
df_total[
    df_total.tx_id.eq(3026)
    & ~df_total.allele.str.split('*').str[0].isin(['A', 'B', 'C'])
    & (df_total.assignment != 'Bead Failure')
]

Unnamed: 0,Bead,Raw Value,BCM,BCR,AD-BCR,assignment,allele,tx_id
8262,186.0,198.0,51.0,0.37,0.39,Negative,DRB1*14:04,3026
8332,175.0,254.0,126.0,0.92,1.35,Negative,DRB5*01:01,3026
8333,122.0,227.0,68.0,0.55,0.57,Negative,DPB1*05:01,3026
8344,122.0,227.0,68.0,0.55,0.57,Negative,DPA1*02:02,3026
8345,128.0,140.0,19.0,0.15,0.16,Negative,DPA1*01:03,3026
8357,128.0,140.0,19.0,0.15,0.16,Negative,DPB1*18:01,3026
8363,109.0,141.0,13.0,0.1,0.11,Negative,DPA1*02:01,3026
8364,109.0,141.0,13.0,0.1,0.11,Negative,DPB1*01:01,3026
8365,110.0,198.0,56.0,0.45,0.48,Negative,DPA1*02:02,3026
8366,110.0,198.0,56.0,0.45,0.48,Negative,DPB1*01:01,3026


### Excel Files

In [380]:
# dfs = [pd.read_excel(path, skiprows=2) for path in excel_paths]
# pd.concat(dfs)

# pd.read_excel(excel_paths[0], skiprows=2) 


# Single Files

In [5]:
# Transplant id sets.
# NOTE(review): presumably the ids for which class I / class II single files
# exist; confirm.
class_I = {
    2867, 2885, 2974, 2990, 3002, 3026, 3042,
    3105, 3118, 3135, 3140, 3152, 3188,
}
class_II = {
    2974, 2990, 3026, 3042,
    3118, 3135, 3140, 3188,
}
# Every id from either class plus seven additional transplant ids.
all_new = class_I | class_II | {2767, 2838, 2801, 2835, 2896, 2907, 3703}

In [122]:
# import os
# from collections import Counter
# files = os.listdir('/Users/Danial/UMCUtrecht/Leuven/lsa_per_tx')
# csv_ids = []
# for file in files:
#     id = file.split('.')[0].split(' ')[0]
#     if id.isdigit():
#         csv_ids.append(int(id))