In [1]:
import pandas as pd
import feather 

# Load data

In [2]:
other_imid = pd.read_spss("../data/interim/OtherIMID.sav") 
all_first = pd.read_spss("../data/interim/Alles 1e sample.sav") # SLE patients (also has non- and other-IMID data)
non_imid = pd.read_spss("../data/interim/Non-Imid control set.sav")
tmo = pd.read_csv("../data/interim/TMO.csv") # blood bank controls (also has data from SLE patients)

# load translation table of columns in TMO.csv vs. the .sav files
df_cols = pd.read_csv("../references/chip_colnames.csv", sep=";")

In [4]:
other_imid.samplenr = other_imid.samplenr + 'A' # these are misformatted; add an "A" to mimic others (and distinguish from plain numeric)
other_imid = other_imid.set_index('samplenr') 
all_first = all_first.set_index('samplenr')
non_imid = non_imid.set_index('samplenr')

# Process

The `all` and `tmo` datasets contain data from more than one group

In [5]:
sle = all_first[all_first.SLE == 1] # keep only SLE patients
blood_bank = tmo[tmo.Class == "nonSLE"] # keep only blood bank controls

The chip columns are called differently in the `blood_bank` dataset than in the others.

In [6]:
df_cols

Unnamed: 0,TF,TB_all,TB_selection
0,Actinin,Actinin,Actinin
1,anti-IgE,antiIgE,
2,ASCA,ASCA,ASCA
3,Beta2GP1,Beta2GP1,Beta2GP1
4,C1q,C1q,C1q
...,...,...,...
96,,Strep15,
97,,Strep16,
98,TIF1gamma,TIF1gamma,TIF1gamma
99,TPO,TPO,TPO


- `TF` are the names in `blood_bank`
- `TB_all` are the names in the other dfs
- `TB_selection` are names of the variables that should be most interesting (e.g. excluding control spots on the chip).

Each row corresponds to the same variable, but it might have a different name in each column!

Rename the columns in `blood_bank` as in the other data sets:

In [7]:
new_colnames = df_cols.TB_all[df_cols.TF.notnull()].tolist() # list of new names for blood bank columns
blood_bank = blood_bank.drop(columns='Class') # this column is in blood_bank, but not in the list (we'll add it back later)
blood_bank.columns = new_colnames # rename columns as in other datasets

We want only the rows that have an entry in all three columns: these are the variables we want to use

In [8]:
keep_cols = df_cols.dropna().TB_all.tolist() # names of variables that exist in both datasets, and that are of interest
keep_cols

['Actinin',
 'ASCA',
 'Beta2GP1',
 'C1q',
 'C3b',
 'Cardiolipin',
 'CCP1arg',
 'CCP1cit',
 'CENP',
 'CMV',
 'CollagenII',
 'CpGmot',
 'CRP1',
 'DFS70',
 'dsDNA2',
 'Enolasearg',
 'Enolasecit',
 'EphB2',
 'FcER',
 'Fibrillarin',
 'Ficolin',
 'GAPDH',
 'GBM',
 'H2Bp',
 'H2Bpac',
 'H4p',
 'H4pac',
 'Histones',
 'IFNLambda',
 'IFNOmega',
 'Jo1',
 'Ku',
 'LaSSB',
 'MBL2',
 'Mi2',
 'Nucleosome',
 'PCNA',
 'Pentraxin3',
 'PmScl100',
 'RA33',
 'RipP0',
 'RipP0peptide',
 'RipP1',
 'RipP2',
 'RNAPolIII',
 'RNP70',
 'RNPA',
 'RNPC',
 'Ro52',
 'Ro60',
 'RPP25ThTo',
 'Scl70',
 'SmBB',
 'SMP',
 'TIF1gamma',
 'TPO',
 'tTG']

In all datasets, keep only columns of interest

In [9]:
blood_bank = blood_bank.loc[:,keep_cols] # keep only columns of interest
other_imid = other_imid.loc[:,keep_cols] 
non_imid = non_imid.loc[:,keep_cols]
sle = sle.loc[:,keep_cols]

Discard one SLE patient with missing data

In [None]:
sle.dropna() # serum from one SLE patient was not run on chip

And row-bind all the data frames together

In [11]:
# add class to distinguish from others
blood_bank['Class'] = "nonSLE"
other_imid['Class'] = "IMID" 
non_imid['Class'] = "nonIMID" 
sle['Class'] = "SLE"
# join all data frames together by binding rows
df_all = pd.concat([sle, other_imid, non_imid, blood_bank])

In [12]:
df_all['Class'].value_counts()

SLE        484
nonSLE     361
IMID       346
nonIMID    218
Name: Class, dtype: int64

# Write data

In [13]:
feather.write_dataframe(df_all, "../data/processed/imid.feather")