In [None]:
import numpy as np
import pandas as pd

In [None]:
# Input file paths, relative to the notebook's working directory.
# NOTE(review): the original comment here read "row expression" — presumably "raw expression"; confirm.
# Patient / sample information table (read below with sep='\t').
pat_info_fname = 'data/Selected_sampleInfo.txt'

# One table per omics layer; each is loaded through get_df.
lipid_fname = 'data/clean_lipidB_135exp_new.txt'
metabolic_fname = 'data/clean_metabolicB_135exp_new.txt'
protein_fname = 'data/clean_protein_unique_imputated_135Expr.txt'

# mRNA uses two files (update on 6.25):
#   xmrna_fname - original mRNA table, used by get_mRNA only to obtain the patient information columns
#   fmrna_fname - updated mRNA table (data/mRNA_GeneExpr625_filteredFPKM1.txt) that get_mRNA reformats
xmrna_fname = 'data/clean_mRNA_135exp.txt'
fmrna_fname = 'data/mRNA_GeneExpr625_filteredFPKM1.txt'


In [None]:
def get_df(fname, pat_info_fname=None, speical_patid=None, col_prefix=None, special_colname='sampleID', scale=True, sep='\t'):
    """
    Load a feature-by-sample table and return it transposed (one row per sample).

    fname: path or file-like object passed to pd.read_csv

    pat_info_fname: despite the name, a DataFrame of patient information (it is passed
        to pd.merge); must contain the column named by speical_patid and a
        'DiseaseState' column. When None, no patient information is attached.

    speical_patid: column of pat_info_fname holding the sample ids that match the
        sample column names of the table in fname

    col_prefix: when given, every feature name becomes '<col_prefix>_<name>'

    special_colname: column of the table in fname that holds the feature names

    scale: when True, standardize every feature with sklearn's StandardScaler

    sep: field separator passed to pd.read_csv

    Returns a DataFrame with one column per feature, followed by
    'raw_specific_pat_id' (the sample column names of the input table). When patient
    information is given, the remaining columns of pat_info_fname follow, then
    'label_trans_str' ('Death' and 'Critical' merged into 'Critical') and
    'label_trans' (Asymptomatic=0, Mild=1, Severe=2, Critical=3; any other state
    maps to NaN).
    """
    init_df = pd.read_csv(fname, sep=sep)

    # f_cols: feature (e.g. gene) names, optionally prefixed
    if col_prefix is not None:
        f_cols = [col_prefix + '_' + x for x in init_df[special_colname].values]
    else:
        f_cols = init_df[special_colname].values

    # Select the sample columns by name rather than by position, so that the value
    # matrix and the sample ids stay aligned even when the feature-name column is
    # not the first column of the file.
    sample_df = init_df.drop(columns=[special_colname])
    pat_id = list(sample_df.columns)

    # get measure values and transpose: rows become samples, columns become features
    m_val = sample_df.values.T

    if scale:
        # z-score every feature (column) across samples
        from sklearn.preprocessing import StandardScaler
        m_val = StandardScaler().fit_transform(m_val)
    re_df = pd.DataFrame(data=m_val, columns=f_cols)

    re_df['raw_specific_pat_id'] = pat_id
    if pat_info_fname is not None:
        # inner join: samples without a patient-information row are dropped
        re_df = pd.merge(re_df, pat_info_fname, how='inner',
                         left_on='raw_specific_pat_id', right_on=speical_patid)
        re_df = re_df.drop(columns=[speical_patid])
        re_df['label_trans_str'] = np.where(re_df['DiseaseState'].isin(['Death', 'Critical']), 'Critical', re_df['DiseaseState'])
        re_df['label_trans'] = re_df['label_trans_str'].map({'Asymptomatic':0,'Mild':1, 'Severe':2, 'Critical':3})
    return re_df

# 6-25 update mRNA data
def get_mRNA(xmrna_fname, pat_info_df, fmrna_fname):
    """
    Attach patient information to the updated mRNA expression table.

    xmrna_fname: original mRNA table, loaded through get_df only to obtain the mRNA patient information, file name string

    pat_info_df: all patient information, DataFrame; the columns 'RNA_sampleID', 'PatientID' and 'DiseaseState' are read

    fmrna_fname: updated mRNA data (tab-separated), file name string

    Returns a DataFrame whose 'mrna_'-prefixed columns come first and whose last
    five columns are the patient information columns taken from the get_df result.
    """

    def _patient_of(sample_label):
        # the patient id is the part of the label before the first underscore
        return sample_label.split('_')[0]

    rna_info = pat_info_df[['RNA_sampleID', 'PatientID', 'DiseaseState']]
    annotated = get_df(xmrna_fname, rna_info, speical_patid='RNA_sampleID',
                       col_prefix='mrna', scale=False)

    # keep only the trailing five columns of the get_df result (patient info)
    patient_cols = annotated.iloc[:, -5:].copy()

    # transpose the updated table; reset_index moves the former column labels
    # into a regular column, which the code below expects to be named 'index'
    expr = pd.read_csv(fmrna_fname, sep='\t').T.reset_index()
    expr = expr.rename(columns={name: 'mrna_' + name for name in expr.columns.to_list()})

    expr['PatientID'] = expr['mrna_index'].apply(_patient_of)
    expr = expr.drop(columns=['mrna_index'])

    merged = patient_cols.merge(expr, on='PatientID')

    # after the merge the five patient columns lead; move them behind the features
    ordered = merged.columns.to_list()
    return merged[ordered[5:] + ordered[:5]]

# Get patient information

In [None]:
# Read the tab-separated patient information table; the omics loaders below select
# their sample-id column plus 'PatientID' and 'DiseaseState' from it.
pat_info_df = pd.read_csv(pat_info_fname, sep='\t')

In [None]:
# Quick look at the patient table: (rows, columns), then the first rows.
print(pat_info_df.shape)
pat_info_df.head()

# Get Omics data

In [None]:
# Load every omics layer unscaled (scale=False) with its patient information attached.
# lipid
lipid_df = get_df(
    lipid_fname,
    pat_info_df[['Lipid_sampleID', 'PatientID', 'DiseaseState']],
    speical_patid='Lipid_sampleID',
    col_prefix='lipid',
    scale=False,
)

# metabolic
metabolic_df = get_df(
    metabolic_fname,
    pat_info_df[['Metabolic_sampleID', 'PatientID', 'DiseaseState']],
    speical_patid='Metabolic_sampleID',
    col_prefix='metabolic',
    scale=False,
)

# protein
protein_df = get_df(
    protein_fname,
    pat_info_df[['Protein_sampleID', 'PatientID', 'DiseaseState']],
    speical_patid='Protein_sampleID',
    col_prefix='protein',
    scale=False,
)

# mRNA (updated expression table joined with the original patient columns)
mrna_df = get_mRNA(xmrna_fname, pat_info_df, fmrna_fname)

# Write the unnormalized tables as tab-separated files.
for _frame, _out_path in (
    (lipid_df, 'f_data/orignal_lipid_df_135.csv'),
    (metabolic_df, 'f_data/orignal_metabolic_df_135.csv'),
    (protein_df, 'f_data/orignal_protein_df_135.csv'),
    (mrna_df, 'f_data/orignal_mrna_df_135.csv'),
):
    _frame.to_csv(_out_path, sep='\t', index=False)

# Report the column count of each table minus five (the trailing non-feature columns).
for _template, _frame in (
    ('lipid_df size: {}', lipid_df),
    ('metabolic_df size: {}', metabolic_df),
    ('protein_df size: {}', protein_df),
    ('mrna_df size: {}', mrna_df),
):
    print(_template.format(_frame.shape[1] - 5))

# Generate train and test data sets

In [None]:
def gen_train_test(x, test_ratio=0.2, random_state=123):
    """
    Stratified split of the rows of x into train and test patient ids.

    x: DataFrame with a 'label_trans' column (class label used for stratification)
        and a 'PatientID' column

    test_ratio: passed to train_test_split as test_size

    random_state: seed passed to train_test_split; the default 123 reproduces the
        split this function has always produced

    Returns (pid_train, pid_test, y_train, y_test): arrays of patient ids and of the
    matching labels for each side of the split.

    NOTE(review): the split is made per row; if a PatientID occurred on more than
    one row, the same id could end up on both sides - confirm ids are unique.
    """
    from sklearn.model_selection import train_test_split

    labels = x['label_trans'].values
    patient_ids = x['PatientID'].values

    # train_test_split applies the same row partition to every array it is given,
    # so ids and labels can be split together without going through an index array
    pid_train, pid_test, y_train, y_test = train_test_split(
        patient_ids,
        labels,
        stratify=labels,
        test_size=test_ratio,
        random_state=random_state,
    )

    return pid_train, pid_test, y_train, y_test

In [None]:
# Split once, using the PatientID / label_trans columns of the lipid table; the
# resulting id lists are reused below to select rows from every omics table.
pid_train, pid_test, y_train, y_test = gen_train_test(lipid_df, test_ratio=0.2)
train_pid_df = pd.DataFrame(data={'pid':pid_train, 'label':y_train})
test_pid_df = pd.DataFrame(data={'pid':pid_test,'label':y_test})
# Save the id/label pairs of each side as tab-separated files.
train_pid_df.to_csv('f_data/train_pid.csv', sep='\t', index=False)
test_pid_df.to_csv('f_data/test_pid.csv', sep='\t', index=False)
# Report the size and the per-label counts of each side.
print('len(pid_train):',len(pid_train))
print(train_pid_df['label'].value_counts()) 
print('len(pid_test):',len(pid_test))
print(test_pid_df['label'].value_counts()) 


# Normalize data

In [None]:
def normalize_data(in_df, out_fname=None, test_df=None):
    """
    Standardize the feature columns of in_df (and optionally of test_df).

    in_df: DataFrame whose last five columns are left untouched and whose remaining
        columns are treated as features; the StandardScaler is fitted on these rows

    out_fname: when given, the fitted scaler is pickled to this path

    test_df: optional DataFrame with the same layout; it is transformed with the
        scaler fitted on in_df (never re-fitted)

    Returns the normalized in_df, or the pair (normalized in_df, normalized test_df)
    when test_df is given. Features and the five trailing columns are re-joined with
    pd.concat(axis=1), which aligns on the index, so both parts are reset to a fresh
    0..n-1 index before being concatenated.
    """
    import pickle
    from sklearn.preprocessing import StandardScaler

    def _split(frame):
        # features | trailing five columns, both re-indexed from zero
        return (frame.iloc[:, :-5].reset_index(drop=True),
                frame.iloc[:, -5:].reset_index(drop=True))

    def _rebuild(scaler, features, trailing):
        scaled = pd.DataFrame(data=scaler.transform(features),
                              columns=features.columns.to_list())
        return pd.concat([scaled, trailing], axis=1)

    train_features, train_trailing = _split(in_df)
    fitted_scaler = StandardScaler().fit(train_features)

    if out_fname is not None:
        with open(out_fname, "wb") as scaler_file:
            pickle.dump(fitted_scaler, scaler_file)

    train_norm = _rebuild(fitted_scaler, train_features, train_trailing)
    if test_df is None:
        return train_norm

    test_features, test_trailing = _split(test_df)
    return train_norm, _rebuild(fitted_scaler, test_features, test_trailing)
    
        

In [None]:
# normalize full data
norm_lipid_df = normalize_data(lipid_df, out_fname='f_model/norm_full_lipid_df.pkl')
norm_lipid_df.to_csv('f_data/norm_full_lipid_df_135.csv', sep='\t', index=False)

norm_metabolic_df = normalize_data(metabolic_df, out_fname='f_model/norm_full_metabolic_df.pkl')
norm_metabolic_df.to_csv('f_data/norm_full_metabolic_df_135.csv', sep='\t', index=False)

norm_protein_df = normalize_data(protein_df, out_fname='f_model/norm_full_protein_df.pkl')
norm_protein_df.to_csv('f_data/norm_full_protein_df_135.csv', sep='\t', index=False)

norm_mrna_df = normalize_data(mrna_df, out_fname='f_model/norm_full_mrna_df.pkl')
norm_mrna_df.to_csv('f_data/norm_full_mrna_df_135.csv', sep='\t', index=False)


In [None]:
# normalize train and test data seperately
# lipid
# For every omics table: select train / test rows by PatientID membership in
# pid_train / pid_test, fit the scaler on the train rows only, apply it to both
# parts, pickle the scaler and write both normalized tables (tab-separated).
# NOTE(review): the "_108" / "_27" suffixes in the output names are fixed strings,
# presumably the expected train / test row counts - verify against the printed sizes.
norm_train_lipid_df, norm_test_lipid_df = normalize_data(
    lipid_df[lipid_df['PatientID'].isin(pid_train)],
    out_fname='f_model/norm_train_test_lipid_df.pkl',
    test_df=lipid_df[lipid_df['PatientID'].isin(pid_test)])
norm_train_lipid_df.to_csv('f_data/norm_train_lipid_df_108.csv', sep='\t', index=False)
norm_test_lipid_df.to_csv('f_data/norm_test_lipid_df_27.csv', sep='\t', index=False)

# metabolic
norm_train_metabolic_df, norm_test_metabolic_df = normalize_data(
    metabolic_df[metabolic_df['PatientID'].isin(pid_train)], 
    out_fname='f_model/norm_train_test_metabolic_df.pkl',
    test_df=metabolic_df[metabolic_df['PatientID'].isin(pid_test)])
norm_train_metabolic_df.to_csv('f_data/norm_train_metabolic_df_108.csv', sep='\t', index=False)
norm_test_metabolic_df.to_csv('f_data/norm_test_metabolic_df_27.csv', sep='\t', index=False)

# protein
norm_train_protein_df,norm_test_protein_df = normalize_data(
    protein_df[protein_df['PatientID'].isin(pid_train)], 
    out_fname='f_model/norm_train_test_protein_df.pkl',
    test_df=protein_df[protein_df['PatientID'].isin(pid_test)])
norm_train_protein_df.to_csv('f_data/norm_train_protein_df_108.csv', sep='\t', index=False)
norm_test_protein_df.to_csv('f_data/norm_test_protein_df_27.csv', sep='\t', index=False)

# mrna
norm_train_mrna_df,norm_test_mrna_df = normalize_data(
    mrna_df[mrna_df['PatientID'].isin(pid_train)], 
    out_fname='f_model/norm_train_test_mrna_df.pkl', 
    test_df=mrna_df[mrna_df['PatientID'].isin(pid_test)])
norm_train_mrna_df.to_csv('f_data/norm_train_mrna_df_108.csv', sep='\t', index=False)
norm_test_mrna_df.to_csv('f_data/norm_test_mrna_df_27.csv', sep='\t', index=False)

In [None]:
# (rows, columns) of the normalized mRNA training table
norm_train_mrna_df.shape

In [None]:
# First rows of the normalized mRNA training table
norm_train_mrna_df.head()