In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [62]:
plasma = pd.read_csv('dataset/plasma_processed.csv')
serum = pd.read_csv('dataset/serum_processed.csv')
plasma = plasma.drop(['Unnamed: 0'], axis=1)
serum = serum.drop(['Unnamed: 0'], axis=1)
plasma

Unnamed: 0,1_5-anhydroglucitol,1-monostearin,2_3-dihydroxybutanoic acid NIST,2_4-diaminobutyric acid,2-deoxyerythritol,2-deoxytetronic acid,2-hydroxybutanoic acid,2-hydroxyglutaric acid,2-hydroxyhippuric acid,2-hydroxyvaleric acid,...,tryptophan,tyrosine,UDP-glucuronic acid,urea,uric acid,uridine,valine,xylitol,xylose,Class
0,9483,103,101,963,389,123,20744,139,36,955,...,27450,52403,73,255067,10057,63,157332,154,1380,disease
1,27468,207,155,335,227,178,8611,78,91,103,...,17898,37179,157,180080,8386,118,103083,324,929,disease
2,13976,86,78,176,152,83,22792,92,97,1010,...,29503,30670,211,138861,15822,119,92045,253,610,disease
3,19449,178,104,467,270,157,29418,123,92,289,...,16501,27573,71,183633,14024,102,97349,223,746,disease
4,17209,258,103,979,314,211,16713,76,42,1126,...,24023,30096,222,255081,23154,58,159671,189,2022,disease
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,25149,184,154,444,142,244,20728,51,128,186,...,30518,36411,114,226793,17880,57,114920,252,856,control
78,24181,142,135,786,226,146,10688,64,59,80,...,23731,26506,110,365044,29018,61,127248,359,1003,control
79,23848,118,65,132,352,294,10077,42,56,867,...,30277,35764,158,327943,19736,76,78152,453,1279,control
80,13848,97,65,881,444,122,14869,75,203,1104,...,14930,18921,2176,323303,17479,108,96830,1450,1045,control


In [63]:
plasma.dtypes

1_5-anhydroglucitol                 int64
1-monostearin                       int64
2_3-dihydroxybutanoic acid NIST     int64
2_4-diaminobutyric acid             int64
2-deoxyerythritol                   int64
                                    ...  
uridine                             int64
valine                              int64
xylitol                             int64
xylose                              int64
Class                              object
Length: 159, dtype: object

In [64]:
from scipy import stats

pearson_p = []
for i in plasma.columns:
    if plasma[i].dtypes!='O':
        pearson_coef, p_value = stats.pearsonr(plasma[i], plasma['Class'].map({"control": 0, "disease": 1}))
        pearson_p.append([i, pearson_coef, p_value])
pearson_p_df = pd.DataFrame(pearson_p, columns=['Features', 'Pearson Coefficient', 'p-Value'])
pearson_p_df

Unnamed: 0,Features,Pearson Coefficient,p-Value
0,1_5-anhydroglucitol,-0.05,0.65
1,1-monostearin,0.18,0.11
2,2_3-dihydroxybutanoic acid NIST,0.08,0.49
3,2_4-diaminobutyric acid,0.08,0.50
4,2-deoxyerythritol,0.06,0.59
...,...,...,...
153,uric acid,-0.31,0.00
154,uridine,-0.00,0.99
155,valine,-0.09,0.40
156,xylitol,-0.07,0.54


In [65]:
pearson_p_final = []
for i in range(158):
    if pearson_p_df.loc[i, 'p-Value'] < 0.01:
        pearson_p_final.append(pearson_p_df.loc[i, 'Features'])
len(pearson_p_final)

22

# Serum

In [66]:
pearson_p_serum = []
for i in serum.columns:
    if serum[i].dtypes!='O':
        pearson_coef, p_value = stats.pearsonr(serum[i], serum['Class'].map({"control": 0, "disease": 1}))
        pearson_p_serum.append([i, pearson_coef, p_value])
pearson_p_df_serum = pd.DataFrame(pearson_p_serum, columns=['Features', 'Pearson Coefficient', 'p-Value'])
pearson_p_df_serum

Unnamed: 0,Features,Pearson Coefficient,p-Value
0,1_5-anhydroglucitol,-0.07,0.50
1,1-monostearin,-0.05,0.67
2,2_3-dihydroxybutanoic acid NIST,-0.18,0.11
3,2_4-diaminobutyric acid,-0.14,0.21
4,2-deoxyerythritol,0.02,0.88
...,...,...,...
153,uric acid,-0.28,0.01
154,uridine,0.07,0.52
155,valine,-0.10,0.39
156,xylitol,-0.13,0.24


In [67]:
pearson_p_final_serum = []
for i in range(158):
    if pearson_p_df_serum.loc[i, 'p-Value'] < 0.01:
        pearson_p_final_serum.append(pearson_p_df_serum.loc[i, 'Features'])
len(pearson_p_final_serum)

10

### So important variables we derived:

'3-phosphoglycerate',
 '5-methoxytryptamine',
 'adenosine-5-monophosphate',
 'asparagine',
 'aspartic acid',
 'lactic acid',
 'phenol',
 'pyrophosphate',
 'pyruvic acid',
 'taurine'

In [68]:
# necessary functions
from sklearn.decomposition import PCA
labels = np.reshape(plasma['Class'].to_numpy(), (82,1))

def pca_analysis(df, n_components):
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(df.iloc[:, :-1].values)

    pca_concatenate = np.concatenate([pca_result, labels], axis=1)
    return pca_concatenate

def make_dataframe(array, n_features):
    col = []
    for i in range(1, n_features+1):
        col.append(f'pc{i}')
    col.append('Class')
    return pd.DataFrame(array, columns = col)

from sklearn.manifold import TSNE
def tsne_analysis(df, n_components):
    tsne = TSNE(n_components=n_components)
    tsne_result = tsne.fit_transform(df.iloc[:, :-1].values)

    tsne_concatenate = np.concatenate([tsne_result, labels], axis=1)
    return tsne_concatenate
def tsne_dataframe(array, n_features):
    col = []
    for i in range(1, n_features+1):
        col.append(f'tsne{i}')
    col.append('Class')
    return pd.DataFrame(array, columns = col)

def get_xy(df):
  X = df.drop('Class', axis=1)
  X = X.apply(pd.to_numeric)
  y = df['Class'].copy()
  y = y.map({"control": 0, "disease": 1})
  return X, y

# Lzypredict

In [69]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import classification_report

from sklearn.model_selection import ShuffleSplit, train_test_split, StratifiedKFold

In [70]:
imp0 = pearson_p_final
imp1 = pearson_p_final_serum

In [71]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder

In [72]:
accuracy = []
for k in [0, 2, 3, 4, 5, 22, 23]:
    
    if k==0:
        name = ['Plasma(Exploratory)', 'Serum(Exploratory)']
        dfs = [plasma[imp0], serum[imp1]]
    
    elif k==22 or k==23:
        k -= 20
        name = [f'tsne{k}_plasma(Exploratory)', f'tsne{k}_serum(Exploratory)']
        vars()[f'tsne{k}_plasma'] = tsne_analysis(plasma[imp0], k)
        vars()[f'tsne{k}_plasma_df'] = make_dataframe(vars()[f'tsne{k}_plasma'], k)
        
        vars()[f'tsne{k}_serum'] = pca_analysis(serum[imp1], k)
        vars()[f'tsne{k}_serum_df'] = make_dataframe(vars()[f'tsne{k}_serum'], k)
        
        dfs = [vars()[f'tsne{k}_plasma_df'], vars()[f'tsne{k}_serum_df']]
    
    else:
        name = [f'pca{k}_plasma(Exploratory)', f'pca{k}_serum(Exploratory)']
        vars()[f'pca{k}_plasma'] = pca_analysis(plasma[imp0], k)
        vars()[f'pca{k}_plasma_df'] = make_dataframe(vars()[f'pca{k}_plasma'], k)
        
        vars()[f'pca{k}_serum'] = pca_analysis(serum[imp1], k)
        vars()[f'pca{k}_serum_df'] = make_dataframe(vars()[f'pca{k}_serum'], k)
        
        dfs = [vars()[f'pca{k}_plasma_df'], vars()[f'pca{k}_serum_df']]
    
    random_state = 42

    for idx, data in enumerate(dfs):
        result = []
        if data.columns[-1]=='Class':
            X = data.drop('Class', axis=1)
        else:
            X = data
        y = plasma['Class'].copy()
        y = y.map({"control": 0, "disease": 1})
        rs = ShuffleSplit(n_splits=10, test_size=.2, random_state=random_state)

        for train_index, test_index in rs.split(X):
            clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=matthews_corrcoef, predictions=True)
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            scaler = RobustScaler()

            cols = X_train.columns
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            X_train = pd.DataFrame(X_train, columns=[cols])
            X_test = pd.DataFrame(X_test, columns=[cols])

            models, predictions = clf.fit(X_train, X_test, y_train, y_test)
            result.append(models)
        #     print(result)

        acc = []
        for i in range(10):
            acc.append(result[i].iloc[0]['Balanced Accuracy'])

        print(f'\n{name[idx]}: {sum(acc)/len(acc)}')
        accuracy.append([f'{name[idx]}', f'{sum(acc)/len(acc)}'])

100%|██████████| 29/29 [00:00<00:00, 39.79it/s]
100%|██████████| 29/29 [00:00<00:00, 39.52it/s]
100%|██████████| 29/29 [00:00<00:00, 39.31it/s]
100%|██████████| 29/29 [00:00<00:00, 39.87it/s]
100%|██████████| 29/29 [00:00<00:00, 42.00it/s]
100%|██████████| 29/29 [00:00<00:00, 39.87it/s]
100%|██████████| 29/29 [00:00<00:00, 41.74it/s]
100%|██████████| 29/29 [00:00<00:00, 41.48it/s]
100%|██████████| 29/29 [00:00<00:00, 41.37it/s]
100%|██████████| 29/29 [00:00<00:00, 30.41it/s]
  7%|▋         | 2/29 [00:00<00:01, 16.67it/s]


Plasma(Exploratory): 0.878559634809635


100%|██████████| 29/29 [00:00<00:00, 31.58it/s]
100%|██████████| 29/29 [00:00<00:00, 33.48it/s]
100%|██████████| 29/29 [00:00<00:00, 41.25it/s]
100%|██████████| 29/29 [00:00<00:00, 42.85it/s]
100%|██████████| 29/29 [00:00<00:00, 41.92it/s]
100%|██████████| 29/29 [00:00<00:00, 39.56it/s]
100%|██████████| 29/29 [00:00<00:00, 42.99it/s]
100%|██████████| 29/29 [00:00<00:00, 36.98it/s]
100%|██████████| 29/29 [00:00<00:00, 40.85it/s]
100%|██████████| 29/29 [00:00<00:00, 33.30it/s]
  7%|▋         | 2/29 [00:00<00:01, 17.70it/s]


Serum(Exploratory): 0.8702748640248641


100%|██████████| 29/29 [00:00<00:00, 38.96it/s]
100%|██████████| 29/29 [00:00<00:00, 43.12it/s]
100%|██████████| 29/29 [00:00<00:00, 42.69it/s]
100%|██████████| 29/29 [00:00<00:00, 43.27it/s]
100%|██████████| 29/29 [00:00<00:00, 42.44it/s]
100%|██████████| 29/29 [00:00<00:00, 42.53it/s]
100%|██████████| 29/29 [00:00<00:00, 42.08it/s]
100%|██████████| 29/29 [00:00<00:00, 42.90it/s]
100%|██████████| 29/29 [00:00<00:00, 43.48it/s]
100%|██████████| 29/29 [00:00<00:00, 39.12it/s]
  7%|▋         | 2/29 [00:00<00:01, 16.39it/s]


pca2_plasma(Exploratory): 0.8382721445221444


100%|██████████| 29/29 [00:00<00:00, 41.16it/s]
100%|██████████| 29/29 [00:00<00:00, 41.85it/s]
100%|██████████| 29/29 [00:00<00:00, 42.30it/s]
100%|██████████| 29/29 [00:00<00:00, 41.63it/s]
100%|██████████| 29/29 [00:00<00:00, 42.10it/s]
100%|██████████| 29/29 [00:00<00:00, 35.97it/s]
100%|██████████| 29/29 [00:00<00:00, 44.30it/s]
100%|██████████| 29/29 [00:00<00:00, 41.03it/s]
100%|██████████| 29/29 [00:00<00:00, 42.63it/s]
100%|██████████| 29/29 [00:00<00:00, 42.22it/s]
  7%|▋         | 2/29 [00:00<00:01, 16.81it/s]


pca2_serum(Exploratory): 0.8320270007770008


100%|██████████| 29/29 [00:00<00:00, 42.07it/s]
100%|██████████| 29/29 [00:00<00:00, 41.33it/s]
100%|██████████| 29/29 [00:00<00:00, 37.75it/s]
100%|██████████| 29/29 [00:00<00:00, 36.94it/s]
100%|██████████| 29/29 [00:00<00:00, 31.76it/s]
100%|██████████| 29/29 [00:00<00:00, 38.73it/s]
100%|██████████| 29/29 [00:00<00:00, 36.89it/s]
100%|██████████| 29/29 [00:00<00:00, 37.67it/s]
100%|██████████| 29/29 [00:00<00:00, 42.77it/s]
100%|██████████| 29/29 [00:00<00:00, 32.58it/s]
  7%|▋         | 2/29 [00:00<00:01, 15.87it/s]


pca3_plasma(Exploratory): 0.796362665112665


100%|██████████| 29/29 [00:00<00:00, 41.49it/s]
100%|██████████| 29/29 [00:00<00:00, 42.57it/s]
100%|██████████| 29/29 [00:00<00:00, 42.35it/s]
100%|██████████| 29/29 [00:00<00:00, 40.43it/s]
100%|██████████| 29/29 [00:00<00:00, 29.19it/s]
100%|██████████| 29/29 [00:00<00:00, 41.07it/s]
100%|██████████| 29/29 [00:00<00:00, 39.56it/s]
100%|██████████| 29/29 [00:00<00:00, 41.42it/s]
100%|██████████| 29/29 [00:00<00:00, 38.00it/s]
100%|██████████| 29/29 [00:00<00:00, 36.49it/s]
  3%|▎         | 1/29 [00:00<00:03,  8.77it/s]


pca3_serum(Exploratory): 0.8505244755244756


100%|██████████| 29/29 [00:00<00:00, 37.91it/s]
100%|██████████| 29/29 [00:00<00:00, 42.76it/s]
100%|██████████| 29/29 [00:00<00:00, 42.21it/s]
100%|██████████| 29/29 [00:00<00:00, 39.27it/s]
100%|██████████| 29/29 [00:00<00:00, 41.76it/s]
100%|██████████| 29/29 [00:00<00:00, 42.61it/s]
100%|██████████| 29/29 [00:00<00:00, 42.38it/s]
100%|██████████| 29/29 [00:00<00:00, 40.51it/s]
100%|██████████| 29/29 [00:00<00:00, 38.09it/s]
100%|██████████| 29/29 [00:00<00:00, 39.11it/s]
  7%|▋         | 2/29 [00:00<00:01, 16.00it/s]


pca4_plasma(Exploratory): 0.8274135586635586


100%|██████████| 29/29 [00:00<00:00, 34.21it/s]
100%|██████████| 29/29 [00:00<00:00, 38.90it/s]
100%|██████████| 29/29 [00:00<00:00, 38.64it/s]
100%|██████████| 29/29 [00:00<00:00, 41.15it/s]
100%|██████████| 29/29 [00:00<00:00, 42.53it/s]
100%|██████████| 29/29 [00:00<00:00, 41.76it/s]
100%|██████████| 29/29 [00:00<00:00, 34.17it/s]
100%|██████████| 29/29 [00:00<00:00, 42.97it/s]
100%|██████████| 29/29 [00:00<00:00, 42.72it/s]
100%|██████████| 29/29 [00:00<00:00, 41.15it/s]
  7%|▋         | 2/29 [00:00<00:01, 16.39it/s]


pca4_serum(Exploratory): 0.887956487956488


100%|██████████| 29/29 [00:00<00:00, 39.85it/s]
100%|██████████| 29/29 [00:00<00:00, 43.66it/s]
100%|██████████| 29/29 [00:00<00:00, 42.48it/s]
100%|██████████| 29/29 [00:00<00:00, 40.89it/s]
100%|██████████| 29/29 [00:00<00:00, 42.04it/s]
100%|██████████| 29/29 [00:00<00:00, 40.35it/s]
100%|██████████| 29/29 [00:01<00:00, 27.61it/s]
100%|██████████| 29/29 [00:00<00:00, 40.85it/s]
100%|██████████| 29/29 [00:00<00:00, 29.59it/s]
100%|██████████| 29/29 [00:00<00:00, 36.50it/s]
  7%|▋         | 2/29 [00:00<00:01, 15.15it/s]


pca5_plasma(Exploratory): 0.8250728438228437


100%|██████████| 29/29 [00:00<00:00, 39.10it/s]
100%|██████████| 29/29 [00:00<00:00, 31.31it/s]
100%|██████████| 29/29 [00:00<00:00, 32.03it/s]
100%|██████████| 29/29 [00:00<00:00, 38.14it/s]
100%|██████████| 29/29 [00:00<00:00, 39.67it/s]
100%|██████████| 29/29 [00:00<00:00, 37.46it/s]
100%|██████████| 29/29 [00:00<00:00, 40.01it/s]
100%|██████████| 29/29 [00:00<00:00, 41.42it/s]
100%|██████████| 29/29 [00:00<00:00, 42.24it/s]
100%|██████████| 29/29 [00:00<00:00, 41.37it/s]



pca5_serum(Exploratory): 0.8663704351204352


100%|██████████| 29/29 [00:00<00:00, 42.48it/s]
100%|██████████| 29/29 [00:00<00:00, 33.49it/s]
100%|██████████| 29/29 [00:01<00:00, 26.32it/s]
100%|██████████| 29/29 [00:00<00:00, 31.56it/s]
100%|██████████| 29/29 [00:00<00:00, 30.12it/s]
100%|██████████| 29/29 [00:00<00:00, 43.48it/s]
100%|██████████| 29/29 [00:00<00:00, 39.10it/s]
100%|██████████| 29/29 [00:00<00:00, 41.53it/s]
100%|██████████| 29/29 [00:00<00:00, 41.31it/s]
100%|██████████| 29/29 [00:00<00:00, 41.84it/s]
  7%|▋         | 2/29 [00:00<00:01, 17.54it/s]


tsne2_plasma(Exploratory): 0.8122037684537684


100%|██████████| 29/29 [00:00<00:00, 40.33it/s]
100%|██████████| 29/29 [00:00<00:00, 42.69it/s]
100%|██████████| 29/29 [00:00<00:00, 42.28it/s]
100%|██████████| 29/29 [00:00<00:00, 40.97it/s]
100%|██████████| 29/29 [00:00<00:00, 32.73it/s]
100%|██████████| 29/29 [00:00<00:00, 33.14it/s]
100%|██████████| 29/29 [00:00<00:00, 36.99it/s]
100%|██████████| 29/29 [00:00<00:00, 38.95it/s]
100%|██████████| 29/29 [00:00<00:00, 36.42it/s]
100%|██████████| 29/29 [00:00<00:00, 38.16it/s]



tsne2_serum(Exploratory): 0.8320270007770008


100%|██████████| 29/29 [00:00<00:00, 34.61it/s]
100%|██████████| 29/29 [00:00<00:00, 30.66it/s]
100%|██████████| 29/29 [00:00<00:00, 36.43it/s]
100%|██████████| 29/29 [00:00<00:00, 40.15it/s]
100%|██████████| 29/29 [00:00<00:00, 40.80it/s]
100%|██████████| 29/29 [00:00<00:00, 41.51it/s]
100%|██████████| 29/29 [00:00<00:00, 41.20it/s]
100%|██████████| 29/29 [00:00<00:00, 42.92it/s]
100%|██████████| 29/29 [00:00<00:00, 42.27it/s]
100%|██████████| 29/29 [00:00<00:00, 43.23it/s]
  7%|▋         | 2/29 [00:00<00:01, 16.67it/s]


tsne3_plasma(Exploratory): 0.7874893162393162


100%|██████████| 29/29 [00:00<00:00, 34.58it/s]
100%|██████████| 29/29 [00:00<00:00, 34.13it/s]
100%|██████████| 29/29 [00:00<00:00, 40.49it/s]
100%|██████████| 29/29 [00:00<00:00, 39.53it/s]
100%|██████████| 29/29 [00:00<00:00, 41.02it/s]
100%|██████████| 29/29 [00:00<00:00, 42.91it/s]
100%|██████████| 29/29 [00:00<00:00, 40.76it/s]
100%|██████████| 29/29 [00:00<00:00, 42.19it/s]
100%|██████████| 29/29 [00:00<00:00, 37.89it/s]
100%|██████████| 29/29 [00:00<00:00, 40.92it/s]


tsne3_serum(Exploratory): 0.8505244755244756





In [75]:
accuracy_df = pd.DataFrame(accuracy, columns=['Processed Dataset', 'Accuracy'])
accuracy_df.to_csv('accuracy/accuracy_exploratory.csv', index=False)
accuracy_df

Unnamed: 0,Processed Dataset,Accuracy
0,Plasma(Exploratory),0.878559634809635
1,Serum(Exploratory),0.8702748640248641
2,pca2_plasma(Exploratory),0.8382721445221444
3,pca2_serum(Exploratory),0.8320270007770008
4,pca3_plasma(Exploratory),0.796362665112665
5,pca3_serum(Exploratory),0.8505244755244756
6,pca4_plasma(Exploratory),0.8274135586635586
7,pca4_serum(Exploratory),0.887956487956488
8,pca5_plasma(Exploratory),0.8250728438228437
9,pca5_serum(Exploratory),0.8663704351204352


In [74]:
final_acc = pd.concat([accuracy_df, pd.read_csv('accuracy/accuracy_Raw.csv'),\
                       pd.read_csv('accuracy/accuracy_Specific Metabolics.csv')], ignore_index=True)
final_acc.to_csv('accuracy/accuracy_final.csv')
final_acc

Unnamed: 0.1,Processed Dataset,Accuracy,Unnamed: 0
0,Plasma(Exploratory),0.878559634809635,
1,Serum(Exploratory),0.8702748640248641,
2,pca2_plasma(Exploratory),0.8382721445221444,
3,pca2_serum(Exploratory),0.8320270007770008,
4,pca3_plasma(Exploratory),0.796362665112665,
5,pca3_serum(Exploratory),0.8505244755244756,
6,pca4_plasma(Exploratory),0.8274135586635586,
7,pca4_serum(Exploratory),0.887956487956488,
8,pca5_plasma(Exploratory),0.8250728438228437,
9,pca5_serum(Exploratory),0.8663704351204352,
