In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, classification_report, make_scorer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

In [2]:
data = pd.read_csv('../data/feature_selected_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001617,...,ENSG00000272305,ENSG00000272414,ENSG00000272442,ENSG00000272658,ENSG00000272869,ENSG00000273079,ENSG00000273173,ENSG00000273274,ENSG00000273294,group
0,0,5.965571,1.612375,4.150662,2.975845,11.005488,6.825329,4.22145,2.884834,2.213457,...,0.265925,-0.005377,1.546032,1.36546,0.69123,5.637483,-0.005377,2.683536,-0.339797,
1,1,5.741587,2.147793,3.732756,3.199989,10.8607,6.453687,4.218183,3.199989,2.543722,...,0.1015,-0.515172,1.806756,0.93549,1.004202,5.975612,0.532134,2.926666,0.435919,
2,2,5.996891,0.418542,3.634637,2.949733,10.934025,6.437658,3.736947,2.683041,2.373873,...,-0.184123,-0.546693,1.127079,0.675699,0.943633,5.531648,-0.184123,2.260662,-0.691083,
3,3,5.551919,0.702492,3.853979,2.991061,10.760445,6.71084,4.003661,2.991061,2.71276,...,-0.178864,2.538993,1.301129,0.702492,0.43119,5.571799,-0.034474,2.341393,0.096771,
4,4,6.430237,1.215978,3.61422,2.83613,11.491427,7.437655,4.377965,1.794991,2.189011,...,-0.079478,-1.216981,0.16153,0.898496,0.085581,5.636848,-1.216981,1.351861,-0.079478,


In [3]:
data.drop(columns=data.columns[0], inplace=True)

In [4]:
# fix values of last column: group
expression_data = pd.read_csv('../data/expression_data.csv')
data['group'] = expression_data['Simplified_class']
del expression_data

In [5]:
data.group.head()

0    Normal
1    Normal
2    Normal
3    Normal
4    Normal
Name: group, dtype: object

In [6]:
# check values of 'group' column
data.group.value_counts()

group
Normal                   74
Advanced_fibrosis        65
Non_advanced_Fibrosis    53
Name: count, dtype: int64

In [7]:
X = data.drop('group', axis=1)
y = data.group

In [8]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, random_state=24, test_size=0.2)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, random_state=24, test_size=0.5)

In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [10]:
pca = PCA()
X_train_reduced = pca.fit_transform(X_train_scaled)
pca.n_components

In [11]:
pca.n_components_

153

## Model Selection 

In [12]:
# Logistic Regression 
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_reduced, y_train)
# SVM
svc = SVC()
svc.fit(X_train_reduced, y_train)
# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train_reduced, y_train)
# KNN
knn = KNeighborsClassifier()
knn.fit(X_train_reduced, y_train);

In [13]:
# validation data preprocessing
X_val_scaled = scaler.transform(X_val)
X_val_reduced = pca.transform(X_val_scaled)
encoder = OneHotEncoder()
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1)).toarray()
y_val_encoded = encoder.transform(y_val.values.reshape(-1, 1)).toarray()

In [14]:
# defining model
model = Sequential()
model.add(Dense(64, input_shape=(153,), activation='relu')) 
model.add(Dropout(0.5))  
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax')) 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [15]:
# f1 metric for training neural network
def f1_score(y_true, y_pred):
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true) * y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true * (1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)

In [16]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=[f1_score])
model.summary()

In [19]:
history = model.fit(X_train_reduced, y_train_encoded, epochs=100, batch_size=16,
                    validation_data=(X_val_reduced, y_val_encoded), verbose=2, 
                    )

Epoch 1/100
10/10 - 1s - 82ms/step - f1_score: 0.2710 - loss: 13.5894 - val_f1_score: 0.0627 - val_loss: 3.6974
Epoch 2/100
10/10 - 0s - 5ms/step - f1_score: 0.3459 - loss: 10.0186 - val_f1_score: 0.4373 - val_loss: 1.8282
Epoch 3/100
10/10 - 0s - 5ms/step - f1_score: 0.3820 - loss: 9.0624 - val_f1_score: 0.4891 - val_loss: 1.4793
Epoch 4/100
10/10 - 0s - 4ms/step - f1_score: 0.4988 - loss: 6.7037 - val_f1_score: 0.4891 - val_loss: 1.2815
Epoch 5/100
10/10 - 0s - 5ms/step - f1_score: 0.5050 - loss: 6.5261 - val_f1_score: 0.4891 - val_loss: 1.1772
Epoch 6/100
10/10 - 0s - 5ms/step - f1_score: 0.5229 - loss: 6.0012 - val_f1_score: 0.4891 - val_loss: 1.1082
Epoch 7/100
10/10 - 0s - 4ms/step - f1_score: 0.4772 - loss: 5.2344 - val_f1_score: 0.4984 - val_loss: 1.0234
Epoch 8/100
10/10 - 0s - 5ms/step - f1_score: 0.4988 - loss: 4.3262 - val_f1_score: 0.5261 - val_loss: 0.9430
Epoch 9/100
10/10 - 0s - 5ms/step - f1_score: 0.5494 - loss: 4.2511 - val_f1_score: 0.5261 - val_loss: 0.8899
Epoch 1

In [23]:
# evalation
log_reg_val_pred = log_reg.predict(X_val_reduced)
svc_val_pred = svc.predict(X_val_reduced)
rf_val_pred = rf.predict(X_val_reduced)
knn_val_pred = knn.predict(X_val_reduced)

log_reg_val_f1 = f1_score(y_val, log_reg_val_pred, average=None)
svc_val_f1 = f1_score(y_val, svc_val_pred, average=None)
rf_val_f1 = f1_score(y_val, rf_val_pred, average=None)
knn_val_f1 = f1_score(y_val, knn_val_pred, average=None)

print(f'Logistic Regression F1: {log_reg_val_f1}')
print(f'SVC F1: {svc_val_f1}')
print(f'Random Forest F1: {rf_val_f1}')
print(f'KNN F1: {knn_val_f1}')

Logistic Regression F1: [0.8        0.85714286 0.95238095]
SVC F1: [0.88888889 0.88888889 0.9       ]
Random Forest F1: [0.66666667 0.66666667 0.95652174]
KNN F1: [0.85714286 0.8        0.95238095]


In [24]:
log_reg_val_f1_micro = f1_score(y_val, log_reg_val_pred, average='micro')
svc_val_f1_micro = f1_score(y_val, svc_val_pred, average='micro')
rf_val_f1_micro = f1_score(y_val, rf_val_pred, average='micro')
knn_val_f1_micro = f1_score(y_val, knn_val_pred, average='micro')

print(f'Logistic Regression F1: {log_reg_val_f1_micro}')
print(f'SVC F1: {svc_val_f1_micro}')
print(f'Random Forest F1: {rf_val_f1_micro}')
print(f'KNN F1: {knn_val_f1_micro}')

Logistic Regression F1: 0.8947368421052632
SVC F1: 0.8947368421052632
Random Forest F1: 0.8421052631578947
KNN F1: 0.8947368421052632


In [25]:
log_reg_val_f1_macro = f1_score(y_val, log_reg_val_pred, average='macro')
svc_val_f1_macro = f1_score(y_val, svc_val_pred, average='macro')
rf_val_f1_macro = f1_score(y_val, rf_val_pred, average='macro')
knn_val_f1_macro = f1_score(y_val, knn_val_pred, average='macro')

print(f'Logistic Regression F1: {log_reg_val_f1_macro}')
print(f'SVC F1: {svc_val_f1_macro}')
print(f'Random Forest F1: {rf_val_f1_macro}')
print(f'KNN F1: {knn_val_f1_macro}')

Logistic Regression F1: 0.8698412698412697
SVC F1: 0.8925925925925927
Random Forest F1: 0.7632850241545892
KNN F1: 0.8698412698412697


In [26]:
log_reg_val_f1_weighted = f1_score(y_val, log_reg_val_pred, average='weighted')
svc_val_f1_weighted = f1_score(y_val, svc_val_pred, average='weighted')
rf_val_f1_weighted = f1_score(y_val, rf_val_pred, average='weighted')
knn_val_f1_weighted = f1_score(y_val, knn_val_pred, average='weighted')

print(f'Logistic Regression F1: {log_reg_val_f1_weighted}')
print(f'SVC F1: {svc_val_f1_weighted}')
print(f'Random Forest F1: {rf_val_f1_weighted}')
print(f'KNN F1: {knn_val_f1_weighted}')

Logistic Regression F1: 0.900250626566416
SVC F1: 0.8953216374269006
Random Forest F1: 0.8344774980930587
KNN F1: 0.900250626566416


In [28]:
print('Classification Report of Logistic Regression')
print(classification_report(y_val, log_reg_val_pred))

Classification Report of Logistic Regression
                       precision    recall  f1-score   support

    Advanced_fibrosis       0.67      1.00      0.80         4
Non_advanced_Fibrosis       1.00      0.75      0.86         4
               Normal       1.00      0.91      0.95        11

             accuracy                           0.89        19
            macro avg       0.89      0.89      0.87        19
         weighted avg       0.93      0.89      0.90        19



In [29]:
print('Classification Report of KNN')
print(classification_report(y_val, knn_val_pred))

Classification Report of KNN
                       precision    recall  f1-score   support

    Advanced_fibrosis       1.00      0.75      0.86         4
Non_advanced_Fibrosis       0.67      1.00      0.80         4
               Normal       1.00      0.91      0.95        11

             accuracy                           0.89        19
            macro avg       0.89      0.89      0.87        19
         weighted avg       0.93      0.89      0.90        19



In [39]:
# grid search
param_grid = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']}
f1_scorer = make_scorer(f1_score, average='weighted')
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, scoring=f1_scorer, cv=3, verbose=1)
grid_search.fit(X_train_reduced, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [40]:
print(f'Best F1: {grid_search.best_score_}')
print(f'Best Params: {grid_search.best_params_}')
best_model = grid_search.best_estimator_

Best F1: 0.8294397757507479
Best Params: {'C': 0.1, 'solver': 'lbfgs'}


In [41]:
# final test set validation
X_test_scaled = scaler.transform(X_test)
X_test_reduced = pca.transform(X_test_scaled)
y_test_pred = best_model.predict(X_test_reduced)
print(classification_report(y_test, y_test_pred))

                       precision    recall  f1-score   support

    Advanced_fibrosis       0.60      1.00      0.75         6
Non_advanced_Fibrosis       0.86      0.75      0.80         8
               Normal       1.00      0.50      0.67         6

             accuracy                           0.75        20
            macro avg       0.82      0.75      0.74        20
         weighted avg       0.82      0.75      0.74        20



In [1]:
best_model

NameError: name 'best_model' is not defined