In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('../data/whole_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000273079,ENSG00000273173,ENSG00000273259,ENSG00000273274,ENSG00000273294,SEX,BMI_surg,Age,Diabet,Simplified_class
0,DLDR_0001,5.965571,1.612375,4.133821,4.111056,4.150662,2.975845,11.005488,4.405768,6.825329,...,5.637483,-0.005377,1.6758,2.683536,-0.339797,Female,35.214555,55,Non Diabetic,Normal
1,DLDR_0002,5.741587,2.147793,4.120969,3.922234,3.732756,3.199989,10.8607,3.89535,6.453687,...,5.975612,0.532134,1.555218,2.926666,0.435919,Female,39.421748,47,Diabetic,Normal
2,DLDR_0003,5.996891,0.418542,4.086129,3.964871,3.634637,2.949733,10.934025,4.282577,6.437658,...,5.531648,-0.184123,2.391906,2.260662,-0.691083,Male,48.758108,46,Non Diabetic,Normal
3,DLDR_0004,5.551919,0.702492,4.11624,3.97835,3.853979,2.991061,10.760445,4.297722,6.71084,...,5.571799,-0.034474,1.639298,2.341393,0.096771,Female,41.822607,36,Non Diabetic,Normal
4,DLDR_0005,6.430237,1.215978,4.393797,4.018235,3.61422,2.83613,11.491427,4.405558,7.437655,...,5.636848,-1.216981,1.97416,1.351861,-0.079478,Female,53.582192,54,Non Diabetic,Normal


In [3]:
data.drop(data.columns[0], axis=1, inplace=True)

In [4]:
encoded_data = pd.get_dummies(data[['SEX', 'Diabet']]).astype(int)
encoded_data

Unnamed: 0,SEX_Female,SEX_Male,Diabet_Diabetic,Diabet_Non Diabetic
0,1,0,0,1
1,1,0,1,0
2,0,1,0,1
3,1,0,0,1
4,1,0,0,1
...,...,...,...,...
187,1,0,0,1
188,0,1,1,0
189,1,0,0,1
190,1,0,0,1


In [5]:
data = pd.concat([data, encoded_data], axis=1)

In [6]:
X = data.drop(['SEX', 'Diabet','Simplified_class'], axis=1)
y = data.Simplified_class

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [8]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [9]:
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances}).sort_values(by='Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
15487,ENSG00000197756,0.008013
11950,ENSG00000169919,0.007750
16731,ENSG00000231500,0.007156
8285,ENSG00000145494,0.006874
8621,ENSG00000148290,0.006665
...,...,...
6026,ENSG00000130733,0.000000
6025,ENSG00000130731,0.000000
6024,ENSG00000130726,0.000000
6023,ENSG00000130725,0.000000


In [10]:
top_features = feature_importance_df.head(1000)['Feature'].tolist()
X_selected = X_train[top_features]

In [11]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_selected, y_train)

In [13]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test[top_features])
print('Feature Selection Using Whole Data:')
print(classification_report(y_test, y_pred))

Feature Selection Using Whole Data:
                       precision    recall  f1-score   support

    Advanced_fibrosis       0.90      0.75      0.82        12
Non_advanced_Fibrosis       0.92      0.92      0.92        13
               Normal       0.81      0.93      0.87        14

             accuracy                           0.87        39
            macro avg       0.88      0.87      0.87        39
         weighted avg       0.88      0.87      0.87        39



In [15]:
with open('selected_features_whole_data.txt', 'w') as f:
    for x in top_features:
        f.write(x + '\n')

In [16]:
feature_importance_df[feature_importance_df.Feature == 'BMI_surg']

Unnamed: 0,Feature,Importance
17396,BMI_surg,0.0


In [17]:
feature_importance_df[feature_importance_df.Feature == 'Age']

Unnamed: 0,Feature,Importance
17397,Age,0.0


In [18]:
feature_importance_df[feature_importance_df.Feature == 'SEX_Female']

Unnamed: 0,Feature,Importance
17398,SEX_Female,0.0


In [19]:
feature_importance_df[feature_importance_df.Feature == 'SEX_Male']

Unnamed: 0,Feature,Importance
17399,SEX_Male,0.0


In [20]:
feature_importance_df[feature_importance_df.Feature == 'Diabet_Diabetic']

Unnamed: 0,Feature,Importance
17400,Diabet_Diabetic,0.0


In [21]:
feature_importance_df[feature_importance_df.Feature == 'Diabet_Non Diabetic']

Unnamed: 0,Feature,Importance
17401,Diabet_Non Diabetic,0.0


In [22]:
import pickle
with open('selected_features.pkl', 'wb') as f:
    pickle.dump(top_features, f)

In [23]:
with open('whole_data_model.pkl', 'wb') as f:
    pickle.dump(model, f)