In [1]:
import pandas as pd 
import numpy as np
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv('../data/expression_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000272414,ENSG00000272442,ENSG00000272658,ENSG00000272869,ENSG00000273079,ENSG00000273173,ENSG00000273259,ENSG00000273274,ENSG00000273294,Simplified_class
0,DLDR_0001,5.965571,1.612375,4.133821,4.111056,4.150662,2.975845,11.005488,4.405768,6.825329,...,-0.005377,1.546032,1.36546,0.69123,5.637483,-0.005377,1.6758,2.683536,-0.339797,Normal
1,DLDR_0002,5.741587,2.147793,4.120969,3.922234,3.732756,3.199989,10.8607,3.89535,6.453687,...,-0.515172,1.806756,0.93549,1.004202,5.975612,0.532134,1.555218,2.926666,0.435919,Normal
2,DLDR_0003,5.996891,0.418542,4.086129,3.964871,3.634637,2.949733,10.934025,4.282577,6.437658,...,-0.546693,1.127079,0.675699,0.943633,5.531648,-0.184123,2.391906,2.260662,-0.691083,Normal
3,DLDR_0004,5.551919,0.702492,4.11624,3.97835,3.853979,2.991061,10.760445,4.297722,6.71084,...,2.538993,1.301129,0.702492,0.43119,5.571799,-0.034474,1.639298,2.341393,0.096771,Normal
4,DLDR_0005,6.430237,1.215978,4.393797,4.018235,3.61422,2.83613,11.491427,4.405558,7.437655,...,-1.216981,0.16153,0.898496,0.085581,5.636848,-1.216981,1.97416,1.351861,-0.079478,Normal


In [3]:
data.drop(data.columns[0], axis=1, inplace=True)

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.drop('Simplified_class', axis=1),
                                                    data.Simplified_class, random_state=42, test_size=0.2)

In [7]:
model = LogisticRegression(max_iter=1000)
sfs = SequentialFeatureSelector(model, n_features_to_select="auto", direction='forward', cv=5, n_jobs=7)
sfs.fit(X_train, y_train)

KeyboardInterrupt: 

In [8]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [9]:
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': data.columns[:-1],
    'Importance': importances}).sort_values(by='Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
2438,ENSG00000101343,0.009644
14714,ENSG00000187257,0.008097
8592,ENSG00000147905,0.007552
8800,ENSG00000149743,0.006683
8938,ENSG00000151461,0.006595
...,...,...
6004,ENSG00000130643,0.000000
6002,ENSG00000130638,0.000000
6001,ENSG00000130635,0.000000
6000,ENSG00000130595,0.000000


In [26]:
top_features = feature_importance_df.head(1000)['Feature'].tolist()
X_selected = X_train[top_features]

In [39]:
model0 = LogisticRegression(max_iter=1000, C=0.1, solver='lbfgs')
model0.fit(X_selected, y_train)

In [40]:
from sklearn.metrics import classification_report
y_pred = model0.predict(X_test[top_features])
print('Feature Selection Using Only Expression Data:')
print(classification_report(y_test, y_pred))

Feature Selection Using Only Expression Data:
                       precision    recall  f1-score   support

    Advanced_fibrosis       0.86      0.50      0.63        12
Non_advanced_Fibrosis       0.92      0.92      0.92        13
               Normal       0.68      0.93      0.79        14

             accuracy                           0.79        39
            macro avg       0.82      0.78      0.78        39
         weighted avg       0.82      0.79      0.78        39



In [14]:
import random
random_selected_features = random.sample(list(X_train.columns), 1000)

In [34]:
model.fit(X_train[random_selected_features], y_train)
print('Random Feature Selection Using Expression Data First Try:')
print(classification_report(y_test, model.predict(X_test[random_selected_features])))

Random Feature Selection Using Expression Data First Try:
                       precision    recall  f1-score   support

    Advanced_fibrosis       1.00      0.83      0.91        12
Non_advanced_Fibrosis       0.93      1.00      0.96        13
               Normal       0.87      0.93      0.90        14

             accuracy                           0.92        39
            macro avg       0.93      0.92      0.92        39
         weighted avg       0.93      0.92      0.92        39



In [17]:
random_selected_features2 = random.sample(list(X_train.columns), 1000)
model2 = LogisticRegression(max_iter=1000)
model2.fit(X_train[random_selected_features2], y_train)
print(classification_report(y_test, model2.predict(X_test[random_selected_features2])))

                       precision    recall  f1-score   support

    Advanced_fibrosis       1.00      0.67      0.80        12
Non_advanced_Fibrosis       0.92      0.92      0.92        13
               Normal       0.72      0.93      0.81        14

             accuracy                           0.85        39
            macro avg       0.88      0.84      0.85        39
         weighted avg       0.87      0.85      0.85        39



In [18]:
random_selected_features3 = random.sample(list(X_train.columns), 1000)
model3 = LogisticRegression(max_iter=1000)
model3.fit(X_train[random_selected_features3], y_train)
print(classification_report(y_test, model3.predict(X_test[random_selected_features3])))

                       precision    recall  f1-score   support

    Advanced_fibrosis       0.82      0.75      0.78        12
Non_advanced_Fibrosis       0.92      0.85      0.88        13
               Normal       0.81      0.93      0.87        14

             accuracy                           0.85        39
            macro avg       0.85      0.84      0.84        39
         weighted avg       0.85      0.85      0.85        39



In [24]:
model = LogisticRegression(max_iter=1000)
sfs = SequentialFeatureSelector(model, n_features_to_select=100, direction='forward', cv=3, n_jobs=7)
sfs.fit(X_train[random_selected_features], y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

In [38]:
with open('selected_features_expression_data.txt', 'w') as f:
    for x in top_features:
        f.write(x + '\n')
with open('random_selected_features_expression_data.txt', 'w') as f:
    for x in random_selected_features:
        f.write(x + '\n')

In [42]:
import pickle 
with open('model0.pkl', 'wb') as f:
    pickle.dump(model0, f)
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('model2.pkl', 'wb') as f:
    pickle.dump(model2, f)
with open('model3.pkl', 'wb') as f:
    pickle.dump(model3, f)
