In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
import pickle

In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")
from utils import get_tr_data

In [3]:
# Load the two transcriptomics tables through the project helper `utils.get_tr_data`.
# NOTE(review): presumably the boolean flag selects the healthy cohort (True) versus
# the "sz" cohort (False) — inferred from the variable names only; confirm against
# the helper's definition.
healthy_data = get_tr_data(True)
sz_data = get_tr_data(False)

# Region prediction from transcriptomics data: random forest grid search

In [12]:
from sklearn.preprocessing import LabelEncoder

# NOTE(review): `common_genes` is not defined in any cell visible above this one
# (execution counts jump from 3 to 12). Confirm it is defined earlier in the
# notebook, otherwise this cell fails on a fresh kernel with a NameError.

# Integer-encode the region labels. The encoder is fitted on the labels of both
# cohorts so that a region present only in `sz_data` cannot raise at transform
# time; when every sz region also occurs in `healthy_data` the resulting codes
# are identical to fitting on `healthy_data` alone (classes are sorted).
encoder = LabelEncoder()
all_regions = np.concatenate(
    [np.asarray(healthy_data.region), np.asarray(sz_data.region)]
)
y = encoder.fit_transform(all_regions)

# Stack the expression values of both cohorts, healthy rows first (same order as `y`).
X = np.concatenate([healthy_data[common_genes].values, sz_data[common_genes].values])
n_splits = 5
kfold = KFold(n_splits=n_splits)
print(X.shape, y.shape)

# Standardize every column to zero mean / unit variance over the pooled samples.
means = X.mean(axis=0)
sigma = X.std(axis=0)
# A constant column has sigma == 0; dividing by it would fill the column with
# NaN/inf. Divide by 1 there instead, which leaves the centered column at 0.
safe_sigma = np.where(sigma == 0, 1.0, sigma)
X = (X - means) / safe_sigma

(280, 14177) (280,)


In [15]:
# Exhaustive hyper-parameter search for the random forest region classifier.
#
# Cross-validation uses one fold per sample (leave-one-out). The fold count is
# derived from X instead of being hard-coded, so the setup stays valid if the
# number of rows changes; for the 280-row matrix it equals the original 280.
n_samples = X.shape[0]

gridsearch = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=-1, random_state=1298643),
    cv=KFold(n_splits=n_samples),
    param_grid={
        "n_estimators": [200, 300], "criterion": ["gini", "entropy"],
        # np.arange(30, 50, 4) -> 30, 34, 38, 42, 46
        "max_depth": np.arange(30, 50, 4),
        "max_features": ["sqrt", "log2"],
    },
    # 2 * 2 * 5 * 2 = 40 parameter combinations, each fitted once per fold.
    scoring="accuracy",
    verbose=2,
)

In [None]:
# Run the full grid search on the standardized features and encoded region labels.
# `refit` is left at its default, so the best configuration is refitted on all of X, y.
gridsearch.fit(X, y)

In [17]:
# Mean cross-validated accuracy of the best parameter combination.
gridsearch.best_score_

0.2357142857142857

In [18]:
# Parameter combination that achieved `best_score_`.
gridsearch.best_params_

{'criterion': 'entropy',
 'max_depth': 30,
 'max_features': 'sqrt',
 'n_estimators': 300}

In [20]:
# Persist the whole fitted GridSearchCV object (search results plus the refitted
# estimator) so later analysis does not have to repeat the search.
# Only unpickle this file from a trusted location: loading a pickle executes code.
with open("../../data/intermediate/gridsearch_RF.pkl", mode="wb") as pkl_file:
    pickle.dump(gridsearch, pkl_file)