In [53]:
import numpy as np
import pandas as pd
import pickle
import os

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Location of the embeddings/metadata files. The directory can be overridden
# with the TCGA_GBM_EMBEDDINGS_DIR environment variable so the notebook is not
# tied to one machine; when the variable is unset the original path is used.
ROOT_DIR = os.environ.get(
    "TCGA_GBM_EMBEDDINGS_DIR",
    "/Users/Tim/data/tcga_gbm/sample_embeddings",
)
# Files read by the cells below, all resolved relative to ROOT_DIR.
METADATA_PATH = os.path.join(ROOT_DIR, "full_slides_metadata.csv")
TRAIN_PATH = os.path.join(ROOT_DIR, "aggr_train_embeddings.pickle")
VAL_PATH = os.path.join(ROOT_DIR, "aggr_val_embeddings.pickle")

In [4]:
# Read the metadata CSV into a DataFrame; later cells use its
# "case_id" and "label" columns.
metadata = pd.read_csv(filepath_or_buffer=METADATA_PATH)

In [12]:
# Reduce the metadata to a single label per case_id by taking the first
# label pandas reports within each case_id group.
case2label = (
    metadata
    .groupby(by="case_id")["label"]
    .first()
)
case2label.head()

case_id
TCGA-02-0055    0
TCGA-02-0057    1
TCGA-02-0059    0
TCGA-02-0075    1
TCGA-02-0085    1
Name: label, dtype: int64

In [18]:
def load_embeddings(file_path):
    """Deserialize and return the object stored in the pickle file at file_path.

    Note: unpickling can execute arbitrary code, so only point this at
    files that come from a trusted source.
    """
    with open(file_path, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload

In [71]:
def create_X_y(file_path, labels=None):
    """Build a feature matrix and a label vector from a pickled embeddings file.

    Parameters
    ----------
    file_path : str
        Path handed to ``load_embeddings``. The loaded object is used as a
        mapping ``case_id -> entry`` where ``entry["mean"]`` holds the
        features for that case.
    labels : mapping or pandas.Series, optional
        Lookup from case_id to class label. When omitted, the notebook-level
        ``case2label`` series is used (the original behaviour).

    Returns
    -------
    X : pandas.DataFrame
        Built with ``DataFrame.from_dict(..., orient="index")`` from each
        case's ``"mean"`` entry, so case ids form the index.
    y : numpy.ndarray
        float64 array holding one label per case, in the iteration order of
        the loaded embeddings mapping.

    Raises
    ------
    KeyError
        If one or more case ids in the embeddings have no entry in ``labels``.
    """
    if labels is None:
        labels = case2label

    embeddings = load_embeddings(file_path)

    # Report every unlabeled case at once instead of failing on the first
    # lookup with a bare key.
    missing = [case_id for case_id in embeddings if case_id not in labels]
    if missing:
        raise KeyError(
            "no label found for {} case id(s), e.g. {}".format(
                len(missing), missing[:5]
            )
        )

    features = {case_id: vals["mean"] for case_id, vals in embeddings.items()}
    X = pd.DataFrame.from_dict(features, orient="index")
    # dtype=float keeps y identical to the former np.zeros-based vector.
    y = np.array([labels[case_id] for case_id in embeddings], dtype=float)
    return X, y

In [72]:
# Build the (features, labels) pair for the training split, then for the
# validation split, from their respective pickle files.
X_train, y_train = create_X_y(file_path=TRAIN_PATH)
X_val, y_val = create_X_y(file_path=VAL_PATH)

In [73]:
# Logistic-regression baseline fitted on the training split.
# fit() returns the estimator itself, so `logreg` is the fitted model.
logreg = LogisticRegression(
    solver="lbfgs",
    random_state=0,
).fit(X_train, y_train)

In [74]:
# Mean accuracy of the logistic-regression model on the validation split.
logreg.score(X=X_val, y=y_val)

0.5555555555555556

In [75]:
# Shallow random-forest baseline (100 trees, depth capped at 3), seeded for
# repeatable results and fitted on the training split.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=0,
)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [76]:
# Mean accuracy of the random-forest model on the validation split.
rf.score(X=X_val, y=y_val)

0.5555555555555556