# Split Feature
## Create a CSV file with the indices of the training data and testing data

### Import libraries

In [3]:
from pathlib import Path
from sklearn.model_selection import train_test_split 
import pandas as pd
import sqlite3
import numpy as np

### Specify tissue, datasets, test ratio, save path

In [4]:
# Tissue sub-directory that holds the predictor features, and the name of the
# phenotype being split (shown in the log message and in the output file name).
weight_tissue = "Brain_Amygdala"
phen_name = "CWR_Total"

# Root directory of the preprocessing results. The phenotype database is read
# directly from it; the predictor file is read from its <weight_tissue> sub-directory.
data_dir = "/exeh_4/yuping/Epistasis_Interaction/01_Preprocessing/results"
phen_df_name = "2024-03-27T11:03:04.174838_phenotype_residualed.db" 
pred_df_name = "2024-03-25T11:01:20.810692_predictor_feature.csv"

# Seed and held-out fraction handed to train_test_split.
default_seed = 42
test_size = 0.2

In [5]:
def get_features_data(load_path):
    """
    Load the predictor features from a tab-separated file.

    The "MotherEducation" and "FatherEducation" columns are removed from the
    frame that is returned; every other column is kept as read.
    """
    columns_to_skip = ["MotherEducation", "FatherEducation"]
    return pd.read_csv(load_path, sep="\t").drop(columns=columns_to_skip)

def get_phen_data(load_path):
    """
    Load the phenotype table from a SQLite database.

    Parameters
    ----------
    load_path : str or path-like
        Location of the SQLite database file that contains a "Phenotype" table.

    Returns
    -------
    pandas.DataFrame
        All rows and columns of the "Phenotype" table. No column is promoted
        to the index, so the frame carries a default integer index.
    """
    # Read data from the "Phenotype" table into a DataFrame
    query = "SELECT * FROM Phenotype"
    # Connect to the SQLite database
    conn = sqlite3.connect(load_path)
    try:
        phen_data = pd.read_sql_query(query, conn, index_col=None)
    finally:
        # Always release the database handle, even when the query raises.
        conn.close()

    return phen_data


# load x (features) dataframes
feature_path = Path(data_dir) / weight_tissue / pred_df_name
feature_data = get_features_data(feature_path)
# load y (label) dataframes
phen_path = Path(data_dir) / phen_df_name
phen_data = get_phen_data(phen_path)

print(f"Splitting data for tissue : {weight_tissue} and phenotype : {phen_name}")
# Randomly split the row positions of the phenotype table into training and
# testing sets. The phenotype values are split alongside the positions only so
# that the size of each set can be reported below.
indices = np.arange(len(phen_data))
indices_train, indices_test, training_data, testing_data = train_test_split(
    indices,
    phen_data[phen_name].values,  # use the configured phenotype, not a hard-coded column
    random_state=default_seed,
    test_size=test_size,
)

# translate row positions into the index labels of the phenotype table
train_indices = phen_data.index[indices_train]
test_indices = phen_data.index[indices_test]

print(f"Training data has shape: {training_data.shape}")
print(f"Testing data has shape: {testing_data.shape}")

# Build one row per sample holding its index label and the split it belongs to
# (training rows first, then testing rows), then sort by labeled data index.
# Note: this is a plain random split; no stratification is applied.
index_data = pd.DataFrame(
    {
        "labeled_data_index": np.concatenate([train_indices, test_indices]),
        "label": ["train"] * len(train_indices) + ["test"] * len(test_indices),
    }
).sort_values(["labeled_data_index"])

# save indexes as a tab-separated file next to the tissue's predictor features
split_save_path = Path(data_dir) / weight_tissue / f"Data_splits_index_{phen_name}.csv"
index_data.to_csv(split_save_path, sep="\t")
print("Saved index data\n")

Splitting data for tissue : Brain_Amygdala and phenotype : CWR_Total
Training data has shape: (836,)
Testing data has shape: (210,)
Saved index data



In [10]:
def get_dataset(feature_data, phen_data, data_split_indexes, label):
    """
    Pick the rows of the features and the phenotype that belong to one split.

    `data_split_indexes` is expected to provide a "label" column and a
    "labeled_data_index" column; rows whose "label" equals `label` supply the
    index labels that are looked up (via `.loc`) in both `feature_data` and
    `phen_data`.

    Returns a tuple `(data, phenotype, indexes)`: the selected features, the
    selected phenotype entries, and the index labels used for the selection.
    """
    in_split = data_split_indexes["label"] == label
    indexes = data_split_indexes.loc[in_split, "labeled_data_index"]

    return feature_data.loc[indexes], phen_data.loc[indexes], indexes

In [15]:
from pathlib import Path
from select_parameter_utils import get_dataset
import numpy as np
import pandas as pd


# set numpy seed to make random operations reproducible
np.random.seed(0)

# load training data from indexes and features dataframe
# The split file location is derived from the notebook configuration
# (data_dir / weight_tissue / phen_name) instead of repeating the literal path.
data_split_path = Path(data_dir) / weight_tissue / f"Data_splits_index_{phen_name}.csv"
data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)
# NOTE(review): get_dataset here is the one imported from select_parameter_utils
# at the top of this cell, which shadows the notebook-level definition. The
# notebook-level version returns three values, so this two-value unpacking
# presumably relies on the imported version returning two -- confirm.
X_train_raw_df, y_train_tran_df = get_dataset(feature_data, phen_data[phen_name], data_split_indexes, "train")

print(y_train_tran_df.values)

[ 1.07156243e+01 -1.34160899e+01  2.15102909e+01  1.71623369e+01
 -3.94221816e+00 -1.06927974e+01  2.60751551e+01  9.24347190e+00
 -4.71268385e+00 -5.81993581e+00 -6.81003482e+00  1.83625962e+01
  1.47231508e+01  2.79316276e+00  1.90729123e+01 -2.50484108e+01
 -7.86738477e-02  2.21160787e+01  1.95296100e+01  7.36185141e+00
 -2.04407404e+01 -3.20621878e+01 -1.02800096e+01 -3.18624436e+01
 -4.79214949e+00  3.94759453e+00  1.96739651e+00  2.47298557e-02
 -1.01662161e+00 -4.87929394e+00 -1.49592187e+01 -1.15281576e+00
 -1.33377066e+01 -1.13575085e+01 -2.41755225e+01 -3.11901604e+01
  1.13458057e+01 -1.50749931e+01 -1.66482385e+01 -2.79084496e+01
 -9.07096278e-01 -1.81602278e+01  2.80368908e+00  2.48999369e+01
  9.20096609e+00 -9.61091510e+00 -1.51596868e+01 -2.20903799e+01
 -8.59810817e+00 -3.02851985e+00  6.42197068e+00 -1.77237944e+01
 -2.13248758e+01 -3.45206060e+01  8.62750107e+00 -3.11377086e+01
 -2.00780701e+01 -2.20314876e+01  3.07118008e+01  1.69135165e+01
  4.34474110e+00 -2.11361