In [1]:
from sklearn.utils import _safe_indexing
import numpy as np
def _shuffle(y, groups, random_state):
    """Return ``y`` re-indexed by a random permutation, optionally per group.

    Parameters
    ----------
    y : indexable
        Values to shuffle; handed to ``_safe_indexing`` together with the
        permuted positions.
    groups : array-like or None
        Group label of each sample. When ``None`` all positions are permuted
        together; otherwise positions are permuted only among samples that
        carry the same label.
    random_state : object exposing ``permutation``
        Source of randomness (for example ``numpy.random.RandomState``).

    Returns
    -------
    The value of ``_safe_indexing(y, indices)`` for the permuted ``indices``.
    """
    # No grouping: a single permutation over every position of y.
    if groups is None:
        return _safe_indexing(y, random_state.permutation(len(y)))

    # Start from the identity ordering and permute it one group at a time,
    # so a position is only ever exchanged with positions of the same group.
    order = np.arange(len(groups))
    for label in np.unique(groups):
        members = groups == label
        order[members] = random_state.permutation(order[members])
    return _safe_indexing(y, order)

In [2]:
from select_parameter_utils import group_shuffle_split, OOB_Search, run_RIT, get_rit_counts
from math import ceil
from copy import deepcopy
import joblib


def run_ITF(X, y, load_configure, rit_params, train_model, group_labels):
    """Fit on a grouped train split and return interaction stability scores.

    Steps, in order:

    1. Split ``X``/``y`` with ``group_shuffle_split`` (``test_size=0.2``,
       seeded with ``load_configure['default_seed']``).
    2. Fit ``OOB_Search`` on the training split with the single grid point
       ``{"K": [1]}`` and read the first value returned by
       ``extract_oob_result``.
    3. Call ``run_RIT`` ``rit_params['n_bootstrapped']`` times in parallel.
    4. Gather the interactions returned by ``get_rit_counts`` for every
       bootstrap and turn their counts into stability scores.

    Parameters
    ----------
    X, y
        Features and targets forwarded to ``group_shuffle_split``.
    load_configure : mapping
        Must provide the key ``'default_seed'``.
    rit_params : dict
        Must contain ``'propn_n_samples'`` and ``'n_bootstrapped'``. The whole
        dict is also forwarded to ``run_RIT`` as keyword arguments.
    train_model
        Estimator passed to ``OOB_Search``; each ``run_RIT`` call receives its
        own deep copy.
    group_labels
        Group labels forwarded to ``group_shuffle_split``.

    Returns
    -------
    dict
        Maps every interaction reported by ``get_rit_counts`` to its number of
        occurrences across all bootstraps divided by
        ``rit_params['n_bootstrapped']``.
    """
    # The split indices are returned by the helper but not needed here.
    (X_train_raw_df, X_test_raw_df,
     y_train_tran_df, y_test_tran_df,
     _train_index, _test_index) = group_shuffle_split(
        X,
        y,
        seed=load_configure['default_seed'],
        test_size=0.2,
        groups=group_labels)

    oob_gridsearch = OOB_Search(n_jobs=1,
                                estimator=train_model,
                                param_grid={"K": [1]})
    oob_gridsearch.fit(X_train=X_train_raw_df, y_train=y_train_tran_df)

    # Only the first returned value is used below; the second is discarded.
    all_rf_weights, _cv_results = oob_gridsearch.extract_oob_result(
        oob_gridsearch.output_array, oob_gridsearch.params_iterable)

    n_bootstrapped = rit_params['n_bootstrapped']

    # Convert the bootstrap resampling proportion to the number
    # of rows to resample from the training data
    n_samples = ceil(rit_params['propn_n_samples'] * X_train_raw_df.shape[0])

    # NOTE(review): every call receives the same arguments, so any difference
    # between bootstraps has to come from inside run_RIT -- confirm.
    output = joblib.Parallel(n_jobs=3)(
        joblib.delayed(run_RIT)(deepcopy(train_model),
                                X_train_raw_df,
                                y_train_tran_df.flatten(),
                                X_test_raw_df,
                                y_test_tran_df.flatten(),
                                n_samples,
                                all_rf_weights[0],
                                **rit_params)
        for _ in range(n_bootstrapped))
    all_rit_bootstrap_output = {
        'rf_bootstrap{}'.format(i): result for i, result in enumerate(output)
    }

    # Map "Feature1".."FeatureN" to column positions 0..N-1, with N taken from
    # the training matrix instead of a fixed 20 so other widths are covered.
    n_features = X_train_raw_df.shape[1]
    feature_dict = {f"Feature{i + 1}": i for i in range(n_features)}

    # get each bootstrap sample's interaction terms
    rit_counts = joblib.Parallel(n_jobs=3)(
        joblib.delayed(get_rit_counts)(b,
                                       all_rit_bootstrap_output,
                                       feature_dict)
        for b in range(n_bootstrapped))

    # Count each interaction in a single pass (first-seen order is kept).
    occurrences = {}
    for sublist in rit_counts:
        for item in sublist:
            occurrences[item] = occurrences.get(item, 0) + 1

    stability_score = {
        interaction: count / n_bootstrapped
        for interaction, count in occurrences.items()
    }

    return stability_score


In [10]:
import numpy as np
from sklearn.datasets import make_regression
import model as im 
import yaml
import sys
import joblib

# Synthetic regression problem with 60 samples and 20 features.
# NOTE(review): no random_state is passed, so X and y change on every run.
X, y = make_regression(n_samples=60, n_features=20)

# Generate group labels: three groups of 20 consecutive samples each.
group_labels = np.repeat(np.arange(3), 20)


parameters = {'max_depth': 2, 'n_estimators': 2, 'oob_score': True}

# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this file exists.
configure_file = "/exeh_4/yuping/Epistasis_Interaction/02_Explore_interaction_term/model_configure/IRF_RF.yaml"
try:
    with open(configure_file) as infile:
        load_configure = yaml.safe_load(infile)
except Exception as err:
    # Surface the underlying cause (missing file, YAML syntax error, ...)
    # instead of hiding it behind a generic message.
    sys.stderr.write(f"Please specify valid yaml file. ({err})\n")
    sys.exit(1)


# Each model parameter is stored as a list in the YAML; use its first entry.
rit_params = {
    'n_intersection_tree': load_configure['model_params']['n_intersection_tree'][0],
    'max_depth': load_configure['model_params']['max_depth'][0],
    'num_splits': load_configure['model_params']['num_splits'][0],
    'n_bootstrapped': load_configure['model_params']['n_bootstrapped'][0],
    'propn_n_samples': load_configure['model_params']['propn_n_samples'][0]
}


train_model = im.IterativeRFRegression(rseed=1, **parameters)


# Number of label permutations to evaluate.
N_PERMUTATIONS = 2

# One generator shared by all permutations. The arguments of
# joblib.delayed(...) are evaluated in this process while the generator
# expression is consumed, so each _shuffle call advances the same stream and
# yields a different (but reproducible) permutation of y. Building a new
# RandomState with the same seed per permutation would repeat one shuffle.
permutation_rng = np.random.RandomState(load_configure['default_seed'])

permutation_scores = joblib.Parallel(n_jobs=2)(
        joblib.delayed(run_ITF)(X,
                                _shuffle(y, group_labels, permutation_rng),
                                load_configure,
                                rit_params,
                                train_model,
                                group_labels)
        for _ in range(N_PERMUTATIONS)
)

print(permutation_scores)

[{'Feature3_Feature4': 0.3, 'Feature2_Feature3': 0.1, 'Feature15_Feature18': 0.1, 'Feature3_Feature18': 0.2, 'Feature3_Feature15': 0.1, 'Feature4_Feature15': 0.2}, {'Feature4_Feature15': 0.2, 'Feature3_Feature18': 0.3, 'Feature3_Feature4': 0.3, 'Feature4_Feature18': 0.1, 'Feature2_Feature3': 0.1}]


In [4]:
import pandas as pd

def compare_dicts(observed_data, perm_data):
    """Align every permutation dictionary to the keys of ``observed_data``.

    Parameters
    ----------
    observed_data : dict
        Reference mapping; only its keys (and their order) are used.
    perm_data : iterable of dict
        Dictionaries to align. They are not modified.

    Returns
    -------
    list of dict
        One new dictionary per element of ``perm_data``. Each contains exactly
        the keys of ``observed_data``, in that order: values are taken from
        the corresponding input dictionary, keys it lacks are filled with
        ``0``, and keys absent from ``observed_data`` are dropped.
    """
    # Freeze the key order once so every output dictionary is laid out
    # identically and deterministically.
    reference_keys = list(observed_data)

    return [
        {key: other_dict.get(key, 0) for key in reference_keys}
        for other_dict in perm_data
    ]

# Toy inputs for compare_dicts: one observed mapping and two permutation
# mappings that carry extra keys ('d', 'c') or miss an observed key ('b').
Observed_data = {'a': 4, 'b': 5}
Perm_data = [
    {'a': 4, 'b': 5, 'd': 6},
    {'a': 7, 'c': 8},
]

# Restrict every permutation mapping to the observed keys.
modified_dicts = compare_dicts(Observed_data, Perm_data)
print(modified_dicts)

# Column-oriented view: one list of values per observed key.
data = {
    column: [row.get(column, 0) for row in modified_dicts]
    for column in Observed_data
}
df = pd.DataFrame(data)

print(df)
print(modified_dicts)

[{'a': 4, 'b': 5}, {'a': 7, 'b': 0}]
   a  b
0  4  5
1  7  0
[{'a': 4, 'b': 5}, {'a': 7, 'b': 0}]


In [9]:
# Observed stability scores to compare against the permutation results.
# NOTE(review): these values are hard-coded here rather than produced by a
# run_ITF call on the unshuffled y -- confirm they match the intended run.
observed_data = {'Feature11_Feature18': 0.4, 'Feature2_Feature12': 0.3}

# Align the permutation results to the observed keys once, up front, rather
# than once per key.
aligned_scores = compare_dicts(observed_data, permutation_scores)

# One column per observed interaction, one row per permutation.
data = {key: [d.get(key, 0) for d in aligned_scores]
        for key in observed_data.keys()}
df = pd.DataFrame(data)

# Permutation p-value per interaction:
#   (number of permuted scores >= observed score, plus 1) / (permutations + 1)
n_permutations = len(df)
feature_pval = pd.DataFrame(
    {'Pval': [((df[name] >= observed_data[name]).sum() + 1) / (n_permutations + 1)
              for name in df.columns]},
    index=df.columns.to_list(),
)


print(feature_pval)

print(data)


                         Pval
Feature11_Feature18  0.333333
Feature2_Feature12   0.333333
{'Feature11_Feature18': [0, 0], 'Feature2_Feature12': [0, 0]}
