In [196]:
import os
import json
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [197]:
root = os.getcwd() # gives path to git clone
data_dir = os.path.join(root, "cellular_clarity")
data_filename = "12864_2023_9714_MOESM2_ESM.xlsx"
associated_motifs_filename = "associated_motifs.json"
genes_filenames = {
    "all": os.path.join("DEGs", "all_genes.csv"),
    "deg": os.path.join("DEGs", "deg_genes.csv"),
}

clusters_sheetname = "02. Epi DEGs (ctrls @ all tps)"
motifs_sheetname = "03. Motifs Identified Using AME"

In [198]:
# all_genes_dir = os.path.join(data_dir, genes_filenames["all"])
# genes = pd.read_csv(all_genes_dir)
# selected_genes = np.random.choice(genes["AGI"], 1000, replace=False)
#
# data = {}
# for gene in genes["AGI"]:
#     count = np.random.choice(4, replace=False, p=[0.5, 0.45, 0.045, 0.005])
#     data[gene] = list(np.random.choice(selected_genes, count, replace=False))
#
# # Write the dictionary to a JSON file
# associated_motifs_dir = os.path.join(data_dir, associated_motifs_filename)
# with open(associated_motifs_dir, "w") as file:
#     json.dump(data, file, indent=4) # indent for better readability
#
# print(f"Dictionary written to {associated_motifs_dir}")

Dictionary written to C:\Users\Mohsen\Desktop\Documents\Spring 2025\ECE759\git\ECE759_Project\cellular_clarity\associated_motifs.json


In [199]:
def load_json(filepath):
    """
    Loads JSON data from a file.

    Args:
        filepath (str): The path to the JSON file.

    Returns:
        dict or list: The JSON data as a Python dictionary or list, or None if an error occurs.
    """
    try:
        with open(filepath, 'r') as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"Error: File not found: {filepath}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in: {filepath}")
        return None

In [200]:
def associated_motifs_(dir, filename) -> dict:
    """
    loads and returns the json file containing a list of all relevant genes and the set of motifs associated to each, based on AME
    Expected format of this file is a dict that maps each gene to all the motifs present in their promoter via the related TF to each such motif
    associated_motifs = {
        "AT5G18090": [
            "AT3G26790",
            "AT4G33280",
        ],
        ...,
        "AT3G18990": [
            "AT5G60130",
        ]
    }
    """
    associated_motifs_dir = os.path.join(dir, filename)
    return load_json(associated_motifs_dir)

associated_motifs = associated_motifs_(data_dir, associated_motifs_filename)
associated_motifs

{'AT1G01010': [],
 'AT1G01020': ['AT5G03315'],
 'AT1G01030': ['AT3G02455'],
 'AT1G01040': ['AT3G53890'],
 'AT1G01046': ['AT5G47077'],
 'AT1G01050': [],
 'AT1G01060': [],
 'AT1G01070': ['AT1G08887'],
 'AT1G01080': ['AT1G18140', 'AT2G26710'],
 'AT1G01090': [],
 'AT1G01100': [],
 'AT1G01110': [],
 'AT1G01120': ['AT2G31240'],
 'AT1G01130': [],
 'AT1G01140': ['AT3G05000', 'AT3G63050'],
 'AT1G01150': ['AT4G02380'],
 'AT1G01160': ['AT5G08215'],
 'AT1G01170': [],
 'AT1G01180': ['AT4G05315'],
 'AT1G01183': [],
 'AT1G01190': ['AT3G51840'],
 'AT1G01200': [],
 'AT1G01210': [],
 'AT1G01220': [],
 'AT1G01225': [],
 'AT1G01230': [],
 'AT1G01240': [],
 'AT1G01250': ['AT1G08887'],
 'AT1G01260': [],
 'AT1G01270': ['AT3G49630'],
 'AT1G01280': ['AT5G03805'],
 'AT1G01290': [],
 'AT1G01300': ['AT1G70730'],
 'AT1G01305': ['AT4G18520'],
 'AT1G01310': [],
 'AT1G01320': ['AT5G62150'],
 'AT1G01335': [],
 'AT1G01340': [],
 'AT1G01350': ['AT1G69490'],
 'AT1G01355': [],
 'AT1G01360': ['AT3G45510'],
 'AT1G01370': ['

In [203]:
def target_genes_(dir, filenames, associated_motifs, cluster=None) -> (list, list):
    """
    Target genes are either those differentially expressed (if DEG is set to True) or others, and if cluster is stated, they should also be in a specific cluster
    """
    degs_dir = os.path.join(dir, filenames["deg"])
    degs = pd.read_csv(degs_dir)
    if cluster is not None:
        degs = degs[degs["Cluster"] == cluster]
    treat = sorted(list(degs["AGI"]))

    all_genes_dir = os.path.join(dir, filenames["all"])
    genes = pd.read_csv(all_genes_dir)
    control =  sorted(list(set(genes["AGI"]) - set(degs["AGI"])))

    freq = []
    relevant_motifs = set()
    for gene in set(treat).intersection(set(associated_motifs.keys())):
        relevant_motifs = relevant_motifs.union(set(associated_motifs[gene]))
    for gene in control:
        if gene in associated_motifs.keys():
            count = len(relevant_motifs.intersection(set(associated_motifs[gene])))
        else:
            count = 0
        freq.append(count)

    freq = pd.Series(index=control, data=freq)
    freq = freq[freq > 0]
    sum = np.sum(freq)
    indices = np.random.choice(len(freq), min([len(treat), len(freq)]), replace=False, p=freq/sum)
    control = sorted(list(freq.iloc[indices].index))

    return treat, control

treated_genes, control_genes = target_genes_(data_dir, genes_filenames, associated_motifs)
len(treated_genes), len(control_genes)

(2739, 2739)

In [220]:
def build_data(treated_genes, control_genes, associated_motifs, threshold=0.5):
    data = treated_genes.copy()
    data.extend(control_genes)
    relevant_motifs = set()
    for gene in set(treated_genes).intersection(set(associated_motifs.keys())):
        relevant_motifs = relevant_motifs.union(set(associated_motifs[gene]))
    relevant_motifs = sorted(list(relevant_motifs))
    data = pd.DataFrame(index=data, columns=relevant_motifs, data=0)
    for gene in data.index:
        if gene in associated_motifs.keys():
            for motif in associated_motifs[gene]:
                if motif in relevant_motifs:
                    data.at[gene, motif] = 1
    data["y"] = np.concatenate([np.ones(len(treated_genes)), np.zeros(len(control_genes))])
    data.sort_index(inplace=True)
    y = data["y"]
    y = np.where(abs(y) > threshold, 1.0, 0.0)
    data.drop("y", axis="columns", inplace=True)
    X = data.copy()
    return X, y

X, y = build_data(treated_genes, control_genes, associated_motifs)
X

Unnamed: 0,AT1G01320,AT1G01610,AT1G01760,AT1G01860,AT1G01880,AT1G02040,AT1G02500,AT1G02690,AT1G03080,AT1G03440,...,AT5G67040,AT5G67200,AT5G67300,AT5G67580,ATCG00570,ATCG00830,ATCG01130,ATMG00610,ATMG00650,ATMG01090
AT1G01140,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AT1G01180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AT1G01210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AT1G01390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AT1G01420,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ATMG01210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ATMG01390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ATMG01410,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ATMG09450,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [223]:
models = {
    "Logistic Regression":
        # Initialize Logistic Regression with Lasso (L1 regularization)
        LogisticRegression(penalty='l1', solver='saga', max_iter=1000, random_state=0),  # 'saga' solver works for L1 penalty
    "Gaussian Naive Bayes":
        GaussianNB(),

}

cv = KFold(n_splits=10, shuffle=True, random_state=0)

for key in models.keys():
    print(f"---------------\nmodel: {key}")

    # Evaluate the model using cross-validation
    scores = cross_val_score(models[key], X, y, cv=cv, scoring='accuracy')

    # Print the accuracy scores for each fold and the average accuracy score
    print("Accuracy scores for each fold:", scores)
    print("Average accuracy score:", scores.mean())

---------------
model: Logistic Regression
Accuracy scores for each fold: [0.6149635  0.57846715 0.5729927  0.6149635  0.58941606 0.62408759
 0.61861314 0.57481752 0.60329068 0.58866545]
Average accuracy score: 0.5980277292197653
---------------
model: Gaussian Naive Bayes
Accuracy scores for each fold: [0.65145985 0.59306569 0.54744526 0.58394161 0.58394161 0.61313869
 0.62591241 0.57481752 0.63619744 0.62888483]
Average accuracy score: 0.6038804894647647


In [229]:
# Set the number of experiment repetitions (k)
k = 20

# Store the coefficients for each iteration (if desired)
coefficients = []

# Run logistic regression with Lasso (L1) for k iterations
for _ in range(k):
    # Split the data into training and test sets (e.g., 80% training, 20% testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=_)

    # Initialize Logistic Regression with Lasso (L1 regularization)
    model = LogisticRegression(penalty='l1', solver='saga', max_iter=1000, random_state=_)  # 'saga' solver works for L1 penalty

    # Fit the model
    clf = model.fit(X_train, y_train)
    # clf.score(X_test, y_test)
    # clf.predict_proba(X_test)

    # Optionally, store the coefficients for this iteration
    coefficients.append(model.coef_.flatten())

    # Print the coefficients for this iteration (if desired)
    # print(f"Iteration {_+1} - Coefficients: {model.coef_}")

# Example of accessing coefficients from all iterations
result = pd.DataFrame(index=range(k), columns=X.columns, data=coefficients)
nonzero_cols = np.all(result.to_numpy() != 0, axis=0)
result = result.iloc[:, nonzero_cols]
result = result.mean(axis=0)
result

AT1G01880   -0.968716
AT1G02690   -1.237248
AT1G04263   -1.101517
AT1G04337   -1.779360
AT1G07723   -0.799884
               ...   
AT5G54430   -1.213843
AT5G54580   -0.669609
AT5G64940   -1.346704
AT5G67200   -1.243175
AT5G67300   -1.028165
Length: 125, dtype: float64