In [4]:
import pandas as pd
import os
from concurrent.futures import ProcessPoolExecutor
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(file_path):
    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('#'):
                header_line = line[1:].strip()  # Remove '#' and strip spaces
                columns = header_line.split('\t')  # Split the header by tab
                break
        else:
            raise ValueError(f"No header line found in {file_path}")
    
    # Read the data into a DataFrame
    df = pd.read_csv(file_path, sep='\t', header=None, names=columns, comment='#')
    
    # Ensure required columns are present
    if 'Me' not in df.columns or 'Un' not in df.columns:
        raise ValueError(f"'Me' or 'Un' columns not found in {file_path}")

    # Calculate methylation level and filter out rows where meth is 0 or 1
    df['meth'] = df['Me'] / (df['Me'] + df['Un'])
    df = df[(df['meth'] > 0) & (df['meth'] < 1)]

    return df

base_path = "/shares/grossniklaus.botinst.uzh/dkt/projects/meth1001_code_DKT/analysis/18_jsd_temp/output"

def process_single_file(file_path):
    processed_data = load_data(file_path)
    
    if "CHG" in file_path:
        return 'CHG', processed_data
    elif "CpG" in file_path:
        return 'CpG', processed_data
    elif "CHH" in file_path:
        return 'CHH', processed_data
    else:
        return 'Unknown', processed_data  

def process_files(directory):
    data_dict = {'CHG': [], 'CpG': [], 'CHH': []}
    
    dir_path = os.path.join(base_path, directory)
    
    if not os.path.isdir(dir_path):
        print(f"Directory {dir_path} does not exist. Skipping.")
        return data_dict
    
    file_paths = [os.path.join(dir_path, file_name) 
                  for file_name in os.listdir(dir_path) if file_name.endswith('.txt')]
    
    if not file_paths:
        print(f"No .txt files found in {dir_path}. Skipping.")
        return data_dict
    
    # Use ProcessPoolExecutor for parallel processing of files
    with ProcessPoolExecutor(max_workers=12) as executor:  
        results = executor.map(process_single_file, file_paths)
    
    # Aggregate the data by category (CHG, CpG, CHH)
    for category, processed_data in results:
        if category in data_dict:
            data_dict[category].append(processed_data)
        else:
            print(f"Unknown category '{category}' in file {file_path}. Skipping.")
    
    # Concatenate data for each category into a single DataFrame
    for key in data_dict:
        if data_dict[key]:  
            data_dict[key] = pd.concat(data_dict[key], ignore_index=True)
        else:
            data_dict[key] = pd.DataFrame()  # Empty DataFrame if no data
    
    return data_dict

# Define a function to prepare and plot the data as box plots
def prepare_data_for_box_plot(temps, contexts, process_files):
    all_data = []
    
    for temp in temps:
        for context in contexts:
            dir_name = f"{context}_{temp}"
            context_data = process_files(dir_name)
            
            if context_data.empty:
                print(f"No data found for {context} at {temp}.")
                continue
            
            if 'JSD_bit_' not in context_data.columns:
                print(f"'JSD_bit_' column not found in {dir_name}.")
                continue
            
            context_data = context_data.rename(columns={'JSD_bit_': 'JSD (bit)'})
            context_data['Temperature'] = temp
            context_data['Context'] = context
            
            all_data.append(context_data[['Temperature', 'Context', 'JSD (bit)']])
    
    return pd.concat(all_data, ignore_index=True)



In [5]:
def plot_box_plots(data, ax):
    sns.set(style='ticks')
    sns.set_context('talk')
    
    sns.boxplot(x='Temperature', y='JSD (bit)', hue='Context', data=data, ax=ax, palette='Set2')
    
    ax.set_title('JSD Distribution Across Temperatures and Contexts', fontsize=14)
    ax.set_xlabel('Temperature', fontsize=12)
    ax.set_ylabel('JSD (bit)', fontsize=12)
    ax.legend(title='Context')

def plot_all_temps():
    temps = ['10C', '16C', '22C']
    contexts = ['CHG', 'CpG', 'CHH']
    
    data = prepare_data_for_box_plot(temps, contexts, process_files)
    
    if data.empty:
        print("No data available for plotting.")
        return
    
    fig, ax = plt.subplots(figsize=(12, 8))
    plot_box_plots(data, ax)
    
    plt.tight_layout()
    plt.show()

# Example call to start the plotting
plot_all_temps()


AttributeError: 'dict' object has no attribute 'empty'