
# Log2fc Flag Characterization

## Purpose:

- Further characterize the flagged log2fc values, i.e. those whose sign differs from the sign expected when log2fc is computed directly from the group means.

In [None]:
import ast

import pandas as pd
# Show every row and every column when a DataFrame is displayed (no truncation)
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [None]:
# Parameters
# All three default to None and need to hold real paths before the cells below can run.
# NOTE(review): presumably overridden by a notebook runner that injects parameters — confirm.
DGE_TABLE_PATH = None # Path to DGE table (read with pd.read_csv using its default separator)
VV_LOG_PATH = None # Path to VV log file (read as a tab-separated table)
OUTPUT_PATH = None # Path to output file (written with DataFrame.to_csv)

In [None]:
def parse_tuple(string):
    """Parse the string representation of a tuple into a real tuple.

    Args:
        string: Text holding a Python literal, e.g. "('a', 'b')".

    Returns:
        The parsed literal converted to a tuple, or None when the text is not
        a valid Python literal or the literal cannot be turned into a tuple.
    """
    try:
        return tuple(ast.literal_eval(string))
    except (SyntaxError, ValueError, TypeError):
        # SyntaxError / ValueError: the text is not a valid Python literal
        # TypeError: the literal is valid but not iterable (e.g. "5"), so tuple() rejects it
        return None

# Read the tab-separated VV log into df_vv. The first column holds the string
# representation of a tuple, so it is converted with parse_tuple and used as the index.
df_vv = pd.read_csv(
    VV_LOG_PATH,
    sep="\t",
    index_col=0,
    converters={0: parse_tuple},
)
# Columns in df_vv: ['description', 'sample(s)', 'function', 'code', 'message', 'code_level', 'kwargs', 'config']

In [None]:
# Find the single flag whose message mentions log2fc and whose index tuple ends with "DGE Output"
# na=False treats rows with a missing message as non-matching, so the mask stays strictly boolean
mentions_log2fc = df_vv["message"].str.contains("log2fc", na=False)
# bool(x) guards both empty tuples and index entries that could not be parsed into a tuple (None)
is_dge_output = df_vv.index.map(lambda x: bool(x) and x[-1] == "DGE Output")
target_message = df_vv.loc[mentions_log2fc & is_dge_output, "message"]

if len(target_message) != 1:
    # Stop here: the following cells need `value`; carrying on would either fail with a
    # NameError or silently reuse a stale `value` left over from an earlier run
    raise ValueError(
        f"Error: One and only one row expected, but found {len(target_message)} rows."
    )

# Exactly one row was found. Use .iloc for positional access: the index holds tuples,
# so target_message[0] would depend on the deprecated integer-key positional fallback
value = target_message.iloc[0]
print("Value:", value)

In [None]:
# Pull the dictionary payload out of the flag message
# Everything from the first "{" to the end of the message is the serialized dictionary
dict_string = value[value.index("{"):]

# Parse the payload as a Python literal with ast (it is not run through a JSON parser)
data = ast.literal_eval(dict_string)

In [None]:
# Build a DataFrame from the parsed dictionary, then transpose it so each flagged gene is one row
df = pd.DataFrame(data).transpose()

In [None]:
SMALL_COUNTS_THRESHOLD = 20
original_count = len(df)

# Small-count rows are more likely to be heavily impacted by differences between the DESeq2
# log2fc method and the direct computation method, so they are removed here.
# A row is kept only when the sum of its Group.Mean columns is strictly greater than the threshold.
group_mean_sum = df.filter(regex="Group.Mean").sum(axis=1)
df = df.loc[group_mean_sum > SMALL_COUNTS_THRESHOLD]
removed_count = original_count - len(df)
print(f"Total number gene-rows: {original_count}")
print("Filtered out", removed_count, "rows with sum of Group.Mean columns less than", SMALL_COUNTS_THRESHOLD)

# Add the per-row average of the Group.Mean columns and order rows from highest to lowest average
df["Group.Mean_AVERAGE"] = df.filter(regex="Group.Mean").mean(axis=1)
df = df.sort_values(by="Group.Mean_AVERAGE", ascending=False)

In [None]:
# Load the DGE table and keep only the rows whose index labels match the flagged genes in df
# (df and the DGE table are expected to share index labels)
df_dge = pd.read_csv(DGE_TABLE_PATH)
df_dge = df_dge.loc[df.index]

# Column-bind the Group.Stdev columns of the DGE table onto df
stdev_cols = df_dge.filter(regex="Group.Stdev")
df = pd.concat([df, stdev_cols], axis=1)

# For each group, add the ratio Group.Stdev / Group.Mean as Stdev_to_Mean_ratio_<group suffix>
for col in df.filter(regex="Group.Mean").columns:
    if "_AVERAGE" in col:
        # Group.Mean_AVERAGE is a derived column, not a per-group mean
        continue
    # Replace only the first "Mean" (the one in the Group.Mean prefix) so that a group
    # name which itself contains "Mean" still maps to its real Group.Stdev column
    stdev_col = col.replace("Mean", "Stdev", 1)
    df["Stdev_to_Mean_ratio_" + col.replace("Group.Mean","")] = df[stdev_col] / df[col]

df


In [None]:
# Save the characterization table as CSV at OUTPUT_PATH (e.g. log2fc_flag_characterization.csv)
df.to_csv(path_or_buf=OUTPUT_PATH)