In [0]:
%load_ext autoreload
%autoreload 2

In [0]:
import os
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
warnings.filterwarnings('ignore')

from utils import *

In [0]:
# Read the species-level table through the Spark session, convert it to a
# pandas DataFrame and use the "barcode" column as the row index.
# NOTE(review): `spark` and `display` are not defined in any visible cell --
# presumably injected by the notebook runtime; confirm.
df_raw = (
    spark.table("onesource_eu_dev_rni.onebiome.mpa4_species_level_reformated")
    .toPandas()
    .set_index("barcode")
)
display(df_raw)

In [0]:
# cols = sorted([col for col in df.columns if col.startswith('Pseudomonas')])
# display(pd.DataFrame(cols, columns=['Pseudomonas_columns']))

In [0]:
# display(df.loc[df["Pseudomonas_b_oryzihabitans"] != df["Pseudomonas_b_oryzihabitans_b"], ['Pseudomonas_b_oryzihabitans', 'Pseudomonas_b_oryzihabitans_b']])



In [0]:
def clean_suffix_and_combine_columns(df):
    """Normalise column names and merge columns that collapse to the same name.

    Each column name is split on ``_``; parts of at most one character (for
    example the ``b`` in ``Pseudomonas_b_oryzihabitans_b``) are dropped and the
    remaining parts are joined with a single space. Columns that end up with
    the same cleaned name are summed element-wise.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column names are cleaned. It is left unmodified.
        Values are assumed to be numeric so that they can be summed --
        TODO confirm against the upstream table.

    Returns
    -------
    pandas.DataFrame
        New frame with the same row index as ``df`` and one column per
        distinct cleaned name, ordered by name.
    """
    # Step 1: build the cleaned names without touching the caller's frame.
    cleaned_names = [
        ' '.join(part for part in col.split('_') if len(part) > 1)
        for col in df.columns
    ]
    # set_axis returns a new object, so df.columns keeps its original labels.
    renamed = df.set_axis(cleaned_names, axis=1)
    # Step 2: sum columns sharing a name. The frame is transposed so the
    # grouping runs over rows, because groupby(axis=1) is deprecated since
    # pandas 2.1.
    return renamed.T.groupby(level=0).sum().T

# Build the cleaned species table from the raw table: short suffix parts are
# stripped from the column names and columns that collapse to the same name
# are summed.
df_cleaned = clean_suffix_and_combine_columns(df_raw)
print(df_cleaned.shape)  # (rows, columns) after merging same-named columns
display(df_cleaned)

In [0]:
# pandas is already imported in the imports cell at the top of the notebook;
# this re-import is redundant but harmless.
import pandas as pd

# Split every cleaned column name into its space-separated words.
col_splits = [col.split(' ') for col in df_cleaned.columns]

# Tally how many column names match each naming pattern.
counts = {
    'two_words': sum(len(parts) == 2 for parts in col_splits),
    'more_than_two_words': sum(len(parts) > 2 for parts in col_splits),
    'last_word_unknown': sum(parts[-1] == 'unknown' for parts in col_splits),
    # Prefix match: any last word that merely begins with "sp" is counted too.
    'last_word_startswith_sp': sum(parts[-1].startswith('sp') for parts in col_splits),
    # col[:1] rather than col[0]: an empty name (every part of the original
    # name had at most one character) is counted here instead of raising
    # IndexError.
    'not_startswith_letter': sum(not col[:1].isalpha() for col in df_cleaned.columns)
}

# One row per pattern, with a single "counts" column.
pd.DataFrame([counts], index=['counts']).transpose()


In [0]:
# Collect every cleaned column name made of more than two space-separated
# words (i.e. containing at least two spaces) and show them as a one-column table.
cols_more_than_two_words = [
    name for name in df_cleaned.columns if name.count(' ') >= 2
]
multi_word_table = pd.DataFrame(
    cols_more_than_two_words,
    columns=['columns_with_more_than_two_words'],
)
display(multi_word_table)

In [0]:
# Plot the value distribution of the cleaned species table.
# NOTE(review): plot_data_distribution is not defined in this notebook --
# presumably it comes from `from utils import *`, and the second argument is
# presumably the plot label; confirm against utils.
plot_data_distribution(df_cleaned, 'relative species abundance')

In [0]:
# Write the cleaned species table as a tab-separated file, including the
# header row and the row index. The path is relative to the working directory.
df_cleaned.to_csv('../data/dicaprio_mpa4_species.csv', sep='\t', header=True, index=True)

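**Aggregate to genus level** — sum the species columns that share the same first word of their name (the genus).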

In [0]:
# The first space-separated word of each cleaned column name is taken as the genus.
genus_labels = df_cleaned.columns.str.split(' ').str[0]
# Sum all species columns belonging to the same genus. The frame is transposed
# so the grouping runs over rows, because groupby(axis=1) is deprecated since
# pandas 2.1.
df_genus = df_cleaned.T.groupby(genus_labels).sum().T
print(df_genus.shape)
# Label typo fixed ("rebundance" -> "abundance").
plot_data_distribution(df_genus, 'genus relative abundance')

In [0]:
# Write the genus-level table as a tab-separated file, including the header
# row and the row index. The path is relative to the working directory.
df_genus.to_csv('../data/dicaprio_mpa4_genus.csv', sep='\t', header=True, index=True)

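**Log transformation** — apply log10 to the abundance tables after adding a small pseudo count, so that zero values stay finite.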

In [0]:
def abundance_table_log_transformation(df):
    """Log10-transform a table after adding a data-derived pseudo count.

    The pseudo count is ``min_positive ** 2 / max_value``, where
    ``min_positive`` is the smallest value greater than zero and ``max_value``
    the largest value anywhere in ``df``. In log10 space it therefore lies as
    far below ``min_positive`` as ``max_value`` lies above it. The pseudo count
    is printed before the transformed table is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table to transform. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        ``log10(df + pseudo_count)``, with the same index and columns as ``df``.

    Raises
    ------
    ValueError
        If ``df`` contains no value greater than zero, in which case no pseudo
        count can be derived and every output value would be NaN.
    """
    min_positive = df[df > 0].min().min()
    # min_positive is NaN when nothing is > 0; NaN fails this comparison too.
    if not min_positive > 0:
        raise ValueError(
            "cannot derive a pseudo count: the table has no value greater than zero"
        )
    pseudo_count = min_positive ** 2 / df.max().max()
    print("pseudo count added: ", pseudo_count)
    # Vectorised transform; DataFrame.applymap is deprecated since pandas 2.1.
    return np.log10(df + pseudo_count)

# Log10-transform the cleaned species table and plot the resulting distribution.
# NOTE(review): the label says "counts", while the same table is labelled
# 'relative species abundance' when plotted untransformed -- confirm the wording.
df_species_log = abundance_table_log_transformation(df_cleaned)
plot_data_distribution(df_species_log, 'Log10 species-level counts')

In [0]:
# Log10-transform the genus-level table (with its own pseudo count, derived
# from df_genus) and plot the resulting distribution.
df_genus_log = abundance_table_log_transformation(df_genus)
plot_data_distribution(df_genus_log, 'Log10 genus-level counts')

**Rescale to 0–1** — min–max scale each log-transformed table using its global minimum and maximum.

In [0]:
def log_abundance_table_rescale(df):
    """Min-max rescale a table using its global extremes.

    The smallest value anywhere in ``df`` maps to 0 and the largest to 1. A
    single minimum/maximum pair is used for the whole table, not one pair per
    column. There is no guard for a table whose minimum equals its maximum.
    """
    global_min = df.min().min()
    global_max = df.max().max()
    value_range = global_max - global_min
    shifted = df - global_min
    return shifted / value_range

# Rescale the log-transformed species table with its global min/max and plot
# the resulting distribution.
df_species_log_scaled = log_abundance_table_rescale(df_species_log)
plot_data_distribution(df_species_log_scaled, 'Rescaled_log10 species-level relative abundance')

In [0]:
# Rescale the log-transformed genus table with its global min/max and plot
# the resulting distribution.
df_genus_log_scaled = log_abundance_table_rescale(df_genus_log)
plot_data_distribution(df_genus_log_scaled, 'Rescaled_log10 genus-level relative abundance')

In [0]:
# Write both rescaled log tables (genus, then species) as tab-separated files,
# including the header row and the row index.
df_genus_log_scaled.to_csv('../data/dicaprio_mpa4_genus_log_scaled.csv', sep='\t', header=True, index=True)
df_species_log_scaled.to_csv('../data/dicaprio_mpa4_species_log_scaled.csv', sep='\t', header=True, index=True)


In [0]:
# Run the rescaled genus table through feature_reduction_pipeline.
# NOTE(review): the helper is not defined in this notebook (presumably from
# `from utils import *`). The argument names suggest that low-variance and
# highly correlated features are removed and that `fname` names an output
# artefact -- inferred from the names only; confirm against the helper.
df_genus_log_scaled_reduced = feature_reduction_pipeline(df_genus_log_scaled, variance_threshold=0.01, correlation_threshold=0.8, viz_corr_clusters=False, fname='dicaprio_mpa4_genus_log_scaled_reduced')

In [0]:
# Plot the value distribution of the reduced genus table.
# NOTE(review): the label says "Log10 ... counts", but the input derives from
# the rescaled log table (df_genus_log_scaled) -- confirm the wording.
plot_data_distribution(df_genus_log_scaled_reduced, 'Log10 Genus-level counts after feature reduction')

In [0]:
# Write the reduced genus table as a tab-separated file, including the header
# row and the row index. The path is relative to the working directory.
df_genus_log_scaled_reduced.to_csv('../data/dicaprio_mpa4_genus_log_scaled_reduced.csv', sep='\t', header=True, index=True)

# end