In [2]:
import pandas as pd

import qiime2
from qiime2 import Artifact, Metadata
from qiime2.plugins.feature_table.methods import rarefy
from qiime2.plugins.feature_table.visualizers import summarize
from qiime2.plugins.diversity.pipelines import beta

from scipy import interpolate 

In [4]:
# Switch to the user's home directory (bare %cd); the relative 'Desktop/...' paths
# used by the loading cell below are resolved against it.
%cd

/Users/zkarwowska


In [5]:
# Load the raw inputs: the feature table (tab-separated, first line of the file skipped,
# first column used as the index) and the tab-separated sample metadata.
feature_table_path = 'Desktop/SONATA_VAR_MODEL/VAR_MODELS/DATA/raw_data/550_data/raw_data_from_qiita/550_reference.tsv'
metadata_path = 'Desktop/SONATA_VAR_MODEL/VAR_MODELS/DATA/raw_data/550_data/raw_data_from_qiita/550_metadata.txt'

feature_table = pd.read_csv(feature_table_path, sep='\t', skiprows=[0], index_col=[0])
metadata = pd.read_csv(metadata_path, sep='\t')

In [6]:
# Keep only samples that are not flagged as mislabeled and that come from feces.
is_correctly_labeled = metadata['mislabeled'] == False
is_feces = metadata['body_site'] == 'UBERON:feces'
metadata = metadata[is_correctly_labeled & is_feces]

# Sample names of the male and of the female subject.
male_samples = metadata.loc[metadata['sex'] == 'male', 'sample_name']
female_samples = metadata.loc[metadata['sex'] == 'female', 'sample_name']

# Lookup: sample name -> days since the experiment started.
sample_to_day_dict = dict(zip(metadata['sample_name'], metadata['days_since_experiment_start']))

# Pick each subject's sample columns and relabel them with the sampling day.
male_df = feature_table[male_samples].rename(columns=sample_to_day_dict)
female_df = feature_table[female_samples].rename(columns=sample_to_day_dict)

#save
#male_df.to_csv('Desktop/microbiome_predicion/1.data_overview/raw_male_feces.csv', sep = '\t')
#female_df.to_csv('Desktop/microbiome_predicion/1.data_overview/raw_female_feces.csv', sep = '\t')

### Interpolate missing timepoints using nearest-neighbour interpolation



In [7]:
# Work on the male data, transposed so that rows are days and columns are features.
df = male_df.T

In [8]:
def nearest_interp(col, masked_df):
    """Fill the missing values of one column by nearest-neighbour interpolation.

    Parameters
    ----------
    col : hashable
        Label of the column of ``masked_df`` to interpolate.
    masked_df : pandas.DataFrame
        Frame whose index holds the numeric interpolation nodes (here: days) and
        whose missing observations in ``col`` are NaN.

    Returns
    -------
    pandas.DataFrame
        A single-column frame on ``masked_df.index`` whose column is named
        ``str(col)``. Observed values are kept as they are; every NaN is replaced
        by the value of the closest observed node. Missing nodes that lie before
        the first or after the last observed node take the value of that first /
        last observation instead of raising. A column with no observation at all
        is returned unchanged (all NaN).
    """
    series = masked_df[col]

    # Observed nodes, ordered along the interpolation axis so that the first and
    # last entries are the boundary observations.
    observed = series.dropna().sort_index()
    interpolated_nodes = series[series.isna()].index.tolist()  # this is being interpolated

    filled = series.copy()

    if interpolated_nodes and len(observed) == 1:
        # Only one observation exists, so it is the nearest neighbour of every node.
        filled.loc[interpolated_nodes] = observed.iloc[0]
    elif interpolated_nodes and len(observed) > 1:
        f = interpolate.interp1d(observed.index,
                                 observed.values,
                                 kind='nearest',
                                 # Outside the observed range fall back to the
                                 # boundary observations rather than raising.
                                 bounds_error=False,
                                 fill_value=(observed.iloc[0], observed.iloc[-1]))
        filled.loc[interpolated_nodes] = f(interpolated_nodes)
    # With no observed value there is nothing to copy from: the column stays NaN.

    return filled.to_frame(name=str(col))

In [9]:
#define subject specific variables
LAST_DAY = 418  # we cut the end of time series as it contains too many missing timepoints

df.index = df.index.astype(int)
# Sort by day before the label slice: on an unsorted index .loc[a:b] cuts positionally
# between the two labels (and raises KeyError when a bound is absent) instead of
# selecting the days in [a, b].
df = df.sort_index().loc[0:LAST_DAY]
full_df = list(range(0, LAST_DAY + 1))
# sorted() keeps the order of the missing days deterministic (a bare set has no order)
missing_tpoints = sorted(set(full_df) - set(df.index))

#add missing rows to our dataframe (they are filled with NaN)
df_with_missing_tpoints = df.reindex(df.index.union(missing_tpoints))

#dataframe with only missing timepoints
masked_df = df_with_missing_tpoints.loc[missing_tpoints]

#number of missing timepoints = NaN count of the first column
#(.iloc makes the lookup positional regardless of the column labels)
df_with_missing_tpoints.isna().sum().iloc[0]

91

In [None]:
# Interpolate every feature column on its own, then join the single-column frames side by side.
interpolated_timepoints = [
    nearest_interp(col, df_with_missing_tpoints)
    for col in df_with_missing_tpoints.columns
]
nearest_df = pd.concat(interpolated_timepoints, axis=1)

# Keep only the columns that differ from 0 in at least one timepoint.
has_nonzero_value = nearest_df.ne(0).any(axis=0)
nearest_df = nearest_df.loc[:, has_nonzero_value]

In [None]:
# (number of timepoints, number of features) left after interpolation and
# after dropping the all-zero columns
nearest_df.shape

In [None]:
#save interpolated data as a tab-separated table (features as rows, timepoints as columns)
# The feature-ID column is labelled '#OTU ID', the same label this notebook puts on its
# final rarefied export, so the header row of the file read by `biom convert` names
# its first column instead of leaving it blank.
nearest_df.T.rename_axis('#OTU ID').to_csv('male_feces_raw_interpolated.csv', sep = '\t')

### rarefy

In [None]:
# Shell out to the biom CLI: convert the tab-separated table written above into an
# HDF5 .biom file typed as an "OTU table" (input for the QIIME 2 import below).
!biom convert -i male_feces_raw_interpolated.csv -o male_feces_raw_interpolated.biom --table-type="OTU table" --to-hdf5

In [None]:
# Load the converted .biom file as a QIIME 2 artifact of type FeatureTable[Frequency].
unrarefied_table = Artifact.import_data("FeatureTable[Frequency]", 'male_feces_raw_interpolated.biom')

In [None]:
# Build the feature-table summary and show its visualization as the cell output.
# NOTE(review): presumably inspected to choose the sampling depth used for rarefaction below - confirm.
SUMMARY = summarize(unrarefied_table)
SUMMARY.visualization

In [None]:
#rarefy
# Rarefy the imported table to a sampling depth of 16000 and view the result as a
# pandas DataFrame for the re-interpolation step.
# NOTE(review): samples below the sampling depth are presumably dropped here, which is
# why missing timepoints are re-inserted afterwards - confirm against the q2-feature-table docs.
# NOTE(review): no random seed is set in this notebook, so the rarefied counts may
# differ between runs - confirm whether that matters downstream.
rarefy_result = rarefy(table=unrarefied_table, sampling_depth=16000)
rarefied_table = rarefy_result.rarefied_table
rarefied_table_df = rarefied_table.view(pd.DataFrame)

### reinterpolate missing timepoints that were lost after rarefaction

In [None]:
# Re-insert, as empty (NaN) rows, the days 0-418 that are absent from the rarefied table.
rarefied_table_df.index = rarefied_table_df.index.astype(int)
all_timepoints = set(range(0, 419))
missing_timepoints_after_rarefaction = list(all_timepoints - set(rarefied_table_df.index))
complete_index = rarefied_table_df.index.union(missing_timepoints_after_rarefaction)
rarefied_df_with_missing_tpoints = rarefied_table_df.reindex(complete_index)

# Fill the gaps feature by feature and join the single-column results.
interpolated_timepoints = [
    nearest_interp(col, rarefied_df_with_missing_tpoints)
    for col in rarefied_df_with_missing_tpoints.columns
]
nearest_rarefied_interpolated_df = pd.concat(interpolated_timepoints, axis=1)
nearest_rarefied_interpolated_df.index = nearest_rarefied_interpolated_df.index.astype(int)

# Keep days 112-391 only (other parts are of low quality) and put features on the rows.
nearest_rarefied_interpolated_df = nearest_rarefied_interpolated_df.loc[112:391].T
nearest_rarefied_interpolated_df.index.name = '#OTU ID'

In [None]:
# (number of features, number of days) of the final table; days were sliced to 112-391 above
nearest_rarefied_interpolated_df.shape

In [1]:
# Write the rarefied, re-interpolated table to disk (to_csv default separator: comma).
# NOTE(review): the recorded run of this cell (execution count 1) ended in a NameError
# because the variable is created by the cells above, which had not been run in that
# kernel; run the notebook top to bottom before executing this cell.
nearest_rarefied_interpolated_df.to_csv('male_feces_rarefied_double_interpolated.csv')

NameError: name 'nearest_rarefied_interpolated_df' is not defined