In [8]:
#basic packages
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
#interpolation
from scipy import interpolate 

#qiime
import qiime2

## 1. read data

In [25]:
# Sample metadata and the feature table, both stored as tab-separated text;
# the first column of the feature table becomes its index.
metadata = pd.read_table('input_data/550_metadata.txt')
feature_table = pd.read_table('input_data/550_male_feces.tsv', index_col=[0])

In [26]:
# Look-up table: sample name -> days since the experiment started.
timestep_dictionary = (
    metadata
    .set_index('sample_name')['days_since_experiment_start']
    .to_dict()
)

In [27]:
# Samples as rows, relabelled with the day each sample was taken on.
samples_by_feature = feature_table.T
samples_by_feature.index = samples_by_feature.index.to_series().map(timestep_dictionary)

# Keep only the features that are non-zero in at least one sample.
observed_somewhere = samples_by_feature.ne(0).any(axis=0)
df = samples_by_feature.loc[:, observed_somewhere]
df.index = df.index.astype(int)

# Restrict to days 113..392 (the "best timepoints") and order chronologically.
in_window = df.index.isin(range(113, 393))
df = df[in_window].sort_index()

In [28]:
# (rows = sampled days, columns = features) left after the filtering above
df.shape

(227, 1399)

## 2. first interpolate

In [None]:
# Every day from the first sampled day up to (not including) the last one.
first_day, last_day = df.index[0], df.index[-1]
full_df = list(range(first_day, last_day))

# Days inside that span for which there is no sample.
sampled_days = set(df.index.astype(int))
missing_tpoints = list(set(full_df) - sampled_days)

# Extend the table with empty (NaN) rows for the unsampled days.
all_days = df.index.union(missing_tpoints)
df_with_missing_tpoints = df.reindex(all_days)

# Only the newly added, still empty rows.
masked_df = df_with_missing_tpoints.loc[missing_tpoints]

In [30]:
def nearest_interp(col, masked_df):
    """Fill the gaps of one column by nearest-neighbour interpolation along the index.

    Parameters
    ----------
    col : hashable
        Label of the column of ``masked_df`` to interpolate.
    masked_df : pandas.DataFrame
        Table whose numeric index is the interpolation axis; NaN marks the
        rows that have to be filled.

    Returns
    -------
    pandas.DataFrame
        Single-column frame named ``str(col)`` sharing the index of
        ``masked_df``. Observed values are kept; every NaN row receives the
        value of the closest observed row. Rows lying before the first or
        after the last observation receive that boundary observation.

    Raises
    ------
    ValueError
        If the column is non-empty but contains no observed value.
    """
    series = masked_df[col]
    is_missing = series.isna().to_numpy()
    nodes = series.index.to_numpy(dtype=float)
    values = series.to_numpy(dtype=float, copy=True)

    if is_missing.size and is_missing.all():
        raise ValueError(
            "column {!r} has no observed values to interpolate from".format(col)
        )

    if is_missing.any():
        known_x = nodes[~is_missing]
        known_y = values[~is_missing]
        if known_x.size == 1:
            # A single observation is the nearest neighbour of every gap.
            values[is_missing] = known_y[0]
        else:
            # Out-of-range nodes are filled with the boundary observations
            # instead of making interp1d raise.
            edge_values = (known_y[np.argmin(known_x)], known_y[np.argmax(known_x)])
            f = interpolate.interp1d(known_x,
                                     known_y,
                                     kind='nearest',
                                     bounds_error=False,
                                     fill_value=edge_values)
            values[is_missing] = f(nodes[is_missing])

    return pd.DataFrame({str(col): values}, index=masked_df.index)

In [None]:
# One interpolated single-column frame per feature, glued back together column-wise.
interpolated_timepoints = [
    nearest_interp(feature, df_with_missing_tpoints)
    for feature in df_with_missing_tpoints.columns
]

nearest_df = pd.concat(interpolated_timepoints, axis='columns')

## 3. remove rare bacteria

In [36]:
# Count, per feature, the timepoints with zero abundance; features that are
# zero at more than 250 timepoints are treated as very rare and dropped.
zero_counts = nearest_df.eq(0).sum(axis=0)
rare_bacteria_df = zero_counts.to_frame()
is_rare = (rare_bacteria_df[0] > 250).to_numpy()
rare_bacteria_col = rare_bacteria_df.index[is_rare]
df_filtered = nearest_df.drop(columns=rare_bacteria_col)

In [37]:
# (rows = days after interpolation, columns = features kept by the rarity filter)
df_filtered.shape

(279, 364)

In [38]:
# Prefix every row label with 's' (e.g. 113 -> 's113').
df_filtered.index = [f's{day}' for day in df_filtered.index]

In [40]:
# Tab-separated feature table used as mbImpute input.
df_filtered.to_csv(path_or_buf='mbimpute_input_data/feature_table.csv', sep='\t')

In [None]:
# Transposed copy (features as rows) used for the distance matrix calculation.
transposed_path = 'mbimpute_input_data/filtered_interpolated_feces_male.tsv'
df_filtered.transpose().to_csv(transposed_path, sep='\t')

## 4. phylogenetic distance matrix
### filter sequences and create phylogenetic tree

In [None]:
!biom convert -i filtered_interpolated_feces_male.tsv -o featrue_table.biom --table-type="OTU table" --to-hdf5

In [9]:
# Load the representative sequences and the BIOM feature table as QIIME 2 artifacts.
sequences = qiime2.Artifact.import_data(
    'FeatureData[Sequence]',
    'sequences.fa',
)
feature_table = qiime2.Artifact.import_data(
    'FeatureTable[Frequency]',
    'featrue_table.biom',
)

In [18]:
# Filter the representative sequences against the feature table and store the result.
filtered_sequences = qiime2.plugins.feature_table.methods.filter_seqs(
    data=sequences,
    table=feature_table,
)
filtered_sequences.filtered_data.save('filtered_sequences.qza')

'filtered_sequences.qza'

In [None]:
#CREATE TREE
qiime phylogeny align-to-tree-mafft-fasttree \
  --i-sequences filtered_sequences.qza \
  --o-alignment aligned-rep-seqs.qza \
  --o-masked-alignment masked-aligned-rep-seqs.qza \
  --o-tree unrooted-tree.qza \
  --o-rooted-tree rooted-tree.qza 

In [19]:
# TRANSFORM TREE INTO DISTANCE MATRIX
from skbio import TreeNode

# The bare name `Artifact` is never imported; it is reached through the
# already imported qiime2 package.
tree = qiime2.Artifact.load('rooted-tree.qza').view(TreeNode)
# Pairwise distances between all tips of the rooted tree.
dist = tree.tip_tip_distances()
distance_matrix_df = dist.to_data_frame()
# SAVE
distance_matrix_df.to_csv('mbimpute_input_data/phylogenetic_distance_matrix.tsv', sep = '\t')

In [20]:
#distance_matrix_df = pd.read_csv('phylogenetic_distance_matrix.tsv', sep = '\t', index_col = [0])

## 5. mbImpute - mbImpute.ipynb

In [3]:
# Feature table written by mbImpute; its first column is used as the index.
imputed_df = pd.read_csv(
    filepath_or_buffer='mbimpute_output/imputed_feature_table.csv',
    index_col=[0],
)

In [4]:
# Store the transposed imputed table as tab-separated text.
imputed_df.transpose().to_csv(
    'mbimpute_output/imputed_feature_table_t.csv',
    sep='\t',
)

## 6. rarefaction after mbImpute

In [None]:
#IMPORT TO BIOM
biom convert -i imputed_feature_table_t.csv -o imputed_feature_table.biom --table-type="OTU table" --to-hdf5

In [12]:
#RAREFY IMPUTED FEATURE TABLE
# `Artifact` is reached through the imported qiime2 package; the bare name is
# never imported in this notebook.
imputed_feature_table = qiime2.Artifact.import_data('FeatureTable[Frequency]', 'imputed_feature_table.biom')
# Subsample every sample to the same depth of 16000.
rarefied_imputed_feature_table = qiime2.plugins.feature_table.methods.rarefy(imputed_feature_table, sampling_depth = 16000)

In [15]:
# View the rarefied table artifact as a pandas DataFrame.
rarefied_table_artifact = rarefied_imputed_feature_table.rarefied_table
rarefied_imputed_feature_table_df = rarefied_table_artifact.view(pd.DataFrame)

## 7. second interpolation

In [27]:
# Cast the row labels to int so they can be compared with day numbers.
rarefied_imputed_feature_table_df.index = rarefied_imputed_feature_table_df.index.astype(int)

# Days in 191..390 that are absent from the rarefied table.
present_days = set(rarefied_imputed_feature_table_df.index)
missing_timepoints = list(set(range(191, 391)) - present_days)

# Add those days as empty (NaN) rows.
extended_index = rarefied_imputed_feature_table_df.index.union(missing_timepoints)
table_with_missing_tpoints = rarefied_imputed_feature_table_df.reindex(extended_index)

In [31]:
# Nearest-neighbour interpolation, column by column, then reassemble the table.
interpolated_timepoints = [
    nearest_interp(feature, table_with_missing_tpoints)
    for feature in table_with_missing_tpoints.columns
]

impute_rarefied_interpolated_twice_df = pd.concat(interpolated_timepoints, axis='columns')

In [32]:
# Persist the imputed, rarefied and re-interpolated table.
impute_rarefied_interpolated_twice_df.to_csv(
    path_or_buf='mbimpute_output/impute_rarefied_interpolated_twice_df.csv'
)

### sVAR model

### check prediction

In [None]:
# NOTE(review): `data` is not defined in any visible cell — presumably the
# rarefied, non-imputed table; confirm and define it before this cell so that
# Restart & Run All works.
history = data.loc[371:391]

# Prediction tables (presumably sVAR output — verify); their columns are
# relabelled with the column labels of `history`.
mbimpute_pred = pd.read_csv('sVAR2_interpolated_df.csv')
mbimpute_pred.columns = history.columns
rarefied_pred = pd.read_csv('sVAR2_rarefied_df.csv')
rarefied_pred.columns = history.columns

# Same label window (371..391) taken from the imputed, rarefied, interpolated table.
mbimpute = impute_rarefied_interpolated_twice_df.loc[371:391]

In [None]:
# Write both observed windows to disk.
for frame, path in ((mbimpute, 'mbimpute_history.csv'),
                    (history, 'rarefied_history.csv')):
    frame.to_csv(path)

In [None]:
# Negative predicted values are replaced by zero.
rarefied_pred = rarefied_pred.clip(lower=0)
mbimpute_pred = mbimpute_pred.clip(lower=0)

In [None]:
# Give each prediction table the row labels of the observed window it is compared with.
mbimpute_pred = mbimpute_pred.set_axis(mbimpute.index, axis=0)
rarefied_pred = rarefied_pred.set_axis(history.index, axis=0)

## verify prediction

In [None]:
from scipy import stats

In [None]:
# Spearman correlation between the observed and the predicted series of every
# column (non-imputed data). Both the coefficient and its p-value are kept.
RHO = []
PVAL = []

for col in history.columns:
    x = history[col]
    y = rarefied_pred[col]

    rho, pval = stats.spearmanr(x, y)
    RHO.append(rho)
    PVAL.append(pval)

In [None]:
# Spearman correlation between the observed and the predicted series of every
# column (imputed data). The p-values go to their own list so they do not
# overwrite the list belonging to the non-imputed comparison.
RHO_imputed = []
PVAL_imputed = []

for col in history.columns:
    x = mbimpute[col]
    y = mbimpute_pred[col]

    rho, pval = stats.spearmanr(x, y)
    RHO_imputed.append(rho)
    PVAL_imputed.append(pval)

In [None]:
# Long-format table of the Spearman coefficients, labelled by data set.
rho = pd.DataFrame(RHO_imputed, columns = ['rho'])
rho['type'] = 'imputed'
rho['bacteria'] = history.columns

rho_r = pd.DataFrame(RHO, columns = ['rho'])
rho_r['type'] = 'not_imputed'
rho_r['bacteria'] = history.columns

# DataFrame.append no longer exists in pandas 2.x; pd.concat stacks the two
# frames the same way and keeps each frame's own row labels.
RHO_DF = pd.concat([rho, rho_r])

In [None]:
# Keep only the rows without any missing value.
complete_rows = RHO_DF.notna().all(axis=1).to_numpy()
RHO_DF = RHO_DF[complete_rows]

In [None]:
# Features whose non-imputed prediction is negatively correlated with the observations.
rho_r.loc[rho_r['rho'] < 0, 'bacteria'].tolist()

In [None]:
# Imputed-data coefficient of one specific sequence.
rho.loc[rho['bacteria'].eq('TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAAGAGCAAGTCTGATGTGAAAGGCTGGGGCTTAACCCCAGGACTG')]

In [None]:
# Box plot of the coefficients per data set with the individual points drawn on top.
sns.boxplot(data=RHO_DF, x='type', y='rho', color='White', linewidth=.8, width=.4)
sns.swarmplot(data=RHO_DF, x='type', y='rho', palette='Set1', s=7, alpha=.4)
plt.savefig('spearman_rho_imputed_svar2.png')

In [None]:
# Labels of the 20 columns with the lowest mean value in the imputed window.
mean_abundance = mbimpute.describe().loc['mean']
rarest_bacteria = mean_abundance.sort_values().head(20).index