In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

from scipy import interpolate 
from scipy.interpolate import pchip_interpolate
from scipy.interpolate import BSpline, splev
from sklearn.metrics import mean_squared_error

%matplotlib inline

sns.set_theme()
import warnings
warnings.filterwarnings('ignore')

In [None]:
%cd Desktop/microbiome_predicion/

In [None]:
# Read the tab-separated raw table; its first column becomes the row index.
df = pd.read_csv(
    '1.data_overview/raw_male_feces.csv',
    index_col = [0],
    sep = '\t',
)

In [None]:
#find missing timepoints
# Every integer between the smallest and the largest index label that has no
# measured row is treated as a missing time point.
start_df = int(df.index.min())
end_df = int(df.index.max())

full = list(range(start_df, end_df))

# Rows that exist but hold only NaN count as missing as well. Without this,
# re-running the cell after the reindex below would report no missing points,
# because the placeholder rows are already part of the index.
observed_tpoints = df.dropna(how='all').index.astype(int)
missing_tpoints = sorted(set(full) - set(observed_tpoints))  # sorted -> deterministic order

#add missing rows
df = df.reindex(df.index.union(missing_tpoints))
df_with_missingpoints = df.copy() #copy of df that contains an all-NaN row for every missing time point

In [None]:
#mask a random 3% of the time points, but never a true missing time point
#or the time point directly before/after one
# NOTE(review): the earlier comment said 10% while the code uses 0.03 - confirm
# which fraction is intended.

plus_one = [i+1 for i in missing_tpoints]
minus_one = [i-1 for i in missing_tpoints]

true_missing = missing_tpoints + plus_one + minus_one

# The candidates are index *labels*, so they are selected by label. Positional
# `.iloc` only gives the same rows when the index happens to start at 0.
# `Index.difference` returns the remaining labels sorted.
mask_L = df.index.difference(true_missing).tolist()
mask_L = mask_L[::2] #keep every second candidate so two masked points are never adjacent

random.seed(12)
number_masked_points = int(df.shape[0] * 0.03)
random_mask = (random.sample(mask_L, number_masked_points))

In [None]:
# Work on a copy so df keeps the true values; every row picked in random_mask
# is blanked out across all columns.
single_masked = df.copy()
single_masked.loc[random_mask, :] = np.nan

## Interpolation

In [None]:
#linear interpolation
def linear_interp(col, masked_df):
    """Fill the NaN entries of one column by piecewise-linear interpolation.

    Parameters
    ----------
    col : label of the column of ``masked_df`` to process.
    masked_df : DataFrame whose index supplies the x-coordinates and whose
        NaN entries in ``col`` are the points to fill in.

    Returns
    -------
    DataFrame with the index of ``masked_df`` and a single column named
    ``str(col)`` holding the observed values plus the interpolated ones.

    Notes
    -----
    ``interp1d`` is used with its default bounds checking, so a NaN lying
    outside the range of observed index values makes it raise ``ValueError``.
    """
    series = masked_df[col]
    observed = series.dropna()

    known_x = observed.index                           # interpolation nodes
    missing_x = series.index[series.isna().to_numpy()]  # points to fill in

    interpolator = interpolate.interp1d(known_x, observed.values, kind='linear')
    filled_values = interpolator(missing_x)

    column_name = str(col)
    result = pd.DataFrame(index=masked_df.index)
    result.loc[known_x, column_name] = observed.values
    result.loc[missing_x, column_name] = filled_values

    return result


# Interpolate every column on its own, then join the one-column results side by side.
app = [linear_interp(otu, single_masked) for otu in df.columns]

linear_df = pd.concat(app, axis=1)
# A repeated column name keeps only its first occurrence.
linear_df = linear_df.loc[:, ~linear_df.columns.duplicated()]

In [None]:
#cubic interpolation
def cubic_interp(col, masked_df):
    """Fill the NaN entries of one column with ``interp1d(kind='cubic')``.

    Parameters
    ----------
    col : label of the column of ``masked_df`` to process.
    masked_df : DataFrame whose index supplies the x-coordinates and whose
        NaN entries in ``col`` are the points to fill in.

    Returns
    -------
    DataFrame with the index of ``masked_df`` and a single column named
    ``str(col)`` holding the observed values plus the interpolated ones.
    """
    column = masked_df[col]
    is_missing = column.isna()

    known = column.dropna()
    nodes_x = known.index                # observed points (spline nodes)
    targets_x = column[is_missing].index  # points to fill in

    cubic = interpolate.interp1d(nodes_x, known.values, kind='cubic')
    estimates = cubic(targets_x)

    out_name = str(col)
    filled = pd.DataFrame(index=masked_df.index)
    filled.loc[nodes_x, out_name] = known.values
    filled.loc[targets_x, out_name] = estimates

    return filled


# One cubic interpolation per column, collected and joined column-wise.
app = [cubic_interp(otu, single_masked) for otu in df.columns]

cubic_df = pd.concat(app, axis=1)
# A repeated column name keeps only its first occurrence.
cubic_df = cubic_df.loc[:, ~cubic_df.columns.duplicated()]
# Negative interpolated values are replaced by zero.
cubic_df = cubic_df.mask(cubic_df < 0, 0)

In [None]:
#nearest interpolation
def nearest_interp(col, masked_df):
    """Fill the NaN entries of one column with the value of the nearest observed point.

    Parameters
    ----------
    col : label of the column of ``masked_df`` to process.
    masked_df : DataFrame whose index supplies the x-coordinates and whose
        NaN entries in ``col`` are the points to fill in.

    Returns
    -------
    DataFrame with the index of ``masked_df`` and a single column named
    ``str(col)`` holding the observed values plus the filled-in ones.
    """
    values = masked_df[col]
    present = values.dropna()

    present_idx = present.index            # observed points
    absent_idx = values[values.isna()].index  # points to fill in

    lookup = interpolate.interp1d(present_idx, present.values, kind='nearest')

    label = str(col)
    table = pd.DataFrame(index=masked_df.index)
    table.loc[present_idx, label] = present.values
    table.loc[absent_idx, label] = lookup(absent_idx)

    return table


# Nearest-neighbour fill for each column, then a column-wise join of the results.
app = [nearest_interp(otu, single_masked) for otu in df.columns]

nearest_df = pd.concat(app, axis=1)
# A repeated column name keeps only its first occurrence.
nearest_df = nearest_df.loc[:, ~nearest_df.columns.duplicated()]

In [None]:
#PCHIP interpolation
def pchip_interp(col, masked_df):
    """Fill the NaN entries of one column with ``pchip_interpolate``.

    Parameters
    ----------
    col : label of the column of ``masked_df`` to process.
    masked_df : DataFrame whose index supplies the x-coordinates and whose
        NaN entries in ``col`` are the points to fill in.

    Returns
    -------
    DataFrame with the index of ``masked_df`` and a single column named
    ``str(col)`` holding the observed values plus the interpolated ones.
    """
    source = masked_df[col]
    measured = source.dropna()

    measured_x = measured.index                      # observed points
    gap_x = source.index[source.isna().to_numpy()]   # points to fill in

    gap_values = pchip_interpolate(measured_x, measured.values, gap_x)

    header = str(col)
    completed = pd.DataFrame(index=masked_df.index)
    completed.loc[measured_x, header] = measured.values
    completed.loc[gap_x, header] = gap_values

    return completed


# PCHIP fill for each column, then a column-wise join of the results.
app = [pchip_interp(otu, single_masked) for otu in df.columns]

pchip_df = pd.concat(app, axis=1)
# A repeated column name keeps only its first occurrence.
pchip_df = pchip_df.loc[:, ~pchip_df.columns.duplicated()]

In [None]:
#bspline interpolation
def bspline_inter(col, masked_df):
    """Fill the NaN entries of one column with an interpolating cubic B-spline.

    Parameters
    ----------
    col : label of the column of ``masked_df`` to process.
    masked_df : DataFrame whose index supplies the x-coordinates (expected to
        be increasing) and whose NaN entries in ``col`` are the points to fill in.

    Returns
    -------
    DataFrame with the index of ``masked_df`` and a single column named
    ``str(col)`` holding the observed values plus the interpolated ones.

    Notes
    -----
    ``BSpline(t, c, k)`` takes a knot vector and spline *coefficients*.
    Passing the sample positions and sample values straight to it builds a
    curve that does not pass through the data. ``make_interp_spline`` solves
    for the coefficients so that the spline reproduces every observed value.
    It needs at least four observed points for a cubic.

    NOTE(review): an interpolating cubic B-spline is expected to be very close
    to ``interp1d(kind='cubic')``; confirm whether a smoothing spline was
    intended for this comparison instead.
    """
    df_interpolated = pd.DataFrame(index = masked_df.index)

    tmp = masked_df[col]
    observed = tmp.dropna()
    base_nodes = observed.index                 # observed points (spline nodes)
    interpolated_nodes = tmp[tmp.isna()].index  # points to fill in

    f = interpolate.make_interp_spline(np.asarray(base_nodes, dtype=float),
                                       observed.values,
                                       k=3)
    new_y = f(np.asarray(interpolated_nodes, dtype=float))

    name = str(col)
    df_interpolated.loc[base_nodes, name] = observed.values
    df_interpolated.loc[interpolated_nodes, name] = new_y

    return df_interpolated


# B-spline fill for each column, then a column-wise join of the results.
app = [bspline_inter(otu, single_masked) for otu in df.columns]

bspline_df = pd.concat(app, axis=1)
# A repeated column name keeps only its first occurrence.
bspline_df = bspline_df.loc[:, ~bspline_df.columns.duplicated()]
# Negative interpolated values are replaced by zero.
bspline_df = bspline_df.mask(bspline_df < 0, 0)

## Analyse interpolation efficacy

In [None]:
#nRMSE
def calculate_nRMSE(interpolated_df, interpolation_type, true_df=None, mask=None):
    """Compute the range-normalised RMSE of the masked points, one value per column.

    Parameters
    ----------
    interpolated_df : DataFrame holding the interpolated values; it must
        contain the masked index labels and the columns of ``true_df``.
    interpolation_type : value written to the ``TYPE`` column of the result.
    true_df : DataFrame with the true values. Defaults to the notebook-level
        ``df``.
    mask : index labels of the masked rows. Defaults to the notebook-level
        ``random_mask``.

    Returns
    -------
    DataFrame with columns ``OTU`` (column label), ``nRMSE`` and ``TYPE``.
    ``nRMSE`` is the RMSE over the masked rows divided by (max - min) of the
    true values on those rows. It is NaN when that range is zero, so a
    constant column cannot turn a later average into inf.
    """
    if true_df is None:
        true_df = df
    if mask is None:
        mask = random_mask

    # The mask holds index labels, so rows are selected with .loc; positional
    # .iloc would pick different rows whenever the index does not start at 0.
    true_rows = true_df.loc[mask]
    interpolated_rows = interpolated_df.loc[mask]

    records = []
    for otu in true_rows.columns:
        true_col = true_rows[otu].to_numpy(dtype=float)
        interpolated_col = interpolated_rows[otu].to_numpy(dtype=float)

        # Plain numpy RMSE: avoids mean_squared_error(..., squared=False),
        # which recent scikit-learn releases no longer accept.
        rmse = np.sqrt(np.mean((true_col - interpolated_col) ** 2))

        otu_minmax = true_col.max() - true_col.min()
        nrmse = rmse / otu_minmax if otu_minmax > 0 else np.nan

        records.append((otu, nrmse))

    nrmse_df = pd.DataFrame(records, columns = ['OTU', 'nRMSE'])
    nrmse_df['TYPE'] = interpolation_type

    return nrmse_df

In [None]:
#calculate error for each interpolated dataframe
def error_for_diff_interpolations(function):
    """Apply an error metric to every interpolated frame and stack the results.

    Parameters
    ----------
    function : callable invoked as ``function(interpolated_frame, method_name)``
        for each interpolation method; each call must return a DataFrame.

    Returns
    -------
    DataFrame produced by concatenating the per-method results in the order
    pchip, linear, bspline, cubic, nearest.
    """
    frames_by_method = {
        'pchip': pchip_df,
        'linear': linear_df,
        'bspline': bspline_df,
        'cubic': cubic_df,
        'nearest': nearest_df,
    }

    per_method_errors = [
        function(frame, method) for method, frame in frames_by_method.items()
    ]

    return pd.concat(per_method_errors)

In [None]:
nRMSE_df = error_for_diff_interpolations(calculate_nRMSE)
# Average the nRMSE column only: the frame also holds the non-numeric OTU
# column, and pandas >= 2.0 raises instead of silently dropping it in .mean().
mean_rmse_error = nRMSE_df.groupby('TYPE')[['nRMSE']].mean()
mean_rmse_error = mean_rmse_error.sort_values(by = ['nRMSE'])