In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
import seaborn as sns; sns.set_style("whitegrid", {'axes.grid' : False})
import skbio
from scipy.spatial import distance

import time
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
import os

import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense,Dropout
from sklearn.model_selection import KFold

In [96]:
def dataframe_to_dataset(data_df):
    dataframe = df.copy()
    labels = dataframe.pop("label")
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    ds = ds.shuffle(buffer_size=len(dataframe))
    return ds

def pred_dataframe_to_dataset(dataframe):
    dataframe = dataframe.copy()
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe)))
    ds = ds.shuffle(buffer_size=len(dataframe))
    return ds

In [67]:
def encode_numerical_feature(feature, name, dataset):
    # Create a Normalization layer for our feature
    normalizer = Normalization()

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the statistics of the data
    normalizer.adapt(feature_ds)

    # Normalize the input feature
    encoded_feature = normalizer(feature)
    return encoded_feature

# Data Preparation

## Data Setup

In [10]:
data_path = "../../../data/starting_data/not-filtered-data/"
features_path = os.path.join(data_path,"OTU_FIX_feature-table-l7.csv")
edited_metadta = f"{data_path}/../edited_metadata.tsv"
data_df = pd.read_csv(features_path,sep='\t')
    
# if drop_unassigned:
data_df = data_df.loc[~data_df["OTU ID"]
                          .str.contains("Unassigned")].reset_index(drop=True)

#transform on groupby perform the action but keep all rows and just duplicate the values to match original DF
data_df_summed = data_df.groupby(['OTU ID']).transform("sum")
# gropuby removes the column on wihch it was used. So I'm copying it from the original data_df 
data_df_summed['OTU ID'] = data_df['OTU ID']
# reorder the columns such the "OTU ID" is first column
data_df_summed = data_df_summed[['OTU ID']+[c for c in data_df_summed if c not in ['OTU ID']]]

# Drop the duplicates rows now with same values thanks to the transform method
data_df_summed.drop_duplicates(inplace=True)

meta_df = pd.read_csv(edited_metadta,sep='\t').rename(columns={"sampleID":"sample_name"})

In [39]:
# replace the "sample_time" with numbers corresponding the time 
sample_time_enum = {sample_time:i for i,sample_time in enumerate(meta_df.sort_values('visit_age_mo').sample_time.unique().tolist())}
sample_time_enum['sick']=len(sample_time_enum)+5

data_df_indexed = data_df.set_index("OTU ID",drop=True)
meta_df = meta_df.assign(sample_time_enum = meta_df.sample_time)
meta_df.replace({"sample_time_enum":sample_time_enum},inplace=True)
meta_df = meta_df[['sample_time_enum']+[c for c in meta_df if c not in ['sample_time_enum']]]
meta_df.head()

data_idx = meta_df.shape[1]

In [20]:
merged_df = meta_df.merge(data_df_indexed.T,right_index=True,left_on=['sample_name'])

## Creatin the Training Data 

Expects to have a `merged_df` dataframe where the rows are subjects and columns are both metadata and bacterai names.

Expects `data_idx` variable to be the first idx of the data (bacteria) in the `merged_df`. before that are the meta columns


In [32]:
meta_df.visit_age_mo

0       0.9
1       0.1
2       0.2
3       0.7
4       1.1
       ... 
1083    3.4
1084    3.9
1085    6.3
1086    6.3
1087    4.3
Name: visit_age_mo, Length: 1088, dtype: float64

In [61]:
control_merged_data = merged_df[merged_df.symptoms == "Control"]
data_df = control_merged_data.iloc[:,data_idx:]
data_df['label'] = control_merged_data.visit_age_mo

In [108]:
test_df = data_df.sample(frac=0.2,random_state=666)
train_df = data_df.drop(test_df.index)

X_train = train_df.values.astype(np.float32)
Y_train = train_df.label.values.astype(np.float32)
Y_train = Y_train[...,np.newaxis]
# Y_data = control_merged_data.visit_age_mo.values.astype(np.float32)

In [97]:
train_ds = dataframe_to_dataset(train_df)
test_df = dataframe_to_dataset(test_df)

train_ds = train_ds.batch(32)
val_ds = val_ds.batch(32)

NameError: name 'df' is not defined

In [76]:
class CustomModel(Model):
    
    def __init__(self, **kwargs):
        super(CustomModel, self).__init__(**kwargs)
        self.dense1 = Dense(128, activation='relu',name="Dense_1" )
        self.dropout1 = Dropout(0.2)
        self.dense2 = Dense(64, activation='relu',name="Dense_2")
        self.dropout2 = Dropout(0.2)
        self.dense3 = Dense(32, activation='relu',name="Dense_3")
        self.predict = Dense(1, activation='sigmoid')
        
    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.dense3(x)

In [77]:
model= CustomModel(name='customModel')

In [81]:
model.compile(optimizer='Adam',
              loss='mean_squared_error',
              metrics=['accuracy'])

In [107]:
history = model.fit(X_train,Y_train,
                    batch_size=32,
                    epochs= 10)
# history = my_custom_model.fit(train_ds, y_train,
#                     batch_size= 64,
#                     epochs= 10,
#                     validation_split=0.2)

Epoch 1/10


TypeError: 'NoneType' object is not callable

In [None]:
def create_data(df,data_col_pos,elements_per_sample = 3):
    """
    elements_per_sample: how many examples will appear together for the training
    """


    train_data = list()
    is_sick_data = list()
    meta_data = list()
    records_ids = df.record_id.unique().tolist()

    #build the data for each record
    for rid in records_ids:
        subject_df = df[df.record_id == rid]

        # Transform the df to numpy and take only the columns for the data (and not the metadata)
        full_data_arr = subject_df.iloc[:,data_col_pos:].to_numpy()
        sample_meta_data = subject_df.iloc[:,data_col_pos:].to_numpy()
        n = subject_df.shape[0]

        # get the idxs ot the elements to take for each training sample
        sets_idxs = list(combinations(np.arange(0,n),elements_per_sample))

        subject_train_samples = np.take(full_data_arr,sets_idxs,axis=0)

        # decide if this each sample is sick or not if at least one of the samples is tagged as sick
        # TODO - using AP case is because the symptoms might have probmels -  sample 29 is assigned as "AP Case" but        doesn't have "Symptomatic" in the symptoms
        is_sick = np.array([any(subject_df.iloc[list(idx)].case_id == "AP Case") for idx in sets_idxs])

        train_data.append(subject_train_samples)
        is_sick_data.append(is_sick)
        
#         if (subject_df.case_id == 'AP Case').any():
#             symptoms = subject_df.symptoms.unique()
#             if len(symptoms) == 1:
#                 print("#################################")
#                 print(f"Printing symptoms for id {rid}")
#                 print(subject_df.symptoms.unique())
    return train_data, is_sick_data