In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import numpy as np
import seaborn as sns
import os
import sys

from xgboost import XGBRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold, KFold,RandomizedSearchCV


import skbio

In [None]:
# Register the project-local source directories on sys.path so the helper
# modules below can be imported.
# TODO load both locations from a CONFIG file

# Directory searched for ProcessingFunctions.
MODULE_PATH = os.path.abspath('/storage/zkarwowska/causality_analysis/VAR_MODELS/helper_functions/')
if MODULE_PATH not in sys.path:
    sys.path.append(MODULE_PATH)

from ProcessingFunctions import MicrobiomeDataPreprocessing

# Directory searched for the `utils` package; a single registration serves
# both `utils` imports that follow.
MODULE_PATH = os.path.abspath('/storage/pszczerbiak/microbiome_interactions_project')
if MODULE_PATH not in sys.path:
    sys.path.append(MODULE_PATH)

from utils.measures import calculate_spearman, calculate_nrmse, inter_dissimilarity
from utils.transformers import CLRTransformer

In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides fit-failure and convergence warnings raised
# during the hyper-parameter searches further down.
warnings.simplefilter(action='ignore')

# Seaborn theme applied to all figures.
sns.set_style(style='darkgrid')

#### READ DATA

In [None]:
# Path of the input CSV that is read into `df` below.
# NOTE(review): absolute, machine-specific path -- consider moving it to a config file.
file_donorA = '/storage/zkarwowska/microbiome-interactions/datasets/processed/ready_datasets/male_rarefied_interpolated_feces.csv'

In [None]:
# Output location: the prediction CSVs are written as
# savefile_name + '<model name>_' + lag + '.csv', so the trailing '/' is required.
savefile_name = '/storage/zkarwowska/causality_analysis/VAR_MODELS/REGRESSION_ANALYSIS_RESULTS/male_lag1/'
# Label used only in the output file names.
# NOTE(review): the lag actually passed to make_supervised is hard-coded separately -- keep the two in sync.
lag = 'lag1'

In [None]:
# Load the input table (first CSV column becomes the index) and order the rows by that index.
df = pd.read_csv(file_donorA, index_col=[0]).sort_index()

#### FILTER

In [None]:
# Helper object reused below for filtering, lagging and train/test splitting.
processing = MicrobiomeDataPreprocessing()
# Drop columns according to the 0.1 threshold.
# NOTE(review): presumably a minimum prevalence/abundance cut-off -- confirm in ProcessingFunctions.
df_filtered = processing.filter_bacteria(df, 0.1)

In [None]:
# Display the dimensions of the filtered table.
df_filtered.shape

#### TRANSFORM USING CLR

In [None]:
# Transform the filtered table with the project's CLRTransformer.
# NOTE(review): presumably a row-wise (axis=1) centred log-ratio with a shared
# pseudocount -- confirm against utils.transformers.
clr_transformer = CLRTransformer(is_pseudo_global=True, axis=1)
# NOTE(review): the transformer is fitted on all rows, before the train/test split below.
df_filtered_clr = clr_transformer.fit_transform(df_filtered)

#### MAKE SUPERVISED

In [None]:
# Build the supervised (lagged) frames for both the untransformed and the CLR data.
# The model functions below select targets with filter(like='lag0') and use the
# remaining columns as predictors, so the output columns must carry 'lag0' in
# the names of the current-time-step columns.
# NOTE(review): the second argument is presumably the number of lags -- confirm in ProcessingFunctions.
df_filtered_supervised = processing.make_supervised(df_filtered, 1)
df_filtered_clr_supervised = processing.make_supervised(df_filtered_clr, 1)

#### SPLIT TO TRAIN AND TEST

In [None]:
# Split both supervised frames with the same setting; the model functions fit
# targets from `train` against predictors from `train_clr`, so the two splits
# must contain the same rows in the same order.
# NOTE(review): 0.2 is presumably the test fraction -- confirm in ProcessingFunctions.
train, test  = processing.train_test_split(df_filtered_supervised, 0.2)
train_clr, test_clr  = processing.train_test_split(df_filtered_clr_supervised, 0.2)

In [None]:
# Number of columns in the filtered table; one model per column is fitted below.
n_features = df_filtered.shape[1]

In [None]:
# Display the number of target columns.
n_features

# NOTE(review): X_train and y_train are not defined at notebook level in the
# visible cells (they only exist as locals inside the *_xgboost_var functions),
# and `model`, `cv` and `n_scores` are not used by any visible cell.
# Confirm whether this snippet is still needed.
model = XGBRegressor(objective='reg:squarederror')
cv = RepeatedKFold(n_splits=2, n_repeats=2, random_state=1)
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')

#### OLS XGBoost VAR

In [16]:
def ols_xgboost_var(train, i, test, train_clr, test_clr):
    """Fit a squared-error linear-booster XGBoost model for one target and predict the test rows.

    The target is the ``i``-th of the columns of ``train`` whose name contains
    ``'lag0'``.  The predictors are all columns of ``train_clr`` / ``test_clr``
    whose name does not contain ``'lag0'``, standardised with statistics
    estimated on the training rows only.  ``n_estimators``, ``reg_alpha`` and
    ``reg_lambda`` are selected by a randomized search (20 candidates, R^2
    scoring, shuffled 5-fold CV), and a model with the winning setting is then
    fitted on the full training set.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame supplying the target column.
    i : int
        Positional index of the target among the ``'lag0'`` columns.
    test : pandas.DataFrame
        Not used by the computation; accepted so the signature stays
        compatible with existing callers.
    train_clr : pandas.DataFrame
        Training frame supplying the predictors; rows must align with ``train``.
    test_clr : pandas.DataFrame
        Test frame supplying the predictors to predict on.

    Returns
    -------
    Predictions for the rows of ``test_clr``, as returned by
    ``XGBRegressor.predict``.
    """
    # Target: i-th 'lag0' column of the training frame.
    y_train = train.filter(like='lag0').iloc[:, i]

    # Predictors: everything in the CLR frames except the 'lag0' columns.
    X_train = train_clr.drop(train_clr.filter(like='lag0').columns, axis=1)
    X_test = test_clr.drop(test_clr.filter(like='lag0').columns, axis=1)

    # Scaling statistics come from the training rows only, then are reused for test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    params = {
        'n_estimators': [25, 50, 100],
        'reg_alpha': [1e-1, 1e-2, 1e-3, 1e-4],
        'reg_lambda': [1e-1, 1e-2, 1e-3, 1e-4],
    }
    folds = 5
    param_comb = 20

    # NOTE(review): both the estimator and the search request 15 workers, which
    # can oversubscribe the machine.
    xgb = XGBRegressor(objective='reg:squarederror', n_jobs=15, booster='gblinear')
    kfld = KFold(n_splits=folds, shuffle=True, random_state=1001)
    random_search = RandomizedSearchCV(
        xgb,
        param_distributions=params,
        n_iter=param_comb,
        scoring='r2',
        n_jobs=15,
        # The splitter itself is passed (not a one-shot generator); with a fixed
        # random_state it yields the same folds.
        cv=kfld,
        verbose=0,
        random_state=1001,
    )
    random_search.fit(X_train_scaled, y_train)

    # The final model must use the same booster the search tuned.  Leaving
    # `booster` out would silently fall back to the default tree booster with
    # hyper-parameters that were selected for the linear one.
    best_xgb = XGBRegressor(
        objective='reg:squarederror',
        booster='gblinear',
        n_jobs=1,
        **random_search.best_params_,
    )
    best_xgb.fit(X_train_scaled, y_train)

    return best_xgb.predict(X_test_scaled)

In [None]:
# Fit one squared-error model per target column and collect the test-set
# predictions, keyed by the positional index of the target.
ols_columns = {}
for i in range(n_features):
    pred = ols_xgboost_var(train, i, test, train_clr, test_clr)
    ols_columns[i] = pred

# Assemble the frame in a single step and write it to the results directory.
ols_predictions = pd.DataFrame(ols_columns)
ols_predictions.to_csv(''.join([savefile_name, 'xgboost_linear_', lag, '.csv']))

#### Poisson XGBoost VAR

In [None]:
def poisson_xgboost_var(train, i, test, train_clr, test_clr):
    """Fit a Poisson-objective linear-booster XGBoost model for one target and predict the test rows.

    The target is the ``i``-th of the columns of ``train`` whose name contains
    ``'lag0'``.  The predictors are all columns of ``train_clr`` / ``test_clr``
    whose name does not contain ``'lag0'``, standardised with statistics
    estimated on the training rows only.  ``n_estimators``, ``reg_alpha`` and
    ``reg_lambda`` are selected by a randomized search (30 candidates, R^2
    scoring, shuffled 5-fold CV), and a model with the winning setting is then
    fitted on the full training set.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame supplying the target column.
    i : int
        Positional index of the target among the ``'lag0'`` columns.
    test : pandas.DataFrame
        Not used by the computation; accepted so the signature stays
        compatible with existing callers.
    train_clr : pandas.DataFrame
        Training frame supplying the predictors; rows must align with ``train``.
    test_clr : pandas.DataFrame
        Test frame supplying the predictors to predict on.

    Returns
    -------
    Predictions for the rows of ``test_clr``, as returned by
    ``XGBRegressor.predict``.
    """
    # Target: i-th 'lag0' column of the training frame.
    y_train = train.filter(like='lag0').iloc[:, i]

    # Predictors: everything in the CLR frames except the 'lag0' columns.
    X_train = train_clr.drop(train_clr.filter(like='lag0').columns, axis=1)
    X_test = test_clr.drop(test_clr.filter(like='lag0').columns, axis=1)

    # Scaling statistics come from the training rows only, then are reused for test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    params = {
        'n_estimators': [25, 50, 100],
        'reg_alpha': [1e-1, 1e-2, 1e-3, 1e-4],
        'reg_lambda': [1e-1, 1e-2, 1e-3, 1e-4],
    }
    folds = 5
    param_comb = 30

    # NOTE(review): both the estimator and the search request 15 workers, which
    # can oversubscribe the machine.
    xgb = XGBRegressor(objective='count:poisson', n_jobs=15, booster='gblinear')
    kfld = KFold(n_splits=folds, shuffle=True, random_state=1001)
    random_search = RandomizedSearchCV(
        xgb,
        param_distributions=params,
        n_iter=param_comb,
        scoring='r2',
        n_jobs=15,
        # The splitter itself is passed (not a one-shot generator); with a fixed
        # random_state it yields the same folds.
        cv=kfld,
        verbose=0,
        random_state=1001,
    )
    random_search.fit(X_train_scaled, y_train)

    # The final model must use the same booster the search tuned.  Leaving
    # `booster` out would silently fall back to the default tree booster with
    # hyper-parameters that were selected for the linear one.
    best_xgb = XGBRegressor(
        objective='count:poisson',
        booster='gblinear',
        n_jobs=1,
        **random_search.best_params_,
    )
    best_xgb.fit(X_train_scaled, y_train)

    return best_xgb.predict(X_test_scaled)

In [None]:
# Fit one Poisson model per target column and collect the test-set
# predictions, keyed by the positional index of the target.
poisson_columns = {}
for i in range(n_features):
    pred = poisson_xgboost_var(train, i, test, train_clr, test_clr)
    poisson_columns[i] = pred

# Assemble the frame in a single step and write it to the results directory.
poisson_predictions = pd.DataFrame(poisson_columns)
poisson_predictions.to_csv(''.join([savefile_name, 'xgboost_poisson_', lag, '.csv']))

#### Tweedie XGBoost VAR

In [None]:
def tweedie_xgboost_var(train, i, test, train_clr, test_clr):
    """Fit a Tweedie-objective linear-booster XGBoost model for one target and predict the test rows.

    The target is the ``i``-th of the columns of ``train`` whose name contains
    ``'lag0'``.  The predictors are all columns of ``train_clr`` / ``test_clr``
    whose name does not contain ``'lag0'``, standardised with statistics
    estimated on the training rows only.  ``n_estimators``, ``reg_alpha``,
    ``reg_lambda`` and ``tweedie_variance_power`` are selected by a randomized
    search (20 candidates, R^2 scoring, shuffled 5-fold CV), and a model with
    the winning setting is then fitted on the full training set.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame supplying the target column.
    i : int
        Positional index of the target among the ``'lag0'`` columns.
    test : pandas.DataFrame
        Not used by the computation; accepted so the signature stays
        compatible with existing callers.
    train_clr : pandas.DataFrame
        Training frame supplying the predictors; rows must align with ``train``.
    test_clr : pandas.DataFrame
        Test frame supplying the predictors to predict on.

    Returns
    -------
    Predictions for the rows of ``test_clr``, as returned by
    ``XGBRegressor.predict``.
    """
    # Target: i-th 'lag0' column of the training frame.
    y_train = train.filter(like='lag0').iloc[:, i]

    # Predictors: everything in the CLR frames except the 'lag0' columns.
    X_train = train_clr.drop(train_clr.filter(like='lag0').columns, axis=1)
    X_test = test_clr.drop(test_clr.filter(like='lag0').columns, axis=1)

    # Scaling statistics come from the training rows only, then are reused for test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    params = {
        'n_estimators': [10, 20, 50, 100, 150],
        'reg_alpha': [1e-1, 1e-2, 1e-3, 1e-4],
        'reg_lambda': [1e-1, 1e-2, 1e-3, 1e-4],
        # XGBoost documents the valid range of tweedie_variance_power as (1, 2).
        # Candidates outside it (such as 0 or 3) fail to fit and only waste
        # search iterations, so the grid stays strictly inside the range.
        'tweedie_variance_power': [1.1, 1.3, 1.5, 1.7, 1.9],
    }
    folds = 5
    param_comb = 20

    # NOTE(review): both the estimator and the search request 15 workers, which
    # can oversubscribe the machine.
    xgb = XGBRegressor(objective='reg:tweedie', n_jobs=15, booster='gblinear')
    kfld = KFold(n_splits=folds, shuffle=True, random_state=1001)
    random_search = RandomizedSearchCV(
        xgb,
        param_distributions=params,
        n_iter=param_comb,
        scoring='r2',
        n_jobs=15,
        # The splitter itself is passed (not a one-shot generator); with a fixed
        # random_state it yields the same folds.
        cv=kfld,
        verbose=0,
        random_state=1001,
    )
    random_search.fit(X_train_scaled, y_train)

    # The final model must use the same booster the search tuned.  Leaving
    # `booster` out would silently fall back to the default tree booster with
    # hyper-parameters that were selected for the linear one.
    best_xgb = XGBRegressor(
        objective='reg:tweedie',
        booster='gblinear',
        n_jobs=15,
        **random_search.best_params_,
    )
    best_xgb.fit(X_train_scaled, y_train)

    return best_xgb.predict(X_test_scaled)

In [None]:
# Fit one Tweedie model per target column and collect the test-set
# predictions, keyed by the positional index of the target.
tweedie_columns = {}
for i in range(n_features):
    pred = tweedie_xgboost_var(train, i, test, train_clr, test_clr)
    tweedie_columns[i] = pred

# Assemble the frame in a single step and write it to the results directory.
tweedie_predictions = pd.DataFrame(tweedie_columns)
tweedie_predictions.to_csv(''.join([savefile_name, 'xgboost_tweedie_', lag, '.csv']))