<h1 align="center">Machine learning-based prediction of intraoperative cerebrospinal fluid leakage in endoscopic transsphenoidal pituitary surgery: a pilot study <br><br>[Statistical Analysis]</h1>

<h2>[1] Library</h2>

In [None]:
# OS library
import os
import sys
import argparse
import random
from math import sqrt

# Analysis
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.proportion import proportion_confint

import pingouin as pg
%matplotlib inline

<h2>[2] Data Preprocessing</h2>

<h4>[-] Load the database</h4>

In [None]:
# Build the workbook path from the first sys.path entry and read it with pandas.
# NOTE(review): sys.path[0] is not guaranteed to be the notebook's folder in
# every kernel — confirm the workbook resolves where expected.
file = os.path.join(sys.path[0], "Pituitary DB.xlsx")
db = pd.read_excel(file)

# Report the size of the raw table (rows = patients, as labelled below).
print(f"N° of patients: {len(db)}")
print(f"N° of columns: {db.shape[1]}")

<h4>[-] Drop unwanted columns</h4>

In [None]:
# Columns excluded from the analysis.
unwanted_columns = [
    'Patient', 'CS', 'Birthdate',
    'Surgery Date', 'Dural reconstruction technique', 'Post-op fistula (y/n)',
    'Park Grade', 'Resection', 'R ratio',
]
df = db.drop(columns=unwanted_columns)

# Note: this count is taken before 'Diagnosis' is removed further down.
print(f"N° of columns for analysis: {df.shape[1]}")
print("\n Columns' name: ", df.columns)

# Keep complete cases only, restrict to adenomas, then discard the
# now-constant 'Diagnosis' column.
df = (
    df.dropna()
      .loc[lambda frame: frame['Diagnosis'] == 'adenoma']
      .drop(columns=['Diagnosis'])
)

# Recode the Hardy suprasellar letter grades A-E as the integers 0-4.
hardy_letters = ["A", "B", "C", "D", "E"]
hardy_codes = [0, 1, 2, 3, 4]
df['Hardy (suprasellar)'] = df['Hardy (suprasellar)'].replace(hardy_letters, hardy_codes)

# Store the X, Y, Z and ICD columns as floats.
float_columns = ['X', 'Y', 'Z', 'ICD']
df[float_columns] = df[float_columns].astype(float)

print(f"\n\n N° of adenoma patients with all values: {len(df)}")

df.head(2)

<h2>[3] Count and Frequency</h2>

In [None]:
# Non-null counts of every other column, grouped by leakage status and a
# second variable ('...' looks like a placeholder column name — replace it).
grouping_keys = ['Intra-op leakage (y/n)', '...']
df.groupby(grouping_keys).count()

In [None]:
# Summary statistics for one column ('...' looks like a placeholder column
# name — replace it with the variable of interest).
df.loc[:, '...'].describe()

<h2>[4] Statistical Association</h2>
<ul>
    <li>Levene's test is an inferential statistic used to assess the equality of variances of a variable across two or more groups. If the p-value is greater than 0.05, there is no evidence of a difference in variances between the groups.</li>
    <li>A one-way ANOVA (F-test) is performed if the variances are equal.</li>
</ul>

In [None]:
# Split the variable of interest by intra-operative leakage status.
# ('...' looks like a placeholder — replace it with the column to test.)
age_nonfistola = df[df['Intra-op leakage (y/n)'] == 0]['...']
age_fistola = df[df['Intra-op leakage (y/n)'] == 1]['...']

print(age_nonfistola.shape)

# Levene's test for equality of variances, then one-way ANOVA on the two groups.
levene_result = stats.levene(age_nonfistola, age_fistola)
print(levene_result)
print(stats.f_oneway(age_nonfistola, age_fistola))

# Assume equal variances only when Levene's p-value is at least 0.05;
# otherwise ttest_ind runs Welch's unequal-variance test (equal_var=False).
equal_variances = levene_result.pvalue >= 0.05
print(stats.ttest_ind(age_nonfistola, age_fistola, equal_var=equal_variances))

In [None]:
# Cross-tabulate the chosen variable against leakage status
# ('...' looks like a placeholder column name — replace it).
sex_ct = pd.crosstab(df['...'], df['Intra-op leakage (y/n)'])
print("--- *** Contingency Table *** --- \n", sex_ct)

# Chi-square test of independence with the continuity correction disabled.
print("\n--- *** Chi-Square *** ---")
stat, p, dof, expected = stats.chi2_contingency(sex_ct, correction=False)
print(f"DOF={dof:d}")
print("Expected values = ", expected)
print("p-value = ", p)
print("stat = ", stat)

# Compare the statistic with the chi-square critical value at the 0.95 quantile.
prob = 0.95
critical = stats.chi2.ppf(prob, dof)
verdict = (
    'Dependent (reject H0)'
    if abs(stat) >= critical
    else 'Independent (fail to reject H0)'
)
print(f'\n{verdict}, [Critical: {critical}]')

<h4>[-] Holm-Bonferroni correction</h4>

In [None]:
# Uncorrected p-values from the tests run earlier in the notebook
# ('[...]' looks like a placeholder — fill in the actual values before running).
pvals = [...]

# Holm step-down adjustment at alpha = 0.05.
significant, adjusted = pg.multicomp(pvals, alpha=0.05, method='holm')
tab = {'Uncorrected': pvals, 'Adjusted': adjusted, 'Significant': significant}

# Keep the result under its own name: rebinding `df` here would discard the
# patient DataFrame that the rest of the notebook reads from `df`.
holm_table = pd.DataFrame(tab)
holm_table

<h2>[5] Multivariable Analysis</h2>

<h4>[-] One-hot (dummy) encoding</h4>

In [None]:
# One-hot encode the categorical 'Secreting Status' column.
dummy_v = ['Secreting Status']

# Request float dummies explicitly: pandas >= 2.0 returns boolean columns by
# default, and a frame mixing bool and float columns is converted to an
# object array, which statsmodels refuses to fit.
df = pd.get_dummies(df, columns=dummy_v, prefix=dummy_v, dtype=float)
df.head(5)

In [None]:
# Columns used for the multivariable model
# ('...' looks like a placeholder — list the real column names).
cols_to_keep = ['...']

# Take an explicit copy so that adding a column below works on an independent
# frame instead of a slice of `df` (avoids SettingWithCopyWarning).
data = df[cols_to_keep].copy()

# manually add the intercept
data['intercept'] = 1.0

# Only the last expression of a cell is displayed, so the former mid-cell
# `data.head()` showed nothing and has been dropped.
data.columns

In [None]:
# Predictor columns for the logistic regression
# ('...' looks like a placeholder — list the real column names).
train_cols = ['...']

# Binary outcome: intra-operative leakage; rows with missing values are dropped.
logit = sm.Logit(
    endog=data['Intra-op leakage (y/n)'],
    exog=data[train_cols],
    missing='drop',
)
result = logit.fit()

In [None]:
# Regression summary with confidence intervals at alpha = 0.05.
result.summary(alpha=0.05)

In [None]:
# Pull coefficients, p-values and confidence bounds from the fitted model.
coef = result.params
p = result.pvalues
conf = result.conf_int(alpha=0.05)

# Append the coefficients as a third column, label the columns, and
# exponentiate all three so the table holds odds ratios and their bounds.
conf = np.exp(
    conf.assign(OR=coef).set_axis(['2.5%', '97.5%', 'OR'], axis='columns')
)

# p-values are attached after exponentiation so they stay on their own scale.
conf['p-value'] = p

<h4>[-] Export Multivariable as Excel file</h4>

In [None]:
# Write the `conf` table to an Excel workbook (path is relative to the working directory).
conf.to_excel(excel_writer="multivariable.xlsx")