In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from hotelling.stats import hotelling_t2
import statsmodels.api as sm 
from sklearn.model_selection import train_test_split 



In [16]:
# Load data
df = pd.read_csv('../figures/out/gdd_all_reduced_forR.csv',index_col=0)

# PCA
features = [f'G{i}' for i in range(30)]
x = df.loc[:, features].values
y = df.loc[:,['chol']].values
x = StandardScaler().fit_transform(x)
pca = PCA()
principalComponents = pca.fit_transform(x)
evr = pca.explained_variance_ratio_.cumsum()
for i,j in enumerate(evr):
    if j > 0.99:
        nPCs = i + 1
        break
print(f'using {nPCs} components for 99% variance')
pca = PCA(n_components=nPCs)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
            , columns = [f'PC{x}' for x in range(1,nPCs+1)])
finalDf = pd.concat([principalDf, df[['chol']]], axis = 1)

using 18 components for 99% variance


In [18]:
# Hotelling p https://dionresearch.github.io/hotelling/modules.html#module-hotelling.stats
# have to use PCs instead of raw features I think because some features are perfectly correlated?
    # If I don't use PCs it returns an error
x = finalDf[finalDf['chol'] == 15].drop(columns = ['chol','replicate']).to_numpy()
y = finalDf[finalDf['chol'] == 30].drop(columns = ['chol','replicate']).to_numpy()
print(f'hotelling P = {hotelling_t2(x,y)[2]}')

hotelling P = 1.685459228578796e-224


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,chol
0,-1.188550,-0.403167,-0.847201,-0.244070,0.769802,-0.038531,0.122730,0.191951,0.098074,-0.142812,0.069405,0.084278,-0.533399,0.391948,-0.995274,0.262983,0.757667,-0.480924,30
1,-2.251266,0.072179,-0.211558,1.768779,0.029938,-0.333153,-0.238478,0.497974,0.295936,1.357849,-0.732077,1.027669,-0.028490,-0.141114,-0.477535,-0.205660,-0.409325,-0.476576,30
2,-4.564615,0.038972,-1.501685,0.040163,0.642489,-0.230758,-0.013695,0.791237,0.106016,0.192871,-0.012690,-0.298599,-0.041986,-0.105504,-0.065586,0.250573,0.542786,-0.276943,15
3,-7.154549,-1.180663,0.150933,1.846764,1.719381,-0.792119,-0.050332,1.282383,1.025369,0.351295,-0.268489,0.188643,0.386835,-0.028178,-0.466722,0.156848,-0.379060,-0.161714,15
4,-1.035901,-1.133541,0.372751,1.442173,1.575153,-0.565982,0.018820,0.846137,0.970088,0.262934,-0.298256,-0.050256,-0.672395,0.044864,-0.490637,0.010630,-0.239737,-0.150163,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,-0.415915,-0.530674,1.004996,1.722486,-0.971555,-0.865749,-0.097488,-1.218192,1.032596,0.387952,-0.425546,0.840527,-0.628902,0.086260,0.219858,-0.010726,0.360622,0.628127,30
2396,0.135736,-0.293618,-0.391256,0.783866,0.513609,-0.604419,-0.195128,1.479319,0.777267,0.604625,-0.303472,-0.342986,0.100610,-0.004228,0.277071,0.192493,0.112294,0.200973,30
2397,0.433967,0.002183,-0.268777,0.969792,-0.745067,-0.323320,0.117998,-1.939640,0.243674,-0.229121,-0.168863,0.671679,0.069909,-0.555022,0.527312,-0.066013,0.251696,0.744281,30
2398,-5.796985,2.198322,3.503907,-0.564820,-1.343811,-0.174978,-0.306350,2.157808,-1.028721,-0.598853,-1.129394,0.745617,0.442336,0.345608,-0.853204,0.071351,0.306355,0.272924,30


In [28]:
# Logistic model   
X = finalDf.drop(['chol','replicate'],axis=1)
y = (finalDf['chol'] - 15 )/15
X_train, X_test, y_train, y_test = train_test_split(X,y , 
                                random_state=104,  
                                train_size=0.8,  
                                shuffle=True) 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
log_reg = sm.Logit(y_train, X_train).fit() 

# see LLR p-value for p value
print(log_reg.summary())


yhat = log_reg.predict(X_test) 
prediction = list(map(round, yhat)) 
print(f'prediction accuracy: {np.sum(y_test == prediction)/len(y_test)}')


Optimization terminated successfully.
         Current function value: 0.443841
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                   chol   No. Observations:                 1920
Model:                          Logit   Df Residuals:                     1902
Method:                           MLE   Df Model:                           17
Date:                Thu, 18 Apr 2024   Pseudo R-squ.:                  0.3578
Time:                        18:39:18   Log-Likelihood:                -852.17
converged:                       True   LL-Null:                       -1327.0
Covariance Type:            nonrobust   LLR p-value:                5.432e-191
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -0.6078      0.072     -8.460      0.000      -0.749      -0.467
x2            -2.4429      0.