In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from data_conversion import conversion
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [2]:
pd.set_option("future.no_silent_downcasting", True)

In [3]:
geno = conversion.get_geno_data()
geno = conversion.drop_single_value_cols(geno)
geno_binary = conversion.convert_geno_to_binary(geno)
geno_ternary = conversion.convert_geno_to_ternary(geno)

In [4]:
pheno = conversion.get_pheno_data()

In [5]:
df = pd.concat([geno_ternary.astype(float), pheno], axis=1)

In [6]:
# Add intersept column to the dataframe

intercept = pd.DataFrame({'intercept': np.ones(df.shape[0])})
intercept.set_index(df.index, inplace= True)
df = pd.concat([intercept, df], axis=1)

In [8]:
# Record p-values as a dictionary with {SNP : p-value}

p_values = {}
for column in geno_ternary.columns:
    model = sm.OLS(df.NEUT,df[['intercept',column]],missing = 'drop')
    res = model.fit()
    p_values[column] = res.pvalues.iloc[1]

# Sorted p_values dict
sorted_p_values = dict(sorted(p_values.items(), key=lambda item: item[1]))

In [9]:
# SNPs with p_value < 0.05

p_columns = []
for column in geno_ternary.columns:
    if p_values[column] < 0.05:
        p_columns.append(column)
    else: continue
print("The number of SNPs with p-value less than 0.05: ",len(p_columns))

The number of SNPs with p-value less than 0.05:  656


In [45]:
# Bonferroni Correction:

number_of_snps = df.shape[1]
bonferroni_corr = 0.05 / number_of_snps 

bonferroni_columns = []
for column in geno_ternary.columns:
    if p_values[column] < bonferroni_corr:
        bonferroni_columns.append(column)
        print("Bonferroni column :", column, p_values[column])
    else: continue
print("The number of SNPs with p-value less than Bonferroni Correction: ",len(bonferroni_columns))

Bonferroni column : UNC200034826 3.892089626748788e-07
Bonferroni column : UNC200261375 2.777795587553612e-07
Bonferroni column : UNC210001613 1.150731615929429e-08
The number of SNPs with p-value less than Bonferroni Correction:  3


In [53]:
# Linear regression only with Bonferroni columns

model = sm.OLS(df.NEUT, df[['intercept']+ bonferroni_columns], missing = 'drop')
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,NEUT,R-squared:,0.046
Model:,OLS,Adj. R-squared:,0.041
Method:,Least Squares,F-statistic:,9.728
Date:,"Sat, 02 Nov 2024",Prob (F-statistic):,2.82e-06
Time:,15:49:09,Log-Likelihood:,-4540.0
No. Observations:,607,AIC:,9088.0
Df Residuals:,603,BIC:,9106.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,1082.8588,202.827,5.339,0.000,684.525,1481.192
UNC200034826,-92.4679,54.943,-1.683,0.093,-200.370,15.434
UNC200261375,32.6136,54.899,0.594,0.553,-75.202,140.429
UNC210001613,-39.7443,90.958,-0.437,0.662,-218.377,138.888

0,1,2,3
Omnibus:,125.391,Durbin-Watson:,1.736
Prob(Omnibus):,0.0,Jarque-Bera (JB):,256.393
Skew:,1.14,Prob(JB):,2.11e-56
Kurtosis:,5.223,Cond. No.,32.6


In [54]:
# Linear Regression with all snps

df_snps = df[['intercept']+ list(geno_ternary.columns)]

model = sm.OLS(df.NEUT, df_snps, missing = 'drop')
result = model.fit()
result.summary()

ValueError: zero-size array to reduction operation maximum which has no identity