In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from data_conversion import conversion
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [2]:
pd.set_option("future.no_silent_downcasting", True)

In [111]:
geno = conversion.get_geno_data()
geno = conversion.drop_single_value_cols(geno)
geno_binary = conversion.convert_geno_to_binary(geno)
geno_ternary = conversion.convert_geno_to_ternary(geno)
pheno = conversion.get_pheno_data()
geno_ternary_filled = conversion.fill_nan_with_distribution(geno_ternary.astype(float))
df = pd.concat([geno_ternary_filled, pheno], axis=1)

In [112]:
# Convert Sex to binary: F = 1 and M = 0

df['Sex'] = df['Sex'] == 'F'
df['Sex'] = df['Sex'].astype(float)

# Create separate dataframes for female and male

df_female = df[df['Sex'] == 1]
df_male = df[df['Sex'] == 0]

# Drop single value columns

df_female=conversion.drop_single_value_cols(df_female)
df_male = conversion.drop_single_value_cols(df_male)

In [113]:
# Add intersept column to the dataframe

intercept = pd.DataFrame({'intercept': np.ones(df_female.shape[0])})
intercept.set_index(df_female.index, inplace= True)
df_female = pd.concat([intercept, df_female], axis=1)

intercept = pd.DataFrame({'intercept': np.ones(df_male.shape[0])})
intercept.set_index(df_male.index, inplace= True)
df_male = pd.concat([intercept, df_male], axis=1)

In [114]:
# Get female/male SNPs

snp_female = df_female.columns[1:-3]
snp_male = df_male.columns[1:-3]

In [115]:
# Record p-values for df_female as a dictionary with {SNP : p-value}

p_values_female = {}
for column in snp_female:
    model = sm.OLS(df_female.NEUT,df_female[['intercept',column]],missing = 'drop')
    res = model.fit()
    p_values_female[column] = res.pvalues.iloc[1]

# Sorted p_values dict
sorted_p_values_female = dict(sorted(p_values_female.items(), key=lambda item: item[1]))

In [116]:
# Record p-values for df_male as a dictionary with {SNP : p-value}

p_values_male = {}
for column in snp_male:
    model = sm.OLS(df_male.NEUT,df_male[['intercept',column]],missing = 'drop')
    res = model.fit()
    p_values_male[column] = res.pvalues.iloc[1]

# Sorted p_values dict
sorted_p_values_male = dict(sorted(p_values_male.items(), key=lambda item: item[1]))

In [117]:
# Female SNPs with p_value < 0.05

p_columns_female = []
for column in snp_female:
    if p_values_female[column] < 0.05:
        p_columns_female.append(column)
    else: continue
print("The number of SNPs for female with p-value less than 0.05: ",len(p_columns_female))

The number of SNPs for female with p-value less than 0.05:  496


In [119]:
# Male SNPs with p_value < 0.05

p_columns_male = []
for column in snp_male:
    if p_values_male[column] < 0.05:
        p_columns_male.append(column)
    else: continue
print("The number of SNPs for male with p-value less than 0.05: ",len(p_columns_male))

The number of SNPs for male with p-value less than 0.05:  610


In [123]:
# Female Bonferroni Correction:

number_of_snps_female = len(snp_female)
bonferroni_corr_female = 0.05 / number_of_snps_female 

bonferroni_columns_female = []
for column in snp_female:
    if p_values_female[column] < bonferroni_corr_female:
        bonferroni_columns_female.append(column)
        print("Bonferroni column :", column, p_values_female[column])
    else: continue
print("The number of SNPs for female with p-value less than Bonferroni Correction: ",len(bonferroni_columns_female))

Bonferroni column : UNC070327084 5.838825148561876e-07
The number of SNPs for female with p-value less than Bonferroni Correction:  1


In [124]:
# Male bonferroni columns

number_of_snps_male = len(snp_male)
bonferroni_corr_male = 0.05 / number_of_snps_male 

bonferroni_columns_male = []
for column in snp_male:
    if p_values_male[column] < bonferroni_corr_male:
        bonferroni_columns_male.append(column)
        print("Bonferroni column :", column, p_values_male[column])
    else: continue
print("The number of SNPs for male with p-value less than Bonferroni Correction: ",len(bonferroni_columns_male))

The number of SNPs for male with p-value less than Bonferroni Correction:  0


In [125]:
# Linear regression only with Bonferroni columns

model = sm.OLS(df_female.NEUT, df_female[['intercept']+ bonferroni_columns_female])
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,NEUT,R-squared:,0.059
Model:,OLS,Adj. R-squared:,0.057
Method:,Least Squares,F-statistic:,25.78
Date:,"Mon, 11 Nov 2024",Prob (F-statistic):,5.84e-07
Time:,11:43:25,Log-Likelihood:,-3047.0
No. Observations:,410,AIC:,6098.0
Df Residuals:,408,BIC:,6106.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,4997.5320,819.495,6.098,0.000,3386.573,6608.491
UNC070327084,-2082.2240,410.123,-5.077,0.000,-2888.441,-1276.007

0,1,2,3
Omnibus:,98.661,Durbin-Watson:,1.543
Prob(Omnibus):,0.0,Jarque-Bera (JB):,200.597
Skew:,1.281,Prob(JB):,2.76e-44
Kurtosis:,5.275,Cond. No.,101.0


In [129]:
# Linear regression only with columns of 50 smallest p-values

model = sm.OLS(df_female.NEUT, df_female[['intercept']+ list(sorted_p_values_female.keys())[0:5]], missing = 'drop')
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,NEUT,R-squared:,0.203
Model:,OLS,Adj. R-squared:,0.193
Method:,Least Squares,F-statistic:,20.53
Date:,"Mon, 11 Nov 2024",Prob (F-statistic):,2.81e-18
Time:,11:44:40,Log-Likelihood:,-3013.2
No. Observations:,410,AIC:,6038.0
Df Residuals:,404,BIC:,6062.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,1.161e+04,1318.458,8.807,0.000,9020.404,1.42e+04
UNC070327084,-1796.4793,385.162,-4.664,0.000,-2553.652,-1039.307
UNC050251706,-1860.9649,379.918,-4.898,0.000,-2607.829,-1114.101
JAX00671748,-137.7947,34.792,-3.960,0.000,-206.191,-69.398
JAX00291370,-137.6348,37.621,-3.658,0.000,-211.593,-63.677
UNC040198686,-1589.2909,379.918,-4.183,0.000,-2336.155,-842.427

0,1,2,3
Omnibus:,75.185,Durbin-Watson:,1.568
Prob(Omnibus):,0.0,Jarque-Bera (JB):,122.5
Skew:,1.102,Prob(JB):,2.5100000000000004e-27
Kurtosis:,4.521,Cond. No.,295.0


In [130]:
# Linear regression only with columns of 5 smallest p-values

model = sm.OLS(df_male.NEUT, df_male[['intercept']+ list(sorted_p_values_male.keys())[0:5]], missing = 'drop')
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,NEUT,R-squared:,0.196
Model:,OLS,Adj. R-squared:,0.183
Method:,Least Squares,F-statistic:,15.87
Date:,"Mon, 11 Nov 2024",Prob (F-statistic):,5.44e-14
Time:,11:45:07,Log-Likelihood:,-2445.0
No. Observations:,332,AIC:,4902.0
Df Residuals:,326,BIC:,4925.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,684.5513,83.325,8.215,0.000,520.629,848.474
UNC061716948,-125.2310,30.653,-4.085,0.000,-185.533,-64.929
UNC010460280,119.5083,33.109,3.610,0.000,54.374,184.643
UNC060281055,111.4153,32.346,3.445,0.001,47.783,175.048
UNC070007179,115.5622,30.084,3.841,0.000,56.379,174.746
JAX00430230,104.4531,30.332,3.444,0.001,44.783,164.124

0,1,2,3
Omnibus:,9.395,Durbin-Watson:,1.759
Prob(Omnibus):,0.009,Jarque-Bera (JB):,9.557
Skew:,0.414,Prob(JB):,0.00841
Kurtosis:,3.067,Cond. No.,11.9


In [171]:
# Train test splits for both dataframes

df_female_train, df_female_test = train_test_split(df_female, test_size=0.3, random_state=402)
df_male_train, df_male_test = train_test_split(df_male, test_size=0.3, random_state=402)

In [172]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [174]:
for i in range(0,55,5):
    linreg = LinearRegression()
    linreg.fit(df_female_train[['intercept']+ list(sorted_p_values_female.keys())[0:i]], y=df_female_train['NEUT'])
    linear_coefs = pd.Series(linreg.coef_, index=['intercept']+ list(sorted_p_values_female.keys())[0:i])
    y_pred=linreg.predict(df_female_test[['intercept']+ list(sorted_p_values_female.keys())[0:i]])
    y_true = df_female_test.NEUT
    print("The MSE for the linreg model with the first ",i," p_columns: ",mean_squared_error(y_true,y_pred))

The MSE for the linreg model with the first  0  p_columns:  125539.98881839996
The MSE for the linreg model with the first  5  p_columns:  114985.72966333307
The MSE for the linreg model with the first  10  p_columns:  99528.8617663809
The MSE for the linreg model with the first  15  p_columns:  99448.53281369634
The MSE for the linreg model with the first  20  p_columns:  115369.4560494862
The MSE for the linreg model with the first  25  p_columns:  121774.96383787104
The MSE for the linreg model with the first  30  p_columns:  119256.0451046281
The MSE for the linreg model with the first  35  p_columns:  117825.46921489721
The MSE for the linreg model with the first  40  p_columns:  120131.20723913706
The MSE for the linreg model with the first  45  p_columns:  114909.13501943572
The MSE for the linreg model with the first  50  p_columns:  121580.97421446581


In [175]:
for i in range(0,55,5):
    linreg = LinearRegression()
    linreg.fit(df_male_train[['intercept']+ list(sorted_p_values_male.keys())[0:i]], y=df_male_train['NEUT'])
    linear_coefs = pd.Series(linreg.coef_, index=['intercept']+ list(sorted_p_values_male.keys())[0:i])
    y_pred=linreg.predict(df_male_test[['intercept']+ list(sorted_p_values_male.keys())[0:i]])
    y_true = df_male_test.NEUT
    print("Male : MSE for the linreg model with the first ",i," p_columns: ",mean_squared_error(y_true,y_pred))

Male : MSE for the linreg model with the first  0  p_columns:  193813.67395470914
Male : MSE for the linreg model with the first  5  p_columns:  174701.64713550135
Male : MSE for the linreg model with the first  10  p_columns:  167563.12663585207
Male : MSE for the linreg model with the first  15  p_columns:  156727.59321555748
Male : MSE for the linreg model with the first  20  p_columns:  147025.406822117
Male : MSE for the linreg model with the first  25  p_columns:  151599.36995994175
Male : MSE for the linreg model with the first  30  p_columns:  146816.21396797206
Male : MSE for the linreg model with the first  35  p_columns:  171130.79424857296
Male : MSE for the linreg model with the first  40  p_columns:  164534.58882406977
Male : MSE for the linreg model with the first  45  p_columns:  161368.75290072543
Male : MSE for the linreg model with the first  50  p_columns:  156217.58431183282
