In [1]:
import logging, pandas as pd, numpy as np
from pathlib import Path
from fastlmm.association import single_snp
from pysnptools.snpreader import Bed, Pheno
from fastlmm.inference import FastLMM
from fastlmm.util import compute_auto_pcs
from sklearn.model_selection import KFold

In [2]:
def configLogger(_path):
    """Configure the root logger to write INFO records to a file and a stream.

    The function is safe to call repeatedly (e.g. when a notebook cell is
    re-run): handlers installed by an earlier call are removed and closed
    before new ones are attached, so each record is emitted once per
    destination and the previous log file handle is not leaked. Handlers
    attached to the root logger by other code are left untouched.

    Parameters
    ----------
    _path : str or path-like
        Location of the log file; passed straight to ``logging.FileHandler``.

    Returns
    -------
    logging.Logger
        The root logger, with its level set to ``logging.INFO``.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Remove only the handlers this function added on a previous call.
    # Iterate over a copy because removeHandler mutates logger.handlers.
    for stale in list(logger.handlers):
        if getattr(stale, '_configLogger_owned', False):
            logger.removeHandler(stale)
            stale.close()

    formatter = logging.Formatter('%(asctime)s:\n%(message)s\n')

    fh = logging.FileHandler(_path)
    sh = logging.StreamHandler()
    for handler in (fh, sh):
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        # Marker used above to recognise our own handlers on the next call.
        handler._configLogger_owned = True
        logger.addHandler(handler)

    return logger

In [6]:
# Folder that holds the PLINK inputs and receives the log file.
# Alternative location: '/home/j/jparkin/xescape/scratch/plasmo/fastlmm'
working_dir = '/d/data/plasmo/fastlmm'

# Genotypes (.bed); swap in 'snp_small.bed' for the smaller dataset.
bed_fn = '/'.join((working_dir, 'snps_v3.bed'))
# Phenotype table and run log live next to the genotype file.
phe_fn = '/'.join((working_dir, 'plink_meta.tsv'))
log_fn = '/'.join((working_dir, 'run_fastlmm.log'))

logger = configLogger(log_fn)
logger.info('starting file load')

# Readers for the genotype and phenotype files.
snp_data = Bed(bed_fn)
pheno = Pheno(phe_fn)

INFO:root:starting file load
2020-03-13 14:19:53,787:
starting file load

2020-03-13 14:19:53,787:
starting file load



In [7]:
#SPLITS
def to_df(snp_data):
    """Turn an in-memory reader result into a one-column DataFrame.

    The index (named ``genes``) is built from the first field of every
    ``snp_data.iid`` entry, and the ``vals`` column is ``snp_data.val``
    flattened to one dimension.

    NOTE(review): the labels come from ``iid``, not from SNP/gene ids, and
    only the first field is used -- confirm that field uniquely identifies
    a row, since callers join frames on this index.
    """
    labels = [entry[0] for entry in snp_data.iid]
    flat_vals = snp_data.val.reshape(-1)
    frame = pd.DataFrame({'genes': labels, 'vals': flat_vals})
    return frame.set_index('genes')
def apply_fn(row):
    """Return the absolute gap between a row's ``vals`` and ``vals_res``."""
    residual = row.vals - row.vals_res
    return np.abs(residual)

# K-fold cross-validation of FastLMM phenotype prediction.
# For every fold the model is fitted on the training rows and its mean
# absolute error (MAE) on the held-out rows is compared with a constant
# "control" predictor.
n_folds = 10
# Fixed seed so the shuffled fold assignment -- and therefore the reported
# errors -- are reproducible across kernel restarts.
cv_seed = 0
kf = KFold(n_splits=n_folds, shuffle=True, random_state=cv_seed)
errors = []
for c, (train_index, test_index) in enumerate(kf.split(range(snp_data.iid_count))):
    logger.info('iteration {0}'.format(c))

    # The same row positions are applied to both readers.
    # NOTE(review): this assumes the bed and phenotype files list
    # individuals in the same order -- confirm.
    train_data = snp_data[train_index, :]
    train_meta = pheno[train_index, :]
    test_data = snp_data[test_index, :]
    test_meta = pheno[test_index, :]

    model = FastLMM(GB_goal=2)
    model.fit(K0_train=train_data, y=train_meta)
    # Only the first element of the prediction result is used below
    # (presumably the predicted means -- verify against FastLMM.predict).
    res = model.predict(K0_whole_test=test_data)

    real_df = to_df(test_meta.read())
    res_df = to_df(res[0])
    # Inner join keeps only labels present in both observed and predicted.
    combined = real_df.join(res_df, how='inner', rsuffix='_res')
    mae = (combined['vals'] - combined['vals_res']).abs().mean()

    # Control: predict one constant -- the mean observed value of this
    # fold's test rows -- for every row. Computed directly instead of
    # overwriting a column of a non-copied frame, so `combined` keeps the
    # real predictions.
    # NOTE(review): the constant is derived from held-out labels; a
    # training-set mean would be a stricter baseline.
    control_pred = real_df['vals'].mean()
    control_mae = (combined['vals'] - control_pred).abs().mean()
    error_ratio = mae / control_mae

    logger.info('MAE is: {0}, Control MAE is: {1}, Ratio: {2}'.format(mae, control_mae, error_ratio))
    errors.append({'error': mae, 'expected': control_mae, 'ratio': error_ratio})

# One row per fold: model MAE, control MAE and their ratio.
pd.DataFrame(errors).to_csv(working_dir + '/fastlmm_errors.tsv', sep='\t', index=False)
    

INFO:root:Loading fam file /d/data/plasmo/fastlmm/snps_v3.fam
2020-03-13 14:19:54,338:
Loading fam file /d/data/plasmo/fastlmm/snps_v3.fam

2020-03-13 14:19:54,338:
Loading fam file /d/data/plasmo/fastlmm/snps_v3.fam

INFO:root:iteration 0
2020-03-13 14:19:54,390:
iteration 0

2020-03-13 14:19:54,390:
iteration 0

INFO:root:Loading bim file /d/data/plasmo/fastlmm/snps_v3.bim
2020-03-13 14:19:54,505:
Loading bim file /d/data/plasmo/fastlmm/snps_v3.bim

2020-03-13 14:19:54,505:
Loading bim file /d/data/plasmo/fastlmm/snps_v3.bim

INFO:root:bed file is open /d/data/plasmo/fastlmm/snps_v3.bed
2020-03-13 14:19:54,681:
bed file is open /d/data/plasmo/fastlmm/snps_v3.bed

2020-03-13 14:19:54,681:
bed file is open /d/data/plasmo/fastlmm/snps_v3.bed

INFO:root:MAE is: 1.4768262787137911, Control MAE is: 1.3080137772675082, Ratio: 1.1290601860470757
2020-03-13 14:19:55,985:
MAE is: 1.4768262787137911, Control MAE is: 1.3080137772675082, Ratio: 1.1290601860470757

2020-03-13 14:19:55,985:
MAE is:

TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [4]:

# #NO SPLIT
# def to_df(snp_data):
#     genes = [x[0] for x in snp_data.iid]
#     vals = snp_data.val.reshape((-1,))
#     return pd.DataFrame({'genes': genes,
#                          'vals' : vals}).set_index('genes')

# def apply_fn(row):
#     return np.abs(row.vals - row.vals_res)


# logger.info('making model')
# model = FastLMM(GB_goal=2)
# model.fit(K0_train=snp_data, y=pheno)

# logger.info('predicting')
# res = model.predict(K0_whole_test=snp_data)

# logger.info('calculating MAE')
# real_df = to_df(pheno.read())
# res_df = to_df(res[0])
# combined = real_df.join(res_df, how='inner', rsuffix='_res')
# mae = np.mean(combined.apply(apply_fn, axis=1), axis=None)
# control_df = pd.DataFrame(combined)
# control_df.loc[:, 'vals_res'] = np.mean(combined.loc[:, 'vals'])
# control_mae = np.mean(control_df.apply(apply_fn, axis=1), axis=None)
# logger.info('MAE is: {0}, Control MAE is: {1}, Ratio: {2}'.format(mae, control_mae, mae / control_mae))

INFO:root:making model
2020-02-12 22:47:39,182:
making model

INFO:root:Loading fam file /d/data/plasmo/fastlmm/snp_small.fam
2020-02-12 22:47:39,191:
Loading fam file /d/data/plasmo/fastlmm/snp_small.fam

INFO:root:Loading bim file /d/data/plasmo/fastlmm/snp_small.bim
2020-02-12 22:47:39,217:
Loading bim file /d/data/plasmo/fastlmm/snp_small.bim

INFO:root:bed file is open /d/data/plasmo/fastlmm/snp_small.bed
2020-02-12 22:47:39,230:
bed file is open /d/data/plasmo/fastlmm/snp_small.bed

INFO:root:predicting
2020-02-12 22:47:39,369:
predicting



In [8]:

# Write the per-fold metrics collected in `errors` to a tab-separated file.
pd.DataFrame(errors).to_csv(
    working_dir + '/fastlmm_errors.tsv',
    index=False,
    sep='\t',
)


INFO:root:MAE is: 0.8884156287816813, Control MAE is: 1.8413304395896892, Ratio: 0.4824857123307267
2020-02-12 22:47:39,957:
MAE is: 0.8884156287816813, Control MAE is: 1.8413304395896892, Ratio: 0.4824857123307267

