In [1]:
import numpy as np
import pandas as pd

## Remove outliers from the original phenotype file

In [2]:
outliers = "/gpfs/gibbs/pi/dewan/data/UKBiobank/results/070921_pca_genotype_array/white_expanded_07_09_21_genoarray_projected/030821_ukb42495_exomed_white_189010ind.pheno.white_expanded_07_09_21_genoarray_projected.pca.projected.outliers"

org_actual_pheno = "/gpfs/gibbs/pi/dewan/data/UKBiobank/phenotype_files/pleiotropy_R01/ukb43978_OCT2020/dc2325_phenotypes/030821_ukb42495_exomed_white_189010ind.pheno.new"
new_action_pheno = "~/hearing_pca/030821_ukb42495_exomed_white_189010ind_removed_outliers.pheno.new"

In [3]:
out = pd.read_csv(outliers, delimiter="\t")
out

Unnamed: 0,1008606,1008606.1
0,1010412,1010412
1,1045757,1045757
2,1057699,1057699
3,1069457,1069457
4,1071240,1071240
...,...,...
562,4773865,4773865
563,5109700,5109700
564,5637210,5637210
565,5748329,5748329


In [4]:
def dont_keep_in_list(row, li):
    if row["FID"] in li:
        return row.index[0]
    else:
        return -1

In [5]:
phen = pd.read_csv(org_actual_pheno, delimiter="\t")

print("original pheno shape", phen.shape)

not_outlier = lambda row: dont_keep_in_list(row, out["1008606"].to_list())
i = phen.apply(not_outlier, axis=1)
all_pheno_no_outlier = phen.drop(i[i != -1].index)

print("new pheno shape", all_pheno_no_outlier.shape)

original pheno shape (189010, 4)
new pheno shape (188443, 4)


In [6]:
all_pheno_no_outlier.to_csv(new_action_pheno, sep="\t", index=False)

In [7]:
all_pheno_no_outlier

Unnamed: 0,FID,IID,pop,super_pop
0,1000019,1000019,British,extended_white
1,1000035,1000035,British,extended_white
2,1000078,1000078,British,extended_white
3,1000081,1000081,British,extended_white
4,1000198,1000198,British,extended_white
...,...,...,...,...
189005,6025295,6025295,British,extended_white
189006,6025319,6025319,British,extended_white
189007,6025346,6025346,British,extended_white
189008,6025363,6025363,British,extended_white


## Obtain the pca phenotype info for each of the four hearing aid phenos

### f3393

In [8]:
org_f3393_pheno = "/gpfs/gibbs/pi/dewan/data/UKBiobank/phenotype_files/hearing_impairment/phenotypes_exome_data/010421_UKBB_Hearing_aid_f3393_128254ind_exomes"
new_f3393_pheno = "~/hearing_pca/010421_UKBB_Hearing_aid_f3393_128254ind_exomes.pheno"

In [9]:
f3393 = pd.read_csv(org_f3393_pheno, delimiter=" ")

print("original all_pheno_no_outlier shape", all_pheno_no_outlier.shape)
print("f3393 shape", f3393.shape)

diff = set(all_pheno_no_outlier["FID"].to_list()).difference(f3393["FID"].to_list())
not_outlier = lambda row: dont_keep_in_list(row, diff)

i = all_pheno_no_outlier.apply(not_outlier, axis=1)
f3393_pca = all_pheno_no_outlier.drop(i[i != -1].index)

print("f3393 pca shape", f3393_pca.shape)

original all_pheno_no_outlier shape (188443, 4)
f3393 shape (128254, 5)
f3393 pca shape (128137, 4)


In [10]:
f3393_pca.to_csv(new_f3393_pheno, sep="\t", index=False)

### f2247

In [11]:
org_f2247_pheno = "/gpfs/gibbs/pi/dewan/data/UKBiobank/phenotype_files/hearing_impairment/phenotypes_exome_data/010421_UKBB_Hearing_difficulty_f2247_171970ind_exomes"
new_f2247_pheno = "~/hearing_pca/010421_UKBB_Hearing_difficulty_f2247_171970ind_exomes.pheno"

In [12]:
f2247 = pd.read_csv(org_f2247_pheno, delimiter=" ")

print("original all_pheno_no_outlier shape", all_pheno_no_outlier.shape)
print("f2247 shape", f2247.shape)

diff = set(all_pheno_no_outlier["FID"].to_list()).difference(f2247["FID"].to_list())
not_outlier = lambda row: dont_keep_in_list(row, diff)

i = all_pheno_no_outlier.apply(not_outlier, axis=1)
f2247_pca = all_pheno_no_outlier.drop(i[i != -1].index)

print("f2247 pca shape", f2247_pca.shape)

original all_pheno_no_outlier shape (188443, 4)
f2247 shape (171970, 5)
f2247 pca shape (171821, 4)


In [13]:
f2247_pca.to_csv(new_f2247_pheno, sep="\t", index=False)

### f2257

In [14]:
org_f2257_pheno = "/gpfs/gibbs/pi/dewan/data/UKBiobank/phenotype_files/hearing_impairment/phenotypes_exome_data/010421_UKBB_Hearing_background_noise_f2257_175531ind_exomes"
new_f2257_pheno = "~/hearing_pca/010421_UKBB_Hearing_background_noise_f2257_175531ind_exomes.pheno"

In [15]:
f2257 = pd.read_csv(org_f2257_pheno, delimiter=" ")

print("original all_pheno_no_outlier shape", all_pheno_no_outlier.shape)
print("f2257 shape", f2257.shape)

diff = set(all_pheno_no_outlier["FID"].to_list()).difference(f2257["FID"].to_list())
not_outlier = lambda row: dont_keep_in_list(row, diff)

i = all_pheno_no_outlier.apply(not_outlier, axis=1)
f2257_pca = all_pheno_no_outlier.drop(i[i != -1].index)

print("f2257 pca shape", f2257_pca.shape)

original all_pheno_no_outlier shape (188443, 4)
f2257 shape (175531, 5)
f2257 pca shape (175381, 4)


In [16]:
f2257_pca.to_csv(new_f2257_pheno, sep="\t", index=False)

### f2247 & f2257

In [17]:
org_f2247_f2257_pheno = "/gpfs/gibbs/pi/dewan/data/UKBiobank/phenotype_files/hearing_impairment/phenotypes_exome_data/010421_UKBB_f2247_f2257_136862ind_exomes"
new_f2247_f2257_pheno = "~/hearing_pca/010421_UKBB_f2247_f2257_136862ind_exomes.pheno"

In [18]:
f2247_f2257 = pd.read_csv(org_f2247_f2257_pheno, delimiter=" ")

print("original all_pheno_no_outlier shape", all_pheno_no_outlier.shape)
print("f2247_f2257 shape", f2247_f2257.shape)

diff = set(all_pheno_no_outlier["FID"].to_list()).difference(f2247_f2257["FID"].to_list())
not_outlier = lambda row: dont_keep_in_list(row, diff)

i = all_pheno_no_outlier.apply(not_outlier, axis=1)
f2247_f2257_pca = all_pheno_no_outlier.drop(i[i != -1].index)

print("f2247_f2257 pca shape", f2247_f2257_pca.shape)

original all_pheno_no_outlier shape (188443, 4)
f2247_f2257 shape (136862, 5)
f2247_f2257 pca shape (136747, 4)


In [19]:
f2247_f2257_pca.to_csv(new_f2247_f2257_pheno, sep="\t", index=False)