In [197]:
import pandas as pd

# Load the female control ("normal") and training ("hearing_loss") cohort tables.
normal, hearing_loss = map(
    pd.read_csv,
    ("../../result/Control_Female.csv", "../../result/Training_Female.csv"),
)

In [198]:
# Keep only the samples that have SNP information, i.e. a non-null TWB2_ID.
snp_normal = normal.dropna(subset=["TWB2_ID"])
snp_hearing_loss = hearing_loss.dropna(subset=["TWB2_ID"])

# Show the (rows, columns) shape of each filtered cohort, one per line.
print(snp_normal.shape, snp_hearing_loss.shape, sep="\n")

(2446, 166)
(1066, 166)


In [199]:
# Collect the SNP IDs of the candidate SNP set that can be mapped to the TWB
# database. The ID is taken from the second tab-separated column of each line
# of the map file.
map_file = "normal.map"
snp_name_list = []
# Use a context manager so the file handle is closed even if parsing fails.
with open(map_file, "r") as map_f:
    for line in map_f:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline) carries no SNP; skip it
            # instead of failing on the missing second column.
            continue
        snp_name_list.append(line.split("\t")[1])
print(snp_name_list)

['rs111033313', 'rs80338943']


In [200]:
# Add one placeholder column (filled with -1) per candidate SNP to both dataframes.
# Position of the first inserted column: 114 for the Male files, 122 for the Female files.
column_num = 122  # Female
for rs_id in snp_name_list:
    # Same position in both frames so their column layouts stay aligned.
    for frame in (snp_normal, snp_hearing_loss):
        frame.insert(column_num, rs_id, -1)
    column_num += 1

In [201]:
# Use the TWB2_ID column as the index of both dataframes.
snp_normal, snp_hearing_loss = (
    frame.set_index("TWB2_ID") for frame in (snp_normal, snp_hearing_loss)
)

In [202]:
# Load the SNP encoding files.
# asign_snp_label looks a genotype string up in nt_biallele_code and then uses
# that result as the key into index_nt_biallele_code.
import json

# Context managers make sure both file handles are closed after parsing.
with open("snp_vocab.json", "r", encoding="utf-8") as vocab_f:
    nt_biallele_code = json.load(vocab_f)
with open("vocab_nucleotide_to_index.json", "r", encoding="utf-8") as index_f:
    index_nt_biallele_code = json.load(index_f)

In [203]:
# Convert a TWB SNP allele so that it is consistent with the encoding files.
def trans_snp_chr(word):
    """Normalize one allele string read from the TWB data.

    Rules, in order:
      * more than one character -> first character followed by "I"
        (presumably marking an insertion -- confirm against the vocabulary);
        a leading "0" is replaced by "N" first, e.g. "0G" -> "NI"
      * "-" -> "DEL"
      * "0" -> "N"
      * anything else (including the empty string) is returned unchanged

    Args:
        word: allele string.

    Returns:
        The normalized allele string.
    """
    if len(word) > 1:
        # Strings are immutable, so the replacement character is chosen here
        # rather than assigned into word[0] (which raises TypeError).
        first = "N" if word[0] == "0" else word[0]
        return first + "I"
    if word == "-":
        return "DEL"
    if word == "0":
        return "N"
    return word

In [204]:
from collections import deque

def asign_snp_label(ped_file_name, snp_df, verbose=True):
    """Fill the SNP columns of ``snp_df`` with codes derived from a PED file.

    Each line of the file is split on tabs. The first field is the sample id,
    which is matched against ``snp_df.index``. Fields from the seventh onward
    are allele calls, two consecutive fields per SNP, in the same order as the
    module-level ``snp_name_list``. A trailing unpaired field is ignored.

    For every SNP of a sample present in ``snp_df``:
      * both alleles are normalized with ``trans_snp_chr``;
      * equal raw alleles give the single normalized allele as genotype key;
      * different alleles give "<a>_<b>", ordered by the first character of
        each normalized allele;
      * the key is mapped through ``nt_biallele_code`` and then
        ``index_nt_biallele_code``, and the result is written to
        ``snp_df.at[sample_id, snp_name]``.

    Samples that are not in ``snp_df.index`` are skipped without being parsed.

    Args:
        ped_file_name: path of the tab-separated PED file.
        snp_df: DataFrame indexed by sample id that already has one column per
            name in ``snp_name_list``. It is modified in place.
        verbose: when True (the default, matching the previous behavior) print
            the allele list of every line and the key/code of every assignment.

    Returns:
        The same ``snp_df`` object, after modification.

    Raises:
        KeyError: if a genotype key is missing from the encoding tables.
        IndexError: if a line holds more allele pairs than ``snp_name_list``
            has names.
    """
    # Context manager so the PED file handle is always released.
    with open(ped_file_name, "r") as ped_f:
        for line in ped_f:
            fields = line.strip().split("\t")
            idx = fields[0]
            snp_list = fields[6:]
            if verbose:
                print("snp_list: ", snp_list)

            # Membership is a per-sample property: test it once per line
            # rather than once per SNP.
            if idx not in snp_df.index:
                continue

            # Pair up consecutive allele calls: (0, 1), (2, 3), ...
            allele_pairs = zip(snp_list[0::2], snp_list[1::2])
            for i, (chr_1, chr_2) in enumerate(allele_pairs):
                snp_name = snp_name_list[i]

                if chr_1 == chr_2:
                    t_chr = trans_snp_chr(chr_1)
                else:
                    t_chr_1 = trans_snp_chr(chr_1)
                    t_chr_2 = trans_snp_chr(chr_2)
                    # Order by first character so "A/G" and "G/A" yield the
                    # same key.
                    if ord(t_chr_1[0]) > ord(t_chr_2[0]):
                        t_chr_1, t_chr_2 = t_chr_2, t_chr_1
                    t_chr = t_chr_1 + "_" + t_chr_2

                snp_code = index_nt_biallele_code[nt_biallele_code[t_chr]]
                if verbose:
                    print("t_chr: ", t_chr)
                    print("snp: ", snp_code)
                snp_df.at[idx, snp_name] = snp_code

    return snp_df

In [205]:
# Fill the SNP columns of each cohort from its PED file
# (hearing-loss cohort first, then the controls).
snp_hearing_loss = asign_snp_label(
    ped_file_name="hearing_loss.ped",
    snp_df=snp_hearing_loss,
)
snp_normal = asign_snp_label(
    ped_file_name="normal.ped",
    snp_df=snp_normal,
)

snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
t_chr:  A
snp:  0
t_chr:  AI
snp:  5
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
t_chr:  A
snp:  0
t_chr:  AI
snp:  5
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
t_chr:  A
snp:  0
t_chr:  AI
snp:  5
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
t_chr:  A
snp:  0
t_chr:  AI
snp:  5
snp_list:  ['A', 'A', 'AG', 'AG']
snp_list:  ['A', 'A', 'AG', 'AG']
sn

In [206]:
# Write both SNP-annotated cohorts back to the result directory.
for frame, out_path in (
    (snp_hearing_loss, "../../result/Training_Female_snp.csv"),
    (snp_normal, "../../result/Control_Female_snp.csv"),
):
    frame.to_csv(out_path)