In [26]:
#! /usr/bin/python3

# Notebook metadata. The author entry uses the "Name <email>" form, so the
# address must be closed with '>'.
__author__ = "Jun Hu <jh3846@columbia.edu>"
__date__ = "Apr 10, 2018"

import logging

# Console logger named 'etl'; both the logger and its handler pass DEBUG and up.
logger = logging.getLogger('etl')
logger.setLevel(logging.DEBUG)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

formatter_c = logging.Formatter('[%(asctime)s]-[%(process)d]-[%(thread)d]-[%(name)s]-[%(lineno)s]-[%(levelname)s]: %(message)s')
ch.setFormatter(formatter_c)

# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack one more StreamHandler each time and print every
# message repeatedly. Drop plain StreamHandlers left from earlier runs first;
# the exact-type check leaves subclasses (e.g. file handlers) untouched.
for stale_handler in [h for h in logger.handlers if type(h) is logging.StreamHandler]:
    logger.removeHandler(stale_handler)
logger.addHandler(ch)

import json
import pandas as pd
import time

In [27]:
# Parse the project JSON into plain Python objects.
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# platform locale, which can mis-decode the file on some systems.
with open("bcr_project_data.json", encoding="utf-8") as f:
    data_origin = json.load(f)

In [28]:
# Build a DataFrame from the parsed JSON object.
df = pd.DataFrame(data_origin)

In [29]:
# Preview the first 10 rows of the freshly loaded frame.
df.head(10)

Unnamed: 0,BCR,EVENT,OS,age,cancer
0,"[[DYWGQG, NA], [RSDDTAVYFC, NA], [GDSPPFFDYWGQ...",living,389,31.0,THCA
1,"[[YYCARGVV, NA], [YYCARVGYYYDSSDYP, NA], [FNHW...",living,482,74.0,LUSC
2,"[[REHSSSSWYFDYWGQG, NA], [YYCARHEDGQSK, NA], [...",living,3388,70.0,OV
3,"[[DYYFDYWGQG, NA], [YYCAREPNGP, NA], [AYYYYGMD...",living,1419,82.0,COAD
4,"[[YSSGSTLDYWGQG, IGHG3], [YYYGMDVWGQG, NA], [E...",living,726,55.0,THCA
5,"[[YYCAREEIYCSGGRCYSLAVTRGAFDIWGQG, IGHG2], [YY...",death,565,60.0,BLCA
6,"[[YYCARDRHSSR, NA], [NYYFDFWGQG, NA], [WIDPWGQ...",living,400,54.0,HNSC
7,"[[YYCARGGGYW, IGHA1], [SWSGRFDNWGQG, IGHA1], [...",death,1516,,LUAD
8,"[[YYCARDLNYYGLGHWGQG, NA], [YYCARQNNYGS, NA], ...",living,3708,30.0,SKCM
9,"[[RSPLPSRSSTVWGRG, NA], [YYCAKDKQWLVS, NA], [Y...",living,38,61.0,THCA


In [30]:
# Split each row's BCR list of [cdr3aa, segment] pairs into two list columns.
# A pair whose first element stringifies to "nan" is dropped as a whole, so
# cdr3aa[i] and segmentV[i] always come from the same pair and the two lists
# keep equal length.
bcr_pairs = df.BCR.map(
    lambda pairs: [pair for pair in pairs if str(pair[0]) != "nan"]
)
# assign() returns a new frame and overwrites same-named columns, so re-running
# this cell does not produce suffixed duplicate columns the way an index merge
# would.
df = df.assign(
    cdr3aa=bcr_pairs.map(lambda pairs: [str(pair[0]) for pair in pairs]),
    segmentV=bcr_pairs.map(lambda pairs: [pair[1] for pair in pairs]),
)
df.head()

Unnamed: 0,BCR,EVENT,OS,age,cancer,cdr3aa,segmentV
0,"[[DYWGQG, NA], [RSDDTAVYFC, NA], [GDSPPFFDYWGQ...",living,389,31.0,THCA,"[DYWGQG, RSDDTAVYFC, GDSPPFFDYWGQG, RSDDTAVYFC...","[NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N..."
1,"[[YYCARGVV, NA], [YYCARVGYYYDSSDYP, NA], [FNHW...",living,482,74.0,LUSC,"[YYCARGVV, YYCARVGYYYDSSDYP, FNHWGQG, LDHWGQG,...","[NA, NA, IGHA1, IGHA1, IGHA1, IGHA1, IGHA1, IG..."
2,"[[REHSSSSWYFDYWGQG, NA], [YYCARHEDGQSK, NA], [...",living,3388,70.0,OV,"[REHSSSSWYFDYWGQG, YYCARHEDGQSK, KNYYFFDYWGQG,...","[NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N..."
3,"[[DYYFDYWGQG, NA], [YYCAREPNGP, NA], [AYYYYGMD...",living,1419,82.0,COAD,"[DYYFDYWGQG, YYCAREPNGP, AYYYYGMDVWGQG, DTAVYY...","[NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N..."
4,"[[YSSGSTLDYWGQG, IGHG3], [YYYGMDVWGQG, NA], [E...",living,726,55.0,THCA,"[YSSGSTLDYWGQG, YYYGMDVWGQG, ETNYYGMDVWGQG, YY...","[IGHG3, NA, NA, IGHA1, IGHA1, IGHG3, IGHA1, IG..."


In [31]:
# Keep only the cdr3aa lists and the cancer column.
# The explicit copy makes df an independent frame; without it, later
# `df[col] = ...` assignments write to a slice of the wider frame and pandas
# may emit SettingWithCopyWarning.
df = df[["cdr3aa", "cancer"]].copy()
df.head()

Unnamed: 0,cdr3aa,cancer
0,"[DYWGQG, RSDDTAVYFC, GDSPPFFDYWGQG, RSDDTAVYFC...",THCA
1,"[YYCARGVV, YYCARVGYYYDSSDYP, FNHWGQG, LDHWGQG,...",LUSC
2,"[REHSSSSWYFDYWGQG, YYCARHEDGQSK, KNYYFFDYWGQG,...",OV
3,"[DYYFDYWGQG, YYCAREPNGP, AYYYYGMDVWGQG, DTAVYY...",COAD
4,"[YSSGSTLDYWGQG, YYYGMDVWGQG, ETNYYGMDVWGQG, YY...",THCA


In [32]:
def string2ngram(s, n, skip=1):
    """Return the sliding windows of ``s`` as a single space-terminated string.

    A window of width ``n`` starts at every position where it fits entirely
    inside ``s`` and is sampled with stride ``skip`` (``s[i:i+n:skip]``), so
    for example ``n=5, skip=2`` produces 3-character grams. Every gram,
    including the last one, is followed by one space. The result is ``''``
    when ``s`` is shorter than ``n``.
    """
    window_starts = range(len(s) - n + 1)
    pieces = [s[start:start + n:skip] + ' ' for start in window_starts]
    return ''.join(pieces)


def list2ngram(l, n, skip=1):
    """Concatenate the n-gram strings of every item in ``l``.

    Each item is converted with ``str`` and passed to ``string2ngram`` with the
    same ``n`` and ``skip``; the per-item results (each already terminated by a
    space) are joined back to back. Returns ``''`` for an empty ``l``.
    """
    # string2ngram returns one string per item; join those whole strings
    # instead of re-appending them one character at a time.
    return ''.join(string2ngram(str(s), n, skip) for s in l)


# One-letter amino-acid codes, grouped into named residue classes.
Aliphatic = ['A', 'I', 'L', 'V']
Sulfur = ['C', 'M']
Hydroxyl = ['S', 'T']
Acidic = ['D', 'E']
Amide = ['N', 'Q']
Basic = ['R', 'H', 'K']

# pc_dict maps each listed residue letter to its class label '1'..'6', in the
# order the groups appear above. Letters not listed in any group get no entry.
pc_dict = {}
for class_no, members in enumerate(
        (Aliphatic, Sulfur, Hydroxyl, Acidic, Amide, Basic), start=1):
    pc_dict.update(dict.fromkeys(members, str(class_no)))


def aa_class(s):
    """Return ``s`` with every key of ``pc_dict`` replaced by its class label.

    Replacements are applied one key at a time, in ``pc_dict`` order.
    Characters that are not keys of ``pc_dict`` (including spaces) are left
    unchanged.
    """
    encoded = s
    for residue, label in pc_dict.items():
        encoded = encoded.replace(residue, label)
    return encoded


In [None]:
start_time = time.time()
# (output column, window width n, stride skip) for list2ngram.
# The skip variants all yield 3-character grams: width 5 / stride 2,
# width 7 / stride 3 and width 9 / stride 4. The stride-4 variant gets its own
# column, trigram_skip4, so it no longer overwrites trigram_skip3.
ngram_specs = [
    ("unigram", 1, 1),
    ("bigram", 2, 1),
    ("trigram", 3, 1),
    ("four_gram", 4, 1),
    ("five_gram", 5, 1),
    ("six_gram", 6, 1),
    ("trigram_skip2", 5, 2),
    ("trigram_skip3", 7, 3),
    ("trigram_skip4", 9, 4),
]
for ngram_col, ngram_n, ngram_skip in ngram_specs:
    # Bind the loop values as lambda defaults so each column uses its own spec.
    df[ngram_col] = df.cdr3aa.map(
        lambda x, n=ngram_n, skip=ngram_skip: list2ngram(x, n, skip)
    )
# Lazy %-formatting: the logger builds the same message text itself.
logger.debug("--- %s seconds ---", time.time() - start_time)

In [None]:
# Drop the raw cdr3aa list column; the derived n-gram string columns remain.
df = df.drop(columns=["cdr3aa"])

In [None]:
start_time = time.time()
# (source n-gram column, class-encoded output column) for aa_class.
cls_columns = [
    ("unigram", "unigram_cls"),
    ("bigram", "bigram_cls"),
    ("trigram", "trigram_cls"),
    ("four_gram", "four_gram_cls"),
    ("five_gram", "five_gram_cls"),
    ("six_gram", "six_gram_cls"),
    ("trigram_skip2", "trigram_cls_skip2"),
    ("trigram_skip3", "trigram_cls_skip3"),
    ("trigram_skip4", "trigram_cls_skip4"),
]
for src_col, cls_col in cls_columns:
    # trigram_skip4 is optional: it is encoded when the frame has that column
    # and skipped otherwise. Every other source column is required, and a
    # missing one raises as usual.
    if src_col == "trigram_skip4" and src_col not in df.columns:
        continue
    df[cls_col] = df[src_col].map(aa_class)
# Lazy %-formatting: the logger builds the same message text itself.
logger.debug("--- %s seconds ---", time.time() - start_time)

In [None]:
# Preview the frame after the class-encoded columns were added.
df.head()

In [None]:
# start_time = time.time()
# df.to_csv("processed_data.bz2", compression='bz2')
# logger.debug("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# start_time = time.time()
# df = pd.read_csv("processed_data.bz2", compression='bz2')
# logger.debug("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# NOTE: plain assignment. df_write is a second name for the same DataFrame
# object as df, not a copy, so in-place changes made through either name are
# visible through both.
df_write = df