# 特徴量作成

In [1]:
#practice

In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
# Mount Google Drive into the Colab runtime so the competition files can be read.
from google.colab import drive
drive.mount('/content/drive')

# Root folder prepended to every path in this notebook. It is a relative path,
# so it is resolved against the current working directory.
data_dir = 'drive/My Drive'

Mounted at /content/drive


## Data読み込み

In [None]:
# Load the competition data. train/test are JSON-lines files (one record per
# line, hence lines=True); the sample submission is a plain CSV.
train = pd.read_json(data_dir + '/kaggle/input/stanford-covid-vaccine/train.json', lines=True)
test = pd.read_json(data_dir + '/kaggle/input/stanford-covid-vaccine/test.json', lines=True)
sample_sub = pd.read_csv(data_dir + '/kaggle/input/stanford-covid-vaccine/sample_submission.csv')

## 特徴量生成のための関数

In [None]:
#Build features counting the pairing partners of each base from the bpps .npy files
def number_bonds(df, bpps_dir=None):
    """Count, per position, how many pairing probabilities fall in each band.

    For every row of ``df`` the matrix ``<bpps_dir>/<id>.npy`` is loaded once
    and its first ``len(structure)`` rows are scanned. The entries of row
    ``k`` are split into three bands:

    * weak:   ``0 < p < 0.1``
    * normal: ``0.1 <= p < 0.5``
    * strong: ``0.5 <= p <= 1``

    Args:
        df: DataFrame with an ``id`` column (file stem of the ``.npy``
            matrix) and a ``structure`` column whose length gives the number
            of rows to read.
        bpps_dir: Directory holding the ``.npy`` files. When ``None`` the
            ``bpps`` folder below the notebook-level ``data_dir`` is used.

    Returns:
        Tuple ``(weak, normal, strong)``. Each element is a list with one
        entry per row of ``df``; each entry is a list of ``int`` counts, one
        per position.

    Raises:
        ValueError: If a matrix has fewer rows than its structure string.
    """
    if bpps_dir is None:
        bpps_dir = data_dir + '/kaggle/input/stanford-covid-vaccine/bpps'
    chemical_bonds_weak = []
    chemical_bonds_normal = []
    chemical_bonds_strong = []
    for mol_id, structure in zip(df['id'], df['structure']):
        # Load the matrix once per molecule; reloading it for every position
        # and every threshold dominated the runtime.
        bpps = np.load(f"{bpps_dir}/{mol_id}.npy")
        n_positions = len(structure)
        if bpps.shape[0] < n_positions:
            raise ValueError(
                f"bpps matrix for {mol_id} has {bpps.shape[0]} rows, "
                f"expected at least {n_positions}"
            )
        rows = bpps[:n_positions]
        # axis=1 counts within each row, i.e. once per position.
        chemical_bonds_weak.append(
            np.count_nonzero((rows > 0) & (rows < 0.1), axis=1).tolist())
        chemical_bonds_normal.append(
            np.count_nonzero((rows >= 0.1) & (rows < 0.5), axis=1).tolist())
        chemical_bonds_strong.append(
            np.count_nonzero((rows >= 0.5) & (rows <= 1), axis=1).tolist())
    return chemical_bonds_weak, chemical_bonds_normal, chemical_bonds_strong

#Build features counting the pairing partners of each base from the bpps .npy files (scaled)
def number_bonds_(df, bpps_dir=None):
    """Scaled variant of the per-position pairing-partner counts.

    For every row of ``df`` the matrix ``<bpps_dir>/<id>.npy`` is loaded once
    and its first ``len(structure)`` rows are scanned. The entries of row
    ``k`` are counted in three bands and then rescaled:

    * weak:   count of ``0 < p < 0.1``, divided by 100
    * normal: count of ``0.1 <= p < 0.5``, divided by 5
    * strong: count of ``0.5 <= p <= 1``, left unscaled

    Args:
        df: DataFrame with an ``id`` column (file stem of the ``.npy``
            matrix) and a ``structure`` column whose length gives the number
            of rows to read.
        bpps_dir: Directory holding the ``.npy`` files. When ``None`` the
            ``bpps`` folder below the notebook-level ``data_dir`` is used.

    Returns:
        Tuple ``(weak, normal, strong)``. Each element is a list with one
        entry per row of ``df``; the weak and normal entries are lists of
        ``float``, the strong entries are lists of ``int``.

    Raises:
        ValueError: If a matrix has fewer rows than its structure string.
    """
    if bpps_dir is None:
        bpps_dir = data_dir + '/kaggle/input/stanford-covid-vaccine/bpps'
    chemical_bonds_weak = []
    chemical_bonds_normal = []
    chemical_bonds_strong = []
    for mol_id, structure in zip(df['id'], df['structure']):
        # Load the matrix once per molecule; reloading it for every position
        # and every threshold dominated the runtime.
        bpps = np.load(f"{bpps_dir}/{mol_id}.npy")
        n_positions = len(structure)
        if bpps.shape[0] < n_positions:
            raise ValueError(
                f"bpps matrix for {mol_id} has {bpps.shape[0]} rows, "
                f"expected at least {n_positions}"
            )
        rows = bpps[:n_positions]
        # axis=1 counts within each row, i.e. once per position.
        weak_counts = np.count_nonzero((rows > 0) & (rows < 0.1), axis=1)
        normal_counts = np.count_nonzero((rows >= 0.1) & (rows < 0.5), axis=1)
        strong_counts = np.count_nonzero((rows >= 0.5) & (rows <= 1), axis=1)
        chemical_bonds_weak.append((weak_counts / 100).tolist())
        chemical_bonds_normal.append((normal_counts / 5).tolist())
        chemical_bonds_strong.append(strong_counts.tolist())
    return chemical_bonds_weak, chemical_bonds_normal, chemical_bonds_strong

#Build a feature giving the bond strength of each base from the bpps .npy files
def strength_bonds(df, bpps_dir=None):
    """Return, per position, the largest pairing probability of paired bases.

    Positions whose structure symbol is ``'.'`` get ``0``. Every other
    position ``k`` gets the maximum of row ``k`` of ``<bpps_dir>/<id>.npy``.

    Args:
        df: DataFrame with an ``id`` column (file stem of the ``.npy``
            matrix) and a ``structure`` column (one symbol per position).
        bpps_dir: Directory holding the ``.npy`` files. When ``None`` the
            ``bpps`` folder below the notebook-level ``data_dir`` is used.

    Returns:
        A list with one entry per row of ``df``; each entry is a list with
        one value per position of that row's structure.
    """
    if bpps_dir is None:
        bpps_dir = data_dir + '/kaggle/input/stanford-covid-vaccine/bpps'
    chemical_bonds = []
    for mol_id, structure in zip(df['id'], df['structure']):
        if all(symbol == '.' for symbol in structure):
            # Nothing is paired, so the matrix is not needed at all.
            chemical_bonds.append([0] * len(structure))
            continue
        # Load once per molecule and reduce every row in a single call,
        # instead of re-reading the file for each paired position.
        row_max = np.load(f"{bpps_dir}/{mol_id}.npy").max(axis=1)
        chemical_bonds.append(
            [0 if symbol == '.' else row_max[k]
             for k, symbol in enumerate(structure)])
    return chemical_bonds

#Build a feature telling which pair (C-G, G-C, A-U, U-A, U-G, G-U) each base takes part in
def base_pairs(df):
    """Label every position with the base pair it belongs to.

    The dot-bracket ``structure`` string is scanned left to right; each
    ``')'`` is matched with the most recent unmatched ``'('``. Both positions
    of a pair receive the same label ``"<opening base>-<closing base>"``
    taken from ``sequence``. Positions that are not part of a pair keep the
    string ``'0'``.

    Args:
        df: DataFrame with ``structure`` and ``sequence`` columns.

    Returns:
        A list with one entry per row of ``df``; each entry is a list of
        labels, one per character of that row's structure.

    Raises:
        IndexError: If a ``')'`` appears with no unmatched ``'('`` before it.
    """
    data_pairs = []
    # Read each row's strings once instead of re-materialising the row via
    # df.iloc[j] for every single character.
    for structure, sequence in zip(df['structure'], df['sequence']):
        open_positions = []  # stack of indices of unmatched '('
        data_pair = ['0'] * len(structure)
        for i, symbol in enumerate(structure):
            if symbol == '(':
                open_positions.append(i)
            elif symbol == ')':
                first = open_positions.pop()
                label = sequence[first] + '-' + sequence[i]
                data_pair[first] = label
                data_pair[i] = label
        data_pairs.append(data_pair)
    return data_pairs


def read_bpps_max(df):
    """Return the row-wise maximum of each molecule's bpps matrix.

    One array is produced per id in ``df.id``, in the same order, each read
    from ``<data_dir>/kaggle/input/stanford-covid-vaccine/bpps/<id>.npy``.
    """
    def _row_max(mol_id):
        matrix = np.load(data_dir + f"/kaggle/input/stanford-covid-vaccine/bpps/{mol_id}.npy")
        return np.max(matrix, axis=1)

    return [_row_max(mol_id) for mol_id in df.id.to_list()]

def read_bpps_sum(df):
    """Return the row-wise sum of each molecule's bpps matrix.

    One array is produced per id in ``df.id``, in the same order, each read
    from ``<data_dir>/kaggle/input/stanford-covid-vaccine/bpps/<id>.npy``.
    """
    def _row_sum(mol_id):
        matrix = np.load(data_dir + f"/kaggle/input/stanford-covid-vaccine/bpps/{mol_id}.npy")
        return np.sum(matrix, axis=1)

    return [_row_sum(mol_id) for mol_id in df.id.to_list()]

def read_bpps_nb(df):
    """Return the standardised fraction of non-zero entries per bpps column.

    For each id in ``df.id`` the bpps matrix is loaded, the number of
    strictly positive entries in every column is divided by the number of
    rows, and the result is shifted and scaled with fixed constants.
    """
    # normalized non-zero number
    # from https://www.kaggle.com/symyksr/openvaccine-deepergcn
    bpps_nb_mean = 0.077522  # mean of bpps_nb across all training data
    bpps_nb_std = 0.08914    # std of bpps_nb across all training data

    def _standardised_nonzero_fraction(mol_id):
        bpps = np.load(data_dir + f"/kaggle/input/stanford-covid-vaccine/bpps/{mol_id}.npy")
        nonzero_fraction = (bpps > 0).sum(axis=0) / bpps.shape[0]
        return (nonzero_fraction - bpps_nb_mean) / bpps_nb_std

    return [_standardised_nonzero_fraction(mol_id) for mol_id in df.id.to_list()]

## 化学結合の数

In [None]:
# Raw (unscaled) bond counts for the training set.
train_number_bonds_weak, train_number_bonds_normal, train_number_bonds_strong = number_bonds(train)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/train_number_bonds_weak', train_number_bonds_weak)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/train_number_bonds_normal', train_number_bonds_normal)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/train_number_bonds_strong', train_number_bonds_strong)

# The test set mixes sequence lengths (seq_length 107 and 130 in the test.head()
# output), so these nested lists are ragged. NumPy >= 1.24 raises ValueError when
# asked to build an array from ragged input implicitly, so wrap them in an
# explicit object array before saving.
test_number_bonds_weak, test_number_bonds_normal, test_number_bonds_strong = number_bonds(test)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/test_number_bonds_weak', np.array(test_number_bonds_weak, dtype=object))
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/test_number_bonds_normal', np.array(test_number_bonds_normal, dtype=object))
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/test_number_bonds_strong', np.array(test_number_bonds_strong, dtype=object))

In [None]:
# Scaled bond counts for the training set; the trailing underscore in the file
# names distinguishes them from the raw counts.
train_number_bonds_weak, train_number_bonds_normal, train_number_bonds_strong = number_bonds_(train)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/train_number_bonds_weak_', train_number_bonds_weak)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/train_number_bonds_normal_', train_number_bonds_normal)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/train_number_bonds_strong_', train_number_bonds_strong)

# The test set mixes sequence lengths (seq_length 107 and 130 in the test.head()
# output), so these nested lists are ragged. NumPy >= 1.24 raises ValueError when
# asked to build an array from ragged input implicitly, so wrap them in an
# explicit object array before saving.
test_number_bonds_weak, test_number_bonds_normal, test_number_bonds_strong = number_bonds_(test)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/test_number_bonds_weak_', np.array(test_number_bonds_weak, dtype=object))
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/test_number_bonds_normal_', np.array(test_number_bonds_normal, dtype=object))
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/test_number_bonds_strong_', np.array(test_number_bonds_strong, dtype=object))

## 化学結合の強さ

In [None]:
# Per-position bond strength for the training set.
train_chemical_bonds = strength_bonds(train)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/train_chemical_bonds', train_chemical_bonds)

# The test set mixes sequence lengths (seq_length 107 and 130 in the test.head()
# output), so this nested list is ragged. NumPy >= 1.24 raises ValueError when
# asked to build an array from ragged input implicitly, so wrap it in an
# explicit object array before saving.
test_chemical_bonds = strength_bonds(test)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/test_chemical_bonds', np.array(test_chemical_bonds, dtype=object))

## 塩基結合の種類(C-G,G-C,A-U,U-A,U-G,G-U)

In [None]:
# Base-pair labels for the training set.
train_base_pairs = base_pairs(train)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/train_base_pairs', train_base_pairs)

# The test set mixes sequence lengths (seq_length 107 and 130 in the test.head()
# output), so this nested list is ragged. NumPy >= 1.24 raises ValueError when
# asked to build an array from ragged input implicitly, so wrap it in an
# explicit object array before saving.
test_base_pairs = base_pairs(test)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/test_base_pairs', np.array(test_base_pairs, dtype=object))

## 結合方法から新たな特徴量算出

参考 
https://www.kaggle.com/its7171/gru-lstm-with-feature-engineering-and-augmentation

In [None]:
# Compute the three bpps-derived features for both data sets.
train_bpps_sum = read_bpps_sum(train)
test_bpps_sum = read_bpps_sum(test)
train_bpps_max = read_bpps_max(train)
test_bpps_max = read_bpps_max(test)
train_bpps_nb = read_bpps_nb(train)
test_bpps_nb = read_bpps_nb(test)

# The test set mixes sequence lengths (seq_length 107 and 130 in the test.head()
# output), so the test_* lists hold arrays of different lengths. NumPy >= 1.24
# raises ValueError when asked to build an array from ragged input implicitly,
# so the test_* saves wrap the list in an explicit object array.
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/train_bpps_sum', train_bpps_sum)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/test_bpps_sum', np.array(test_bpps_sum, dtype=object))

np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/train_bpps_max', train_bpps_max)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/test_bpps_max', np.array(test_bpps_max, dtype=object))

np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/train_bpps_nb', train_bpps_nb)
np.save(data_dir + '/kaggle/input/stanford-covid-vaccine/test_bpps_nb', np.array(test_bpps_nb, dtype=object))

# Data Load

## 塩基の結合種類

In [None]:
# Reload the saved base-pair labels; allow_pickle=True is needed when the file
# holds an object array.
train_base_pairs = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/train_base_pairs.npy',allow_pickle=True)
test_base_pairs = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/test_base_pairs.npy',allow_pickle=True)

# Wrap the labels in a one-column frame and attach it to train by row index
# (pd.merge defaults to an inner join, so only indices present on both sides survive).
train_base_pairs = pd.DataFrame({'base_pairs': train_base_pairs.tolist()})
train = pd.merge(train, train_base_pairs, left_index=True, right_index=True)

# Same for test. Note the frame is named test_base_pair (singular), so
# test_base_pairs still refers to the loaded array afterwards.
test_base_pair = pd.DataFrame({'base_pairs': test_base_pairs.tolist()})
test = pd.merge(test, test_base_pair, left_index=True, right_index=True)

## 化学結合の強さ

In [None]:
# Reload the saved per-position bond strengths; allow_pickle=True is needed when
# the file holds an object array.
train_chemical_bonds = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/train_chemical_bonds.npy',allow_pickle=True)
test_chemical_bonds = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/test_chemical_bonds.npy',allow_pickle=True)

train_chemical_bonds = pd.DataFrame({'chemical_bonds': train_chemical_bonds.tolist()})
test_chemical_bonds = pd.DataFrame({'chemical_bonds': test_chemical_bonds.tolist()})


def _bin_bond_strength(values):
    """Replace each bond strength by an ordinal level and return a float array.

    (0.75, 1] -> 4, (0.5, 0.75] -> 3, (0.25, 0.5] -> 2, (0, 0.25] -> 1.
    Values outside (0, 1] (e.g. the 0 of unpaired positions) are left as is.
    """
    binned = np.array(values, dtype=float)
    # Order matters: the bins are applied from the highest range down so that a
    # freshly written level (1..4) can never fall into a range tested later.
    binned[(binned <= 1) & (binned > 0.75)] = 4
    binned[(binned <= 0.75) & (binned > 0.5)] = 3
    binned[(binned <= 0.5) & (binned > 0.25)] = 2
    binned[(binned <= 0.25) & (binned > 0)] = 1
    return binned


# Assign the whole column at once. The previous per-row
# `train_chemical_bonds.iloc[i]['chemical_bonds'] = ...` was a chained
# assignment: it may write into a temporary copy and leave the frame unchanged
# (and never writes back under pandas Copy-on-Write).
train_chemical_bonds['chemical_bonds'] = [
    _bin_bond_strength(row).tolist() for row in train_chemical_bonds['chemical_bonds']
]

# Rows are binned one by one because test sequences differ in length.
test_chemical_bonds_threshold = [
    _bin_bond_strength(row) for row in test_chemical_bonds['chemical_bonds']
]
test_chemical_bonds['chemical_bonds'] = test_chemical_bonds_threshold

# Attach the binned strengths to train/test by row index.
train = pd.merge(train, train_chemical_bonds, left_index=True, right_index=True)
test = pd.merge(test, test_chemical_bonds, left_index=True, right_index=True)

## 結合方法から新たな特徴量算出


参考 
https://www.kaggle.com/its7171/gru-lstm-with-feature-engineering-and-augmentation

In [None]:
# Reload the saved bpps-derived features (sum, max, standardised non-zero
# fraction); allow_pickle=True is needed when a file holds an object array.
train_bpps_sum = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/train_bpps_sum.npy',allow_pickle=True)
train_bpps_max = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/train_bpps_max.npy',allow_pickle=True)
train_bpps_nb  = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/train_bpps_nb.npy',allow_pickle=True)

test_bpps_sum = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/test_bpps_sum.npy',allow_pickle=True)
test_bpps_max = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/test_bpps_max.npy',allow_pickle=True)
test_bpps_nb  = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/test_bpps_nb.npy',allow_pickle=True)

# One single-column frame per feature, one row per molecule.
train_bpps_sum = pd.DataFrame({'bpps_sum': train_bpps_sum.tolist()})
train_bpps_max = pd.DataFrame({'bpps_max': train_bpps_max.tolist()})
train_bpps_nb = pd.DataFrame({'bpps_nb': train_bpps_nb.tolist()})

test_bpps_sum = pd.DataFrame({'bpps_sum': test_bpps_sum.tolist()})
test_bpps_max = pd.DataFrame({'bpps_max': test_bpps_max.tolist()})
test_bpps_nb = pd.DataFrame({'bpps_nb': test_bpps_nb.tolist()})

# Put the three columns side by side, then attach them to train/test by row index.
train_bpps = pd.concat([train_bpps_sum, train_bpps_max, train_bpps_nb], axis=1)
train = pd.merge(train, train_bpps, left_index=True, right_index=True)

test_bpps = pd.concat([test_bpps_sum, test_bpps_max, test_bpps_nb], axis=1)
test = pd.merge(test, test_bpps, left_index=True, right_index=True)

## 化学結合の数

In [None]:
# Reload the bond-count features. The files with the trailing underscore are
# the ones written from number_bonds_ (the scaled variant), not the raw counts.
train_number_bonds_weak = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/train_number_bonds_weak_.npy',allow_pickle=True)
train_number_bonds_normal = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/train_number_bonds_normal_.npy',allow_pickle=True)
train_number_bonds_strong = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/train_number_bonds_strong_.npy',allow_pickle=True)

test_number_bonds_weak = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/test_number_bonds_weak_.npy',allow_pickle=True)
test_number_bonds_normal = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/test_number_bonds_normal_.npy',allow_pickle=True)
test_number_bonds_strong = np.load(data_dir + '/kaggle/input/stanford-covid-vaccine/test_number_bonds_strong_.npy',allow_pickle=True)

# One single-column frame per band, one row per molecule.
train_number_bonds_weak = pd.DataFrame({'number_bonds_weak': train_number_bonds_weak.tolist()})
train_number_bonds_normal = pd.DataFrame({'number_bonds_normal': train_number_bonds_normal.tolist()})
train_number_bonds_strong = pd.DataFrame({'number_bonds_strong': train_number_bonds_strong.tolist()})

test_number_bonds_weak = pd.DataFrame({'number_bonds_weak': test_number_bonds_weak.tolist()})
test_number_bonds_normal = pd.DataFrame({'number_bonds_normal': test_number_bonds_normal.tolist()})
test_number_bonds_strong = pd.DataFrame({'number_bonds_strong': test_number_bonds_strong.tolist()})

# Put the three columns side by side, then attach them to train/test by row index.
train_number_bonds = pd.concat([train_number_bonds_weak, train_number_bonds_normal,train_number_bonds_strong], axis=1)
train = pd.merge(train, train_number_bonds, left_index=True, right_index=True)

test_number_bonds = pd.concat([test_number_bonds_weak,test_number_bonds_normal,test_number_bonds_strong], axis=1)
test = pd.merge(test, test_number_bonds, left_index=True, right_index=True)

In [None]:
# Preview the first rows of train to check that the new feature columns were attached.
train.head()

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,signal_to_noise,SN_filter,seq_length,seq_scored,reactivity_error,deg_error_Mg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_error_50C,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C,base_pairs,chemical_bonds,bpps_sum,bpps_max,bpps_nb,number_bonds_weak,number_bonds_normal,number_bonds_strong
0,0,id_001f94081,GGAAAAGCUCUAAUAACAGGAGACUAGGACUACGUAUUUCUAGGUA...,.....((((((.......)))).)).((.....((..((((((......,EEEEESSSSSSHHHHHHHSSSSBSSXSSIIIIISSIISSSSSSHHH...,6.894,1,107,68,"[0.1359, 0.20700000000000002, 0.1633, 0.1452, ...","[0.26130000000000003, 0.38420000000000004, 0.1...","[0.2631, 0.28600000000000003, 0.0964, 0.1574, ...","[0.1501, 0.275, 0.0947, 0.18660000000000002, 0...","[0.2167, 0.34750000000000003, 0.188, 0.2124, 0...","[0.3297, 1.5693000000000001, 1.1227, 0.8686, 0...","[0.7556, 2.983, 0.2526, 1.3789, 0.637600000000...","[2.3375, 3.5060000000000002, 0.3008, 1.0108, 0...","[0.35810000000000003, 2.9683, 0.2589, 1.4552, ...","[0.6382, 3.4773, 0.9988, 1.3228, 0.78770000000...","[0, 0, 0, 0, 0, A-U, G-C, C-G, U-A, C-G, U-G, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 2.0, ...","[0.19854229, 0.18371220000000002, 0.0600024000...","[0.0217857, 0.0386527, 0.0275904, 0.00947066, ...","[2.0659663786252436, 1.2272143577570933, -0.34...","[0.28, 0.2, 0.05, 0.04, 0.03, 0.06, 0.19, 0.13...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.2, 0.2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,id_0049f53ba,GGAAAAAGCGCGCGCGGUUAGCGCGCGCUUUUGCGCGCGCUGUACC...,.....(((((((((((((((((((((((....)))))))))).)))...,EEEEESSSSSSSSSSSSSSSSSSSSSSSHHHHSSSSSSSSSSBSSS...,0.193,0,107,68,"[2.8272, 2.8272, 2.8272, 4.7343, 2.5676, 2.567...","[73705.3985, 73705.3985, 73705.3985, 73705.398...","[10.1986, 9.2418, 5.0933, 5.0933, 5.0933, 5.09...","[16.6174, 13.868, 8.1968, 8.1968, 8.1968, 8.19...","[15.4857, 7.9596, 13.3957, 5.8777, 5.8777, 5.8...","[0.0, 0.0, 0.0, 2.2965, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.947, 4.4523, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.8511, 4.0426, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[7.6692, 0.0, 10.9561, 0.0, 0.0, 0.0, 0.0, 0.0...","[0, 0, 0, 0, 0, A-U, A-U, G-C, C-G, G-C, C-G, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 4.0, 4.0, 4.0, ...","[0.16883627604054915, 0.10675940059456464, 0.0...","[0.11931483477784201, 0.0808186531761711, 0.06...","[0.49330633949746183, 0.49330633949746183, -0....","[0.12, 0.13, 0.05, 0.05, 0.05, 0.04, 0.02, 0.0...","[0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,2,id_006f36f57,GGAAAGUGCUCAGAUAAGCUAAGCUCGAAUAGCAAUCGAAUAGAAU...,.....((((.((.....((((.(((.....)))..((((......)...,EEEEESSSSISSIIIIISSSSMSSSHHHHHSSSMMSSSSHHHHHHS...,8.8,1,107,68,"[0.0931, 0.13290000000000002, 0.11280000000000...","[0.1365, 0.2237, 0.1812, 0.1333, 0.1148, 0.160...","[0.17020000000000002, 0.178, 0.111, 0.091, 0.0...","[0.1033, 0.1464, 0.1126, 0.09620000000000001, ...","[0.14980000000000002, 0.1761, 0.1517, 0.116700...","[0.44820000000000004, 1.4822, 1.1819, 0.743400...","[0.2504, 1.4021, 0.9804, 0.49670000000000003, ...","[2.243, 2.9361, 1.0553, 0.721, 0.6396000000000...","[0.5163, 1.6823000000000001, 1.0426, 0.7902, 0...","[0.9501000000000001, 1.7974999999999999, 1.499...","[0, 0, 0, 0, 0, G-U, U-A, G-U, C-G, 0, C-G, A-...","[0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, ...","[0.06680724164624499, 0.04433748694733778, 0.0...","[0.017340043515196805, 0.00826566577930985, 0....","[2.275654383842281, 2.5901863916678374, 0.8078...","[0.3, 0.33, 0.16, 0.13, 0.15, 0.28, 0.49, 0.17...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.2, 0.8, 0.8, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,id_0082d463b,GGAAAAGCGCGCGCGCGCGCGCGAAAAAGCGCGCGCGCGCGCGCGC...,......((((((((((((((((......))))))))))))))))((...,EEEEEESSSSSSSSSSSSSSSSHHHHHHSSSSSSSSSSSSSSSSSS...,0.104,0,107,68,"[3.5229, 6.0748, 3.0374, 3.0374, 3.0374, 3.037...","[73705.3985, 73705.3985, 73705.3985, 73705.398...","[11.8007, 12.7566, 5.7733, 5.7733, 5.7733, 5.7...","[121286.7181, 121286.7182, 121286.7181, 121286...","[15.3995, 8.1124, 7.7824, 7.7824, 7.7824, 7.78...","[0.0, 2.2399, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, -0.5083, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[3.4248, 6.8128, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, -0.8365, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[7.6692, -1.3223, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[0, 0, 0, 0, 0, 0, G-C, C-G, G-C, C-G, G-C, C-...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, ...","[0.22702912000000003, 0.18439387000000002, 0.0...","[0.0350158, 0.0332295, 0.00272567, 0.0, 0.0, 0...","[0.5981503421059805, 0.9126823499315369, -0.65...","[0.14, 0.17, 0.02, 0.0, 0.0, 0.02, 0.14, 0.1, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6, 0.6, 0.6, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,id_0087940f4,GGAAAAUAUAUAAUAUAUUAUAUAAAUAUAUUAUAGAAGUAUAAUA...,.....(((((((.((((((((((((.(((((((((....)))))))...,EEEEESSSSSSSBSSSSSSSSSSSSBSSSSSSSSSHHHHSSSSSSS...,0.423,0,107,68,"[1.665, 2.1728, 2.0041, 1.2405, 0.620200000000...","[4.2139, 3.9637000000000002, 3.2467, 2.4716, 1...","[3.0942, 3.015, 2.1212, 2.0552, 0.881500000000...","[2.6717, 2.4818, 1.9919, 2.5484999999999998, 1...","[1.3285, 3.6173, 1.3057, 1.3021, 1.1507, 1.150...","[0.8267, 2.6577, 2.8481, 0.40090000000000003, ...","[2.1058, 3.138, 2.5437000000000003, 1.0932, 0....","[4.7366, 4.6243, 1.2068, 1.1538, 0.0, 0.0, 0.7...","[2.2052, 1.7947000000000002, 0.7457, 3.1233, 0...","[0.0, 5.1198, -0.3551, -0.3518, 0.0, 0.0, 0.0,...","[0, 0, 0, 0, 0, A-U, U-A, A-U, U-A, A-U, U-A, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 3.0, 4.0, 4.0, ...","[0.04036075455181073, 0.08021642791992274, 0.0...","[0.02230060114516417, 0.052274223411953764, 0....","[0.5981503421059805, 0.8078383473230181, 0.178...","[0.14, 0.16, 0.1, 0.11, 0.11, 0.11, 0.17, 0.09...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.2, ...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, ..."


In [None]:
# Preview the first rows of test to check that the new feature columns were attached.
test.head()

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,seq_length,seq_scored,base_pairs,chemical_bonds,bpps_sum,bpps_max,bpps_nb,number_bonds_weak,number_bonds_normal,number_bonds_strong
0,0,id_00073f8be,GGAAAAGUACGACUUGAGUACGGAAAACGUACCAACUCGAUUAAAA...,......((((((((((.(((((.....))))))))((((((((......,EEEEEESSSSSSSSSSBSSSSSHHHHHSSSSSSSSSSSSSSSSHHH...,107,68,"[0, 0, 0, 0, 0, 0, G-C, U-A, A-U, C-G, G-C, A-...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, ...","[0.03701574756330438, 0.012454753930743238, 0....","[0.007221606518561295, 0.002257620678109476, 0...","[0.9126823499315369, 1.2272143577570933, 0.178...","[0.17, 0.2, 0.1, 0.07, 0.07, 0.08, 0.17, 0.37,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.2, 0.2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ..."
1,1,id_000ae4237,GGAAACGGGUUCCGCGGAUUGCUGCUAAUAAGAGUAAUCUCUAAAU...,.....((((..((((((...(((((.....((((....)))).......,EEEEESSSSIISSSSSSIIISSSSSIIIIISSSSHHHHSSSSIIII...,130,91,"[0, 0, 0, 0, 0, C-G, G-C, G-C, G-C, 0, 0, C-G,...","[0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 3.0, 3.0, 3.0, ...","[0.2304712272461556, 0.23820549694391135, 0.20...","[0.193407103672755, 0.19773924317365849, 0.195...","[0.3384598125679572, 0.5973438497782229, -0.26...","[0.13, 0.16, 0.06, 0.04, 0.03, 0.1, 0.13, 0.16...","[0.2, 0.2, 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, ..."
2,2,id_00131c573,GGAAAACAAAACGGCCUGGAAGACGAAGGAAUUCGGCGCGAAGGCC...,...........((.(((.(.(..((..((..((((...))))..))...,EEEEEEEEEEESSISSSISISIISSIISSIISSSSHHHSSSSIISS...,107,68,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, C-G, G-C, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.07493084, 0.02550902, 0.0, 0.00150035, 0.00...","[0.0294548, 0.00763603, 0.0, 0.00150035, 0.001...","[0.388462336888943, -0.030913673545132192, -0....","[0.12, 0.08, 0.0, 0.01, 0.01, 0.02, 0.13, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ..."
3,3,id_00181fd34,GGAAAGGAUCUCUAUCGAAGGAUAGAGAUCGCUCGCGACGGCACGA...,......((((((((((....))))))))))((((((..((.(((.....,EEEEEESSSSSSSSSSHHHHSSSSSSSSSSSSSSSSIISSISSSHH...,107,68,"[0, 0, 0, 0, 0, 0, G-C, A-U, U-A, C-G, U-A, C-...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 4.0, 4.0, ...","[0.15362639758779065, 0.2281113621902515, 0.12...","[0.06724628826674545, 0.12075529229189887, 0.1...","[0.388462336888943, 0.2836183342804242, -0.555...","[0.12, 0.1, 0.02, 0.02, 0.02, 0.14, 0.06, 0.0,...","[0.0, 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,4,id_0020473f7,GGAAACCCGCCCGCGCCCGCCCGCGCUGCUGCCGUGCCUCCUCUCC...,.....(((((((((((((((((((((((((((((((((((((((((...,EEEEESSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS...,130,91,"[0, 0, 0, 0, 0, C-G, C-G, C-G, G-U, C-G, C-G, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 4.0, 4.0, 4.0, ...","[0.019419062344993222, 0.005454117834146501, 3...","[0.004852784769972624, 0.001329212597223424, 3...","[-0.17930826185257406, -0.17930826185257406, -...","[0.08, 0.08, 0.01, 0.01, 0.01, 0.04, 0.02, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
# Persist the enriched frames as JSON-lines (one record per line), the same
# layout the raw train/test files are read with.
train.to_json(data_dir + '/kaggle/input/stanford-covid-vaccine/train_new.json', orient='records', lines=True)
test.to_json(data_dir + '/kaggle/input/stanford-covid-vaccine/test_new.json', orient='records', lines=True)

In [None]:
# Read the files just written back into separate frames (train_/test_).
# NOTE(review): these frames are not used again in the visible part of the
# notebook — presumably a round-trip sanity check; confirm.
train_ = pd.read_json(data_dir + '/kaggle/input/stanford-covid-vaccine/train_new.json', lines=True)
test_ = pd.read_json(data_dir + '/kaggle/input/stanford-covid-vaccine/test_new.json', lines=True)

## 新しいjsonファイルの作成

In [45]:
# Load the sample submission again under the name `sub` (the same file is
# already held in `sample_sub` from the data-loading cell).
sub = pd.read_csv(data_dir + "/kaggle/input/stanford-covid-vaccine/sample_submission.csv")

In [46]:
# Load two alternative JSON-lines versions of the training data.
# NOTE(review): train_aug.json and train_gnn.json are not produced anywhere in
# the visible part of this notebook — confirm where they come from.
train_aug = pd.read_json(data_dir + "/kaggle/input/stanford-covid-vaccine/train_aug.json",lines=True)
train_gnn = pd.read_json(data_dir + "/kaggle/input/stanford-covid-vaccine/train_gnn.json",lines=True)

In [47]:
# Average the target columns of the two training variants element-wise
# (each side weighted 1/2).
# NOTE(review): `targets` is not defined in the visible part of this notebook;
# it must already hold the list of target column names when this cell runs — confirm.
train_targets = np.array(train_aug[targets].values.tolist().copy())/2 + np.array(train_gnn[targets].values.tolist().copy())/2
# Swap the last two axes so that the target index becomes the last axis
# (assumes the stacked array is 3-D: rows x targets x positions — TODO confirm).
train_targets = train_targets.transpose(0,2,1)

In [48]:
# Write the averaged values back into train_aug, one target column at a time:
# slice i of the last axis of train_targets belongs to targets[i].
for i, target in enumerate(targets):
    train_aug[target] = train_targets[:,:,i].tolist()
# Save the frame with the mixed targets as JSON-lines.
train_aug.to_json(data_dir + '/kaggle/input/stanford-covid-vaccine/train_mix.json', orient='records', lines=True)