In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader
from transformers import AutoModelForMaskedLM, AutoTokenizer
import json

## Seed Everything

In [2]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and, when available,
    every CUDA device) and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42, the value
            that was previously hard-coded.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so setting it here can only
    # influence processes spawned later, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TOKENIZERS_PARALLELISM'] = 'true'
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # Trade speed for run-to-run reproducibility of cuDNN kernels.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    print('-----Seed Set!-----')

In [3]:
# Seed all random number generators once, before any data or model is created.
seed_everything()

-----Seed Set!-----


## Download the pretrained ChemBERTa model

## Read Competition Datasets

In [4]:
# Training table: metadata columns followed by the target columns.
de_train = pd.read_parquet('../input/open-problems-single-cell-perturbations/de_train.parquet')
# Test pairs to predict; its 'cell_type' and 'sm_name' columns are used below.
id_map = pd.read_csv('../input/open-problems-single-cell-perturbations/id_map.csv')
# Template for the submission, indexed by the 'id' column.
sample_submission = pd.read_csv('../input/open-problems-single-cell-perturbations/sample_submission.csv', index_col='id')

In [5]:
# Columns that identify a sample; these are one-hot encoded as model input.
xlist  = ['cell_type','sm_name']
# Non-target columns that are dropped from de_train to obtain the target matrix.
_ylist = ['cell_type','sm_name','sm_lincs_id','SMILES','control']

# Target matrix: every column of de_train that is not listed in _ylist.
y = de_train.drop(columns=_ylist)
y.shape

(614, 18211)

## Use Scikit-Learn's One Hot Encoder
This helps encode each pair (cell_type, sm_name) as a multi-dimensional binary vector

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training pairs only, then apply it to train and test.
# NOTE(review): with default settings the encoder presumably raises on categories
# that were not seen during fit — confirm every test pair value occurs in de_train.
encoder = OneHotEncoder()
encoder.fit(de_train[xlist])
one_hot_encode_features = encoder.transform(de_train[xlist])
one_hot_test = encoder.transform(id_map[xlist])

# Densify the sparse encoder output into float DataFrames (one row per sample).
X = pd.DataFrame(one_hot_encode_features.toarray().astype(float))
test = pd.DataFrame(one_hot_test.toarray().astype(float))

## 

## First Data Augmentation
Compute the mean and std differential expression for each cell type and each small molecule name. Take this as additional input features to the model to be built.

In [7]:
# Positional selection: one key column (position 0 is grouped as 'cell_type',
# position 1 as 'sm_name' below) plus every column from position 5 onwards.
# This relies on de_train's column order.
de_cell_type = de_train.iloc[:, [0] + list(range(5, de_train.shape[1]))]
de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))]
# Per-group means; reset_index() puts the group key back as the first column.
mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index()
mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index()

# Per-group standard deviations, same layout as the mean frames.
std_cell_type = de_cell_type.groupby('cell_type').std().reset_index()
std_sm_name = de_sm_name.groupby('sm_name').std().reset_index()

I also consider the 25%, 50%, and 75% percentiles (see below). Note that I explored different combinations of these features. In particular, models trained on all additional features (i.e., mean, std, and the 25%, 50%, and 75% percentiles) are referred to as "heavy".

In [8]:
# Only the 'cell_type' key column of this groupby result is kept; the 0.1 quantile
# values themselves are discarded.
cell_types = de_cell_type.groupby('cell_type').quantile(0.1).reset_index()['cell_type']
# For every target column, compute the 25/50/75% quantiles per cell type and unstack
# them into three columns; all of these blocks are placed side by side after the key
# column, giving 1 + 3 * (number of target columns) columns in total.
desc_cell_type = pd.concat([pd.DataFrame(cell_types)]+[de_cell_type.groupby('cell_type')[col]\
.quantile([0.25, 0.50, 0.75], interpolation='linear').unstack().reset_index(drop=True) for col in list(de_train.columns)[5:]], axis=1)

In [9]:
# Map each small-molecule name to its SMILES string using the training table.
sm_name2smiles = {smname:smiles for smname, smiles in zip(de_train['sm_name'], de_train['SMILES'])}
# SMILES for every test row; dict.get yields None for names absent from de_train.
test_smiles = list(map(sm_name2smiles.get, id_map['sm_name'].values))

# ChemBERTa Features
Thanks to ALEKSEY TREPETSKY (I upvoted) https://www.kaggle.com/code/alekseytrepetsky/create-chemberta-embed/notebook, I could either build my own ChemBERTa features or use the ones she/he has created and shared publicly.

In [10]:
def build_ChemBERTa_features(smiles_list, model_name="DeepChem/ChemBERTa-77M-MTR"):
    """Turn SMILES strings into two fixed-width feature matrices.

    Each SMILES string is tokenized and passed through the masked-LM model on its
    own; the first element of the model output (indexed as batch x tokens x width)
    is reduced in two ways.

    NOTE(review): for a masked-LM model the first output is presumably the LM-head
    logits rather than encoder hidden states, and the load log reports that the
    ``lm_head`` weights are newly initialized on every ``from_pretrained`` call, so
    features produced by separate calls may not be directly comparable — confirm.

    Args:
        smiles_list: Sequence of SMILES strings (must support ``len``).
        model_name: Hugging Face model id to load. Defaults to the checkpoint that
            was previously hard-coded.

    Returns:
        Tuple ``(first_token, token_mean)`` of float32 NumPy arrays, each with one
        row per input string: the output at token position 0 and the mean of the
        output over all token positions.
    """
    chemberta = AutoModelForMaskedLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    chemberta.eval()

    first_token_rows = []
    token_mean_rows = []
    with torch.no_grad():
        for smiles in tqdm(smiles_list):
            encoded_input = tokenizer(smiles, return_tensors="pt", padding=False, truncation=True)
            # Batch of one, so index 0 selects the only sequence: (n_tokens, width).
            token_outputs = chemberta(**encoded_input)[0][0]
            first_token_rows.append(token_outputs[0])
            token_mean_rows.append(token_outputs.mean(dim=0))

    if not first_token_rows:
        # No molecule was processed, so take the width from the model configuration
        # instead of the previously hard-coded 600.
        width = chemberta.config.vocab_size
        return (np.zeros((0, width), dtype=np.float32),
                np.zeros((0, width), dtype=np.float32))
    return torch.stack(first_token_rows).numpy(), torch.stack(token_mean_rows).numpy()

In [11]:
# ChemBERTa features for the training rows (same row order as de_train) ...
train_chem_feat, train_chem_feat_mean = build_ChemBERTa_features(de_train.SMILES)
# ... and for the test rows (same row order as id_map). The model is loaded again here.
test_chem_feat, test_chem_feat_mean = build_ChemBERTa_features(test_smiles)

Downloading config.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/14.0M [00:00<?, ?B/s]

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/8.26k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/420 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 614/614 [00:05<00:00, 120.11it/s]
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 255/255 [00:01<00:00, 136.60it/s]


## Now Define the Function to Combine the Created Features in Different Ways
For each combination, we create and train 3 deep learning architectures (see below)

In [12]:
def combine_features(data_aug_dfs, chem_feats, main_df, one_hot_dfs=None, quantiles_df=None):
    """Concatenate all feature blocks into one long vector per row of ``main_df``.

    The blocks are laid out in this order: one-hot columns, the statistics of every
    frame in ``data_aug_dfs`` (looked up by the row's 'cell_type' or 'sm_name'),
    every chemical feature matrix, then zero padding.

    Args:
        data_aug_dfs: List of DataFrames whose first column is the lookup key
            ('cell_type' or 'sm_name') and whose remaining columns are numeric.
        chem_feats: List of 2-D arrays with one row per row of ``main_df``. Their
            widths are read from the arrays (600 is no longer assumed).
        main_df: DataFrame providing the 'cell_type' / 'sm_name' value of each row.
        one_hot_dfs: Optional DataFrame of one-hot features, row-aligned with
            ``main_df``.
        quantiles_df: Optional quantile frame. When given, the output is padded
            with ``(quantiles_df.shape[1] - 1) // 3`` trailing zeros; this padding
            is part of the input width the 'heavy' models were built for.

    Returns:
        Float array of shape ``(len(main_df), 1, total_length)``.

    Raises:
        KeyError: If a key of ``main_df`` does not occur in an augmentation frame.
    """
    n_rows = len(main_df)
    chem_blocks = [np.asarray(feat, dtype=float) for feat in chem_feats]

    total_len = sum(aug_df.shape[1] - 1 for aug_df in data_aug_dfs)
    total_len += sum(block.shape[1] for block in chem_blocks)
    if one_hot_dfs is not None:
        total_len += one_hot_dfs.shape[1]
    if quantiles_df is not None:
        total_len += (quantiles_df.shape[1] - 1) // 3

    if n_rows == 0:
        return np.zeros((0, 1, total_len), dtype=float)

    blocks = []
    if one_hot_dfs is not None:
        blocks.append(one_hot_dfs.iloc[:n_rows].to_numpy(dtype=float))

    for df in data_aug_dfs:
        if 'cell_type' in df.columns:
            key_col = 'cell_type'
        else:
            assert 'sm_name' in df.columns
            key_col = 'sm_name'
        # Everything after the first (key) column is the statistics vector.
        stats = df.iloc[:, 1:].to_numpy(dtype=float)
        # One dictionary per frame replaces a boolean filter per output row.
        row_of = {key: pos for pos, key in enumerate(df[key_col].to_numpy())}
        wanted = main_df[key_col].to_numpy()
        missing = sorted({str(key) for key in wanted if key not in row_of})
        if missing:
            raise KeyError(f"{key_col} value(s) {missing} not found in augmentation frame")
        blocks.append(stats[[row_of[key] for key in wanted]])

    blocks.extend(block[:n_rows] for block in chem_blocks)

    features = np.concatenate(blocks, axis=1) if blocks else np.zeros((n_rows, 0))
    # Trailing zeros bring every row up to total_len (non-zero only with quantiles_df).
    padding = np.zeros((n_rows, total_len - features.shape[1]))
    features = np.concatenate([features, padding], axis=1)
    return features.astype(float).reshape(n_rows, 1, total_len)

In [13]:
# Test-set inputs for the three feature schemes (all include the one-hot block
# and both ChemBERTa matrices):
# 'initial': group means and standard deviations.
test_vec = combine_features([mean_cell_type,std_cell_type,mean_sm_name,std_sm_name],\
                   [test_chem_feat, test_chem_feat_mean], id_map, test)
# 'light': group means only.
test_vec_light = combine_features([mean_cell_type,mean_sm_name],\
                   [test_chem_feat, test_chem_feat_mean], id_map, test)
# 'heavy': per-cell-type quantiles plus group means; passing desc_cell_type as
# quantiles_df additionally zero-pads the vector inside combine_features.
test_vec_heavy = combine_features([desc_cell_type,mean_cell_type,mean_sm_name],\
                   [test_chem_feat,test_chem_feat_mean], id_map, test, desc_cell_type)

## Evaluation Metric Function

In [14]:
def mrrmse_np(y_pred, y_true):
    """Mean row-wise RMSE: the RMSE of each row (axis 1), averaged over all rows."""
    squared_error = np.square(y_true - y_pred)
    rmse_per_row = np.sqrt(squared_error.mean(axis=1))
    return rmse_per_row.mean()

## Additional Loss Functions
I discovered experimentally that by combining different loss functions (others defined in the models), we can achieve a better performance. In particular, I used the binary cross entropy loss to push each predicted value in the target to a value other than 0 (i.e., push the value to a strictly positive or negative value). This is motivated by the fact that several values in the target are close to zero, and I wanted to make sure models do not learn this naively. The rest of the loss functions are suited for regression tasks and used normally to enforce the predicted value to be close to the target.

In [15]:
class LogCoshLoss(nn.Module):
    """Mean log-cosh of the residual divided by 3.

    ``log(cosh(x))`` is evaluated as ``|x| + softplus(-2|x|) - ln(2)``, which is
    mathematically identical but never forms ``cosh(x)`` itself, so large
    residuals no longer overflow to ``inf``.
    """

    _LN_2 = 0.6931471805599453  # natural logarithm of 2

    def __init__(self):
        super().__init__()

    def forward(self, y_prime_t, y_t):
        """Return the scalar loss between prediction ``y_prime_t`` and target ``y_t``."""
        # Residual scaling and epsilon are kept from the original formulation so the
        # loss values stay on the same scale.
        ey_t = (y_t - y_prime_t)/3
        magnitude = torch.abs(ey_t + 1e-12)
        return torch.mean(magnitude + F.softplus(-2.0 * magnitude) - self._LN_2)

In [16]:
# Layer sizes that depend on the feature scheme ('initial', 'light' or 'heavy').
# 'conv': input width of the first Linear layer that follows Conv.conv_block.
# 'rnn' / 'input_shape': [steps, features_per_step] used by LSTM/GRU to reshape the
#     flat input vector.
# 'rnn' / 'linear': input width of the first Linear layer of LSTM/GRU; each value
#     equals steps * 128 + 2 * 128 (all step outputs plus the final hidden state of
#     both recurrent layers, hidden size 128).
dims_dict = {'conv': {'heavy': 13400, 'light': 4576, 'initial': 8992},
                                    'rnn': {'linear': {'heavy': 99968, 'light': 24192, 'initial': 29568},
                                           'input_shape': {'heavy': [779,142], 'light': [187,202], 'initial': [229,324]}
                                           }}

## Modeling

In [17]:
class Conv(nn.Module):
    """1-D convolutional regressor over the flat feature vector.

    Called with inputs only it returns predictions of width 18211; called with
    inputs and targets it returns the combined training loss.
    """

    def __init__(self, scheme):
        """Build the network for feature scheme ``scheme`` (a key of ``dims_dict['conv']``)."""
        super().__init__()
        self.name = 'Conv'
        self.scheme = scheme

        # Layers are created in the same order as before so parameter
        # initialisation and state-dict keys are unchanged.
        conv_layers = [
            nn.Conv1d(1, 8, 5, stride=1, padding=0),
            nn.Dropout(0.3),
            nn.Conv1d(8, 8, 5, stride=1, padding=0),
            nn.ReLU(),
            nn.Conv1d(8, 16, 5, stride=2, padding=0),
            nn.Dropout(0.3),
            nn.AvgPool1d(11),
            nn.Conv1d(16, 8, 3, stride=3, padding=0),
            nn.Flatten(),
        ]
        self.conv_block = nn.Sequential(*conv_layers)

        dense_layers = [
            nn.Linear(dims_dict['conv'][self.scheme], 1024),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.Dropout(0.3),
            nn.ReLU(),
        ]
        self.linear = nn.Sequential(*dense_layers)
        self.head1 = nn.Linear(512, 18211)

        self.loss1 = nn.MSELoss()
        self.loss2 = LogCoshLoss()
        self.loss3 = nn.L1Loss()
        self.loss4 = nn.BCELoss()

    def forward(self, x, y=None):
        """Return predictions when ``y`` is None, otherwise the training loss."""
        out = self.head1(self.linear(self.conv_block(x)))
        if y is None:
            return out
        # Regression part: weighted MSE, log-cosh and L1 terms.
        regression_loss = 0.4*self.loss1(out, y) + 0.3*self.loss2(out, y) + 0.3*self.loss3(out, y)
        # Binary cross-entropy between the sigmoid of prediction and target.
        bce_loss = self.loss4(torch.sigmoid(out), torch.sigmoid(y))
        return 0.8*regression_loss + 0.2*bce_loss
        

class LSTM(nn.Module):
    """Two-layer LSTM regressor over the feature vector reshaped into a sequence.

    Called with inputs only it returns predictions of width 18211; called with
    inputs and targets it returns the combined training loss.
    """

    def __init__(self, scheme):
        """Build the network for feature scheme ``scheme`` (a key of ``dims_dict['rnn']`` entries)."""
        super().__init__()
        self.name = 'LSTM'
        self.scheme = scheme

        features_per_step = dims_dict['rnn']['input_shape'][self.scheme][1]
        self.lstm = nn.LSTM(features_per_step, 128, num_layers=2, batch_first=True)

        dense_layers = [
            nn.Linear(dims_dict['rnn']['linear'][self.scheme], 1024),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.Dropout(0.3),
            nn.ReLU(),
        ]
        self.linear = nn.Sequential(*dense_layers)
        self.head1 = nn.Linear(512, 18211)

        self.loss1 = nn.MSELoss()
        self.loss2 = LogCoshLoss()
        self.loss3 = nn.L1Loss()
        self.loss4 = nn.BCELoss()

    def forward(self, x, y=None):
        """Return predictions when ``y`` is None, otherwise the training loss."""
        steps, features_per_step = dims_dict['rnn']['input_shape'][self.scheme]
        x = x.reshape(x.shape[0], steps, features_per_step)

        out, (hn, _) = self.lstm(x)
        step_outputs = out.reshape(out.shape[0], -1)
        # NOTE(review): hn is indexed as (layers, batch, hidden) here; reshaping it to
        # (batch, -1) without first moving the batch axis appears to mix hidden
        # states of different samples. Left unchanged because saved checkpoints were
        # trained with exactly this layout — confirm before changing.
        final_hidden = hn.reshape(hn.shape[1], -1)
        out = self.head1(self.linear(torch.cat([step_outputs, final_hidden], dim=1)))
        if y is None:
            return out

        regression_loss = 0.4*self.loss1(out, y) + 0.3*self.loss2(out, y) + 0.3*self.loss3(out, y)
        bce_loss = self.loss4(torch.sigmoid(out), torch.sigmoid(y))
        return 0.8*regression_loss + 0.2*bce_loss
        
        
class GRU(nn.Module):
    """Two-layer GRU regressor over the feature vector reshaped into a sequence.

    Called with inputs only it returns predictions of width 18211; called with
    inputs and targets it returns the combined training loss.
    """

    def __init__(self, scheme):
        """Build the network for feature scheme ``scheme`` (a key of ``dims_dict['rnn']`` entries)."""
        super().__init__()
        self.name = 'GRU'
        self.scheme = scheme

        features_per_step = dims_dict['rnn']['input_shape'][self.scheme][1]
        self.gru = nn.GRU(features_per_step, 128, num_layers=2, batch_first=True)

        dense_layers = [
            nn.Linear(dims_dict['rnn']['linear'][self.scheme], 1024),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.Dropout(0.3),
            nn.ReLU(),
        ]
        self.linear = nn.Sequential(*dense_layers)
        self.head1 = nn.Linear(512, 18211)

        self.loss1 = nn.MSELoss()
        self.loss2 = LogCoshLoss()
        self.loss3 = nn.L1Loss()
        self.loss4 = nn.BCELoss()

    def forward(self, x, y=None):
        """Return predictions when ``y`` is None, otherwise the training loss."""
        steps, features_per_step = dims_dict['rnn']['input_shape'][self.scheme]
        x = x.reshape(x.shape[0], steps, features_per_step)

        out, hn = self.gru(x)
        step_outputs = out.reshape(out.shape[0], -1)
        # NOTE(review): hn is indexed as (layers, batch, hidden) here; reshaping it to
        # (batch, -1) without first moving the batch axis appears to mix hidden
        # states of different samples. Left unchanged because saved checkpoints were
        # trained with exactly this layout — confirm before changing.
        final_hidden = hn.reshape(hn.shape[1], -1)
        out = self.head1(self.linear(torch.cat([step_outputs, final_hidden], dim=1)))
        if y is None:
            return out

        regression_loss = 0.4*self.loss1(out, y) + 0.3*self.loss2(out, y) + 0.3*self.loss3(out, y)
        bce_loss = self.loss4(torch.sigmoid(out), torch.sigmoid(y))
        return 0.8*regression_loss + 0.2*bce_loss

## Create Dataset Class

In [18]:
class Dataset:
    """Minimal map-style dataset over inputs and, optionally, targets."""

    def __init__(self, data_x, data_y=None):
        """Store the inputs ``data_x`` and the optional, index-aligned targets ``data_y``."""
        super().__init__()
        self.data_x, self.data_y = data_x, data_y

    def __len__(self):
        """Number of samples, taken from the inputs."""
        return len(self.data_x)

    def __getitem__(self, idx):
        """Return ``x[idx]`` alone, or ``(x[idx], y[idx])`` when targets were given."""
        sample = self.data_x[idx]
        if self.data_y is None:
            return sample
        return sample, self.data_y[idx]

## Define 2nd Data Augmentation Function
In the following function, we augment the training data by randomly dropping 30% of our 1-dimensional input feature vectors' entries. Input features are of shape (batch, 1, d)

In [19]:
import random
def augment_data(x_, y_):
    """Return a copy of ``x_`` with 30% of each sample's entries zeroed, and a copy of ``y_``.

    ``x_`` is indexed as (sample, channel, feature). For every sample a fresh set
    of ``int(0.3 * n_features)`` feature positions is drawn with Python's
    ``random`` module and zeroed across all channels. The inputs are not modified.
    """
    masked = x_.copy()
    n_features = x_.shape[2]
    n_dropped = int(0.3 * n_features)
    augmented_rows = []
    for sample_idx in range(x_.shape[0]):
        dropped_positions = random.sample(range(n_features), k=n_dropped)
        masked[sample_idx, :, dropped_positions] = 0
        augmented_rows.append(masked[sample_idx])
    return np.stack(augmented_rows, axis=0), y_.copy()

## Define Helper and Main Training Functions
GRU experienced numerical overflow with a learning rate of 0.001, so I used 0.0003 instead

In [20]:
def train_step(dataloader, model, opt, clip_norm):
    """Run one training epoch and return the mean batch loss.

    Args:
        dataloader: Iterable of ``(x, target)`` tensor batches.
        model: Module whose ``model(x, target)`` call returns a scalar loss.
        opt: Optimizer over the model's parameters.
        clip_norm: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Mean of the per-batch loss values.
    """
    model.train()
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # Move the model once per epoch instead of once per batch.
        model.cuda()
    train_losses = []
    for x, target in dataloader:
        if use_cuda:
            x = x.cuda()
            target = target.cuda()
        loss = model(x, target)
        train_losses.append(loss.item())
        opt.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), clip_norm)
        opt.step()
    return np.mean(train_losses)

def validation_step(dataloader, model):
    """Evaluate the model and return ``(mean loss, mean MRRMSE)`` over the batches.

    Both values are plain means of the per-batch figures, so a smaller final batch
    carries the same weight as a full one.

    Args:
        dataloader: Iterable of ``(x, target)`` tensor batches.
        model: Module returning a scalar loss for ``model(x, target)`` and
            predictions for ``model(x)``.
    """
    model.eval()
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
    val_losses = []
    val_mrrmse = []
    # No gradients are needed for evaluation; skipping them avoids building two
    # autograd graphs per batch.
    with torch.no_grad():
        for x, target in dataloader:
            if use_cuda:
                x = x.cuda()
                target = target.cuda()
            loss = model(x, target)
            pred = model(x).detach().cpu().numpy()
            val_mrrmse.append(mrrmse_np(pred, target.cpu().numpy()))
            val_losses.append(loss.item())
    return np.mean(val_losses), np.mean(val_mrrmse)


def train_function(model, x_train, y_train, x_val, y_val, epochs=20, clip_norm=1.0):
    """Train ``model`` and return it with the weights of its best validation epoch.

    The training set is doubled by appending one augmented copy (see
    ``augment_data``). After every epoch the validation MRRMSE is computed and the
    weights of the best epoch so far are snapshotted; they are restored at the end.

    Args:
        model: One of the notebook's models (needs a ``name`` attribute).
        x_train, y_train: Training inputs and targets as NumPy arrays.
        x_val, y_val: Validation inputs and targets as NumPy arrays.
        epochs: Number of training epochs.
        clip_norm: Gradient-norm clipping threshold.

    Returns:
        The same ``model`` object, loaded with the best-epoch weights (or left as
        trained when no epoch produced a finite improvement).
    """
    if model.name in ['GRU']:
        print('lr', 0.0003)
        opt = torch.optim.Adam(model.parameters(), lr=0.0003)
    else:
        opt = torch.optim.Adam(model.parameters(), lr=0.001)
    # Only move to the GPU when there is one, matching train_step/validation_step.
    if torch.cuda.is_available():
        model.cuda()
    x_train_aug, y_train_aug = augment_data(x_train, y_train)
    x_train_aug = np.concatenate([x_train, x_train_aug], axis=0)
    y_train_aug = np.concatenate([y_train, y_train_aug], axis=0)
    data_x_train = torch.FloatTensor(x_train_aug)
    data_y_train = torch.FloatTensor(y_train_aug)
    data_x_val = torch.FloatTensor(x_val)
    data_y_val = torch.FloatTensor(y_val)
    train_dataloader = DataLoader(Dataset(data_x_train, data_y_train), num_workers=4, batch_size=16, shuffle=True)
    val_dataloader = DataLoader(Dataset(data_x_val, data_y_val), num_workers=4, batch_size=32, shuffle=False)
    best_loss = np.inf
    best_weights = None
    for e in range(epochs):
        loss = train_step(train_dataloader, model, opt, clip_norm)
        val_loss, val_mrrmse = validation_step(val_dataloader, model)
        if val_mrrmse < best_loss:
            best_loss = val_mrrmse
            # state_dict() hands out references to the live tensors, so each tensor
            # is cloned; otherwise later epochs would overwrite the snapshot.
            best_weights = {key: value.detach().clone() for key, value in model.state_dict().items()}
            print('BEST ----> ')
        print(f"{model.name} Epoch {e}, train_loss {round(loss,3)}, val_loss {round(val_loss, 3)}, val_mrrmse {val_mrrmse}")
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return model

In [21]:
from sklearn.model_selection import KFold as KF
# Number of cross-validation folds; also used in the progress message of
# cross_validate_models.
splits = 5
# Shuffled K-fold splitter with a fixed seed so the folds are reproducible.
kf_cv = KF(n_splits=splits, shuffle=True, random_state=42)

In [22]:
def cross_validate_models(X, y, epochs=120, scheme='initial', clip_norm=1.0):
    """Train an LSTM, a Conv and a GRU model on every fold of ``kf_cv``.

    Each trained model is moved to the CPU and its weights are written to
    ``pytorch_<name>_<scheme>_fold<i>.pt`` in the working directory.

    Args:
        X: Input array indexed by sample along axis 0.
        y: Targets with a ``.values`` attribute (e.g. a DataFrame).
        epochs: Training epochs per model.
        scheme: Feature scheme forwarded to the model constructors.
        clip_norm: Gradient-norm clipping threshold.

    Returns:
        List of trained models ordered fold by fold, and within each fold as
        LSTM, Conv, GRU.
    """
    architectures = (LSTM, Conv, GRU)
    targets = y.values
    fitted_models = []
    for fold, (train_idx, val_idx) in enumerate(kf_cv.split(X)):
        print(f"\nSplit {fold+1}/{splits}...")
        x_train, y_train = X[train_idx], targets[train_idx]
        x_val, y_val = X[val_idx], targets[val_idx]
        for Model in architectures:
            fitted = train_function(Model(scheme), x_train, y_train, x_val, y_val,
                                    epochs=epochs, clip_norm=clip_norm)
            fitted.to('cpu')
            fitted_models.append(fitted)
            torch.cuda.empty_cache()
            torch.save(fitted.state_dict(), f'pytorch_{fitted.name}_{scheme}_fold{fold}.pt')
    return fitted_models

## Define Inference Functions

In [23]:
def inference_pytorch(model, dataloader):
    """Predict every batch of ``dataloader`` and return the stacked predictions.

    The model is put in eval mode, moved to the GPU when one is available, and
    moved back to the CPU afterwards.

    Args:
        model: Module whose ``model(x)`` call returns a prediction tensor.
        dataloader: Iterable of input tensor batches.

    Returns:
        NumPy array with the batch predictions concatenated along axis 0.
    """
    model.eval()
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
    preds = []
    # Inference needs no gradients; skipping them saves memory and time.
    with torch.no_grad():
        for x in dataloader:
            if use_cuda:
                x = x.cuda()
            preds.append(model(x).detach().cpu().numpy())
    model.to('cpu')
    torch.cuda.empty_cache()
    return np.concatenate(preds, axis=0)

In [24]:
def average_prediction(X_test, trained_models):
    """Return the unweighted mean of the predictions of all ``trained_models`` on ``X_test``."""
    loader = DataLoader(Dataset(torch.FloatTensor(X_test)), num_workers=4, batch_size=64, shuffle=False)
    per_model = [inference_pytorch(model, loader) for model in trained_models]
    # Stack along a new model axis (axis 1) and average over it.
    return np.stack(per_model, axis=1).mean(axis=1)

In [25]:
def weighted_average_prediction(X_test, trained_models, model_wise=[0.25, 0.35, 0.40], fold_wise=None):
    """Return a weighted sum of the predictions of ``trained_models`` on ``X_test``.

    The model at position ``i`` is weighted by ``model_wise[i % 3]`` and, when
    ``fold_wise`` is given, additionally by ``fold_wise[i // 3]``. This assumes the
    models are ordered in groups of three per fold.
    """
    loader = DataLoader(Dataset(torch.FloatTensor(X_test)), num_workers=4, batch_size=64, shuffle=False)
    weighted = []
    for position, model in enumerate(trained_models):
        fold_idx, arch_idx = divmod(position, 3)
        contribution = model_wise[arch_idx]*inference_pytorch(model, loader)
        if fold_wise:
            contribution = fold_wise[fold_idx]*contribution
        weighted.append(contribution)
    return np.stack(weighted, axis=1).sum(axis=1)

In [26]:
def reproduce(epochs=1):
    """Retrain every model from scratch for the three feature schemes.

    The training feature vectors are built here with the same feature lists that
    are used for the test vectors of each scheme, so the function no longer
    depends on variables that are not defined in the notebook.

    Args:
        epochs: Training epochs per model.

    Returns:
        Dict mapping 'initial', 'light' and 'heavy' to the list of models returned
        by ``cross_validate_models`` for that scheme.
    """
    train_chem = [train_chem_feat, train_chem_feat_mean]
    # scheme -> (gradient clip norm, augmentation frames, quantile frame or None)
    recipes = {
        'initial': (5.0, [mean_cell_type, std_cell_type, mean_sm_name, std_sm_name], None),
        'light': (1.0, [mean_cell_type, mean_sm_name], None),
        'heavy': (1.0, [desc_cell_type, mean_cell_type, mean_sm_name], desc_cell_type),
    }
    trained_models = {'initial': [], 'light': [], 'heavy': []}
    for scheme in ['initial', 'light', 'heavy']:
        clip_norm, aug_frames, quantiles = recipes[scheme]
        # Built one scheme at a time so only one training matrix is alive at once.
        input_features = combine_features(aug_frames, train_chem, de_train, X, quantiles)
        seed_everything()
        models = cross_validate_models(input_features, y, epochs=epochs, scheme=scheme, clip_norm=clip_norm)
        trained_models[scheme].extend(models)
    return trained_models

In [27]:
def load_trained_models(path="/kaggle/input/best-models-single-cell/", kf_n_splits=5):
    """Load the saved weights of every (scheme, fold, architecture) combination.

    A weights file matches when its name contains the model name, the scheme and
    ``fold<k>``. File names are scanned in sorted order and the first match is
    used, so the result does not depend on directory listing order.

    Args:
        path: Directory containing the weight files.
        kf_n_splits: Number of folds to load per scheme.

    Returns:
        Dict mapping each scheme to its models, ordered fold by fold and within
        each fold as LSTM, Conv, GRU.

    Raises:
        FileNotFoundError: If no file matches a combination. Skipping it silently
            would shift the position-based weights applied by
            ``weighted_average_prediction``.
    """
    weight_files = sorted(os.listdir(path))
    trained_models = {'initial': [], 'light': [], 'heavy': []}
    for scheme in ['initial', 'light', 'heavy']:
        for fold in range(kf_n_splits):
            for Model in [LSTM, Conv, GRU]:
                model = Model(scheme)
                matches = [name for name in weight_files
                           if model.name in name and scheme in name and f'fold{fold}' in name]
                if not matches:
                    raise FileNotFoundError(
                        f"no weights file for {model.name}/{scheme}/fold{fold} in {path}")
                # NOTE(review): torch.load unpickles the file; only load weights from
                # a trusted source.
                state = torch.load(os.path.join(path, matches[0]), map_location='cpu')
                model.load_state_dict(state)
                trained_models[scheme].append(model)
    return trained_models

In [28]:
# Load the saved weights; the commented-out call at the end of the line is the
# alternative that retrains everything from scratch.
trained_models = load_trained_models()#reproduce(epochs=250)

In [29]:
# Per-architecture weights, applied by position within each group of three models.
model_weights = [0.29, 0.33, 0.38]
# Per-fold weights, one entry per group of three models.
fold_weights = [0.25, 0.15, 0.2, 0.15, 0.25]

In [30]:
# 'light' scheme: plain mean (pred1) and weighted sum (pred2) over its models.
pred1 = average_prediction(test_vec_light, trained_models['light'])
pred2 = weighted_average_prediction(test_vec_light, trained_models['light'],\
                                        model_wise=model_weights, fold_wise=fold_weights)

In [31]:
# 'initial' scheme: plain mean (pred3) and weighted sum (pred4) over its models.
pred3 = average_prediction(test_vec, trained_models['initial'])
pred4 = weighted_average_prediction(test_vec, trained_models['initial'],\
                                        model_wise=model_weights, fold_wise=fold_weights)

In [32]:
# 'heavy' scheme: plain mean (pred5) and weighted sum (pred6) over its models.
pred5 = average_prediction(test_vec_heavy, trained_models['heavy'])
pred6 = weighted_average_prediction(test_vec_heavy, trained_models['heavy'],\
                                    model_wise=model_weights, fold_wise=fold_weights)

## Read Submission Sample File

In [33]:
# Target column names: every de_train column from position 5 onwards.
col = list(de_train.columns[5:])
# Work on a copy so the template read from disk stays untouched.
submission = sample_submission.copy()

## Ensemble Prediction

In [34]:
# Blend 1 of the six prediction sets (coefficients sum to 1).
submission[col] = 0.23*pred1 + 0.15*pred2 + 0.18*pred3 + 0.15*pred4 + 0.15*pred5 + 0.14*pred6
# Snapshot, because `submission` is overwritten by the next blend.
df1 = submission.copy()

In [35]:
# Blend 2 of the six prediction sets (coefficients sum to 1).
submission[col] =  0.13*pred1 + 0.15*pred2 + 0.23*pred3 + 0.15*pred4 + 0.20*pred5 + 0.14*pred6
# Snapshot, because `submission` is overwritten by the next blend.
df2 = submission.copy()

In [36]:
# Blend 3 of the six prediction sets (coefficients sum to 1).
submission[col] = 0.17*pred1 + 0.16*pred2 + 0.17*pred3 + 0.16*pred4 + 0.18*pred5 + 0.16*pred6
df3 = submission.copy()

In [37]:
# Final submission: near-equal mix of the three blends (coefficients sum to 1).
df_sub = 0.34*df1 + 0.33*df2 + 0.33*df3

## Save Submission Dataframe

In [38]:
# Display the blended submission frame for a visual sanity check.
df_sub

Unnamed: 0_level_0,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,AAK1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.004891,0.257333,0.350504,0.783664,0.991103,0.424397,-0.072302,0.096655,-0.235352,0.644939,...,-1.231125,-0.238978,-0.240077,0.218761,0.024762,0.433813,0.391118,0.399233,-0.637794,0.134152
1,0.086987,0.100221,0.116760,0.225081,0.657403,0.378152,-0.036609,0.145430,0.148688,0.112047,...,-0.192942,-0.010191,-0.171862,0.278123,0.139727,0.177368,0.199524,0.093824,-0.228720,-0.072083
2,0.570574,0.200264,0.552647,0.946792,2.579547,1.750001,-0.010679,0.386336,0.392121,0.505168,...,-0.422660,0.097118,-0.036983,0.778453,0.439742,0.460379,0.526051,0.231570,-0.365261,0.043398
3,0.027916,0.101314,0.084691,0.190825,0.442343,0.189132,-0.066829,0.086405,0.063511,0.107463,...,-0.258952,-0.022202,-0.199640,0.189844,0.081758,0.166348,0.170276,0.114126,-0.152267,-0.062795
4,0.011895,0.100863,0.114376,0.293312,0.614760,0.323282,-0.060771,0.133719,0.081242,0.245043,...,-0.485432,-0.061439,-0.269130,0.220721,0.064593,0.232333,0.234596,0.163050,-0.304614,-0.021434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,0.071806,-0.015070,-0.270762,-0.305674,0.309278,0.096855,0.105902,0.133263,0.088458,0.152961,...,0.041829,-0.111343,-0.377440,-0.090412,0.057765,-0.008004,-0.155536,-0.185297,-0.375359,-0.172615
251,0.353880,0.058280,0.146316,0.051660,2.291123,0.885869,0.209340,-0.039648,0.161889,0.259124,...,-0.476890,-0.376750,-0.120565,-0.399312,0.457138,0.096086,-0.215959,-0.071947,-0.139824,-0.009215
252,0.070027,0.009405,-0.093254,-0.231917,0.406057,0.149935,0.069441,0.123252,0.105856,0.171478,...,-0.045171,-0.102367,-0.168883,-0.061508,0.109178,0.048182,-0.106359,-0.132385,-0.179049,-0.112557
253,0.769563,1.120203,-4.138731,0.050159,3.966650,2.874411,0.706871,-0.120767,0.327062,1.393089,...,-0.077082,-0.236828,-4.114982,-0.373702,0.234885,-0.191915,-1.683750,-0.063814,0.515996,-0.404022


In [40]:
# Write the submission; the 'id' index is written as the first CSV column.
df_sub.to_csv('submission.csv')