In [1]:
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import GroupKFold
from functions_refactor import RAdam
from functions_cnn import *

In [2]:
# --- Experiment configuration (everything a reader might tune) ---

# Coupling type handled by this run; it is passed to process_data and is
# embedded in the per-fold column names and the OOF file names below.
type_ = '2JHN'

# Model class (instantiated once per fold) and the image directory.
# NOTE(review): root_dir embeds the coupling type in its path; presumably it
# has to be changed together with type_ - confirm.
model_struct = CNN2
root_dir = "../Data/full-images-2jhn/Image_2JHN/"

# DataLoader settings.
batch_size, num_workers = 512, 4

# Training settings forwarded to train_cnn.
# NOTE(review): `clip` is presumably a gradient-clipping threshold - confirm
# against train_cnn.
n_epochs, clip = 50, 2

In [3]:
# Read the raw train/test tables, then let process_data derive the id frames
# used for this coupling type.
# NOTE(review): judging by the warnings this cell emits, process_data adds a
# `file_name` column (molecule_name + '_' + id) to both returned frames, and
# presumably restricts them to rows of `type_` - confirm in functions_cnn.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/train.csv', '../Data/test.csv')
)
train_ids, test_ids = process_data(train_df, test_df, type_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train_ids['file_name'] = train_ids.molecule_name.str.cat(train_ids.id.astype(str),sep='_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  test_ids['file_name'] = test_ids.molecule_name.str.cat(test_ids.id.astype(str),sep='_')


In [4]:
# 5-fold cross-validation, grouped by molecule so that all rows of one
# molecule land on the same side of every train/validation split.
# This cell trains fold 0 only (see the `break` at the end of the loop body).
group_kfold = GroupKFold(n_splits=5)

# One device string for the model and for every batch, so they cannot end up
# on different GPUs.
device = 'cuda'

# The test loader does not depend on the fold: build it once, outside the loop.
test_ds = CustomImageDataset(test_ids, root_dir, transform=None, IsTrain=False)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers)

for n_fold, (train_idx, valid_idx) in enumerate(group_kfold.split(train_ids, groups=train_ids['molecule_name'])):
    print('\nstart fold: {}'.format(n_fold))
    fold_col = 'fold{}_{}'.format(n_fold, type_)

    # set-up data (distinct names for frame / dataset / loader)
    valid_ids = train_ids.iloc[valid_idx]
    train_ds = CustomImageDataset(train_ids.iloc[train_idx], root_dir, transform=None, IsTrain=True)
    valid_ds = CustomImageDataset(valid_ids, root_dir, transform=None, IsTrain=True)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    # shuffle=False keeps predictions in the same order as valid_ids
    valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # train model
    # No loss object is built here: train_cnn is not passed one.
    model = model_struct().to(device)
    optimizer = RAdam(model.parameters(), lr=0.0001, weight_decay=1e-2)
    scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=5)
    model = train_cnn(model, optimizer, train_dl, valid_dl, n_epochs, clip, scheduler)

    # predict oof; each batch is moved to the CPU right away so predictions
    # do not pile up in GPU memory
    model.eval()
    oof_batches = []
    with torch.no_grad():
        for x_torch, _ in valid_dl:
            oof_batches.append(model(x_torch.to(device)).squeeze(1).cpu())
    yhat = torch.cat(oof_batches).numpy()

    assert yhat.shape[0] == valid_ids.shape[0], 'yhat and valid_ids should have same shape'
    # rows of train_df outside this validation fold get NaN in this column
    train_df[fold_col] = train_df.id.map(dict(zip(valid_ids['id'].values, yhat)))

    # predict test (model is still in eval mode)
    test_batches = []
    with torch.no_grad():
        for data_torch in test_dl:
            test_batches.append(model(data_torch.to(device)).squeeze(1).cpu())
    yhat = torch.cat(test_batches).numpy()

    # join
    assert yhat.shape[0] == test_ids.shape[0], 'yhat and test_id should have same shape'
    test_df[fold_col] = test_df.id.map(dict(zip(test_ids['id'].values, yhat)))

    # only fold 0 is trained in this cell; the remaining folds run in the next one
    break


start fold: 0
Epoch: 1 	Training Loss: 1.641698 	Validation Loss: 0.434549
Epoch: 2 	Training Loss: 0.315032 	Validation Loss: 0.291733
Epoch: 3 	Training Loss: 0.207307 	Validation Loss: 0.175524
Epoch: 4 	Training Loss: 0.149404 	Validation Loss: 0.141235
Epoch: 5 	Training Loss: 0.111764 	Validation Loss: 0.118738
Epoch: 6 	Training Loss: 0.094020 	Validation Loss: 0.083164
Epoch: 7 	Training Loss: 0.078700 	Validation Loss: 0.126344
Epoch: 8 	Training Loss: 0.066964 	Validation Loss: 0.070978
Epoch: 9 	Training Loss: 0.060208 	Validation Loss: 0.059814
Epoch: 10 	Training Loss: 0.054800 	Validation Loss: 0.061182
Epoch: 11 	Training Loss: 0.052374 	Validation Loss: 0.056253
Epoch: 12 	Training Loss: 0.047273 	Validation Loss: 0.067774
Epoch: 13 	Training Loss: 0.042804 	Validation Loss: 0.044480
Epoch: 14 	Training Loss: 0.037071 	Validation Loss: 0.046309
Epoch: 15 	Training Loss: 0.037755 	Validation Loss: 0.061434
Epoch: 16 	Training Loss: 0.030628 	Validation Loss: 0.040354
Ep

In [17]:
# Same grouped 5-fold split as the fold-0 cell, but fold 0 is skipped and
# folds 1..4 are trained. GroupKFold has no random state, so the splits are
# the same as in the fold-0 run as long as train_ids is unchanged.
group_kfold = GroupKFold(n_splits=5)

# One device string for the model and for every batch, so they cannot end up
# on different GPUs.
device = 'cuda'

# The test loader does not depend on the fold: build it once, outside the loop.
test_ds = CustomImageDataset(test_ids, root_dir, transform=None, IsTrain=False)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers)

for n_fold, (train_idx, valid_idx) in enumerate(group_kfold.split(train_ids, groups=train_ids['molecule_name'])):
    print('\nstart fold: {}'.format(n_fold))
    # fold 0 is not retrained here; its columns are expected to be present
    # in train_df/test_df already from the fold-0 cell
    if n_fold == 0:
        continue
    fold_col = 'fold{}_{}'.format(n_fold, type_)

    # set-up data (distinct names for frame / dataset / loader)
    valid_ids = train_ids.iloc[valid_idx]
    train_ds = CustomImageDataset(train_ids.iloc[train_idx], root_dir, transform=None, IsTrain=True)
    valid_ds = CustomImageDataset(valid_ids, root_dir, transform=None, IsTrain=True)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    # shuffle=False keeps predictions in the same order as valid_ids
    valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # train model
    # No loss object is built here: train_cnn is not passed one.
    model = model_struct().to(device)
    optimizer = RAdam(model.parameters(), lr=0.0001, weight_decay=1e-2)
    scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=5)
    model = train_cnn(model, optimizer, train_dl, valid_dl, n_epochs, clip, scheduler)

    # predict oof; each batch is moved to the CPU right away so predictions
    # do not pile up in GPU memory
    model.eval()
    oof_batches = []
    with torch.no_grad():
        for x_torch, _ in valid_dl:
            oof_batches.append(model(x_torch.to(device)).squeeze(1).cpu())
    yhat = torch.cat(oof_batches).numpy()

    assert yhat.shape[0] == valid_ids.shape[0], 'yhat and valid_ids should have same shape'
    # rows of train_df outside this validation fold get NaN in this column
    train_df[fold_col] = train_df.id.map(dict(zip(valid_ids['id'].values, yhat)))

    # predict test (model is still in eval mode)
    test_batches = []
    with torch.no_grad():
        for data_torch in test_dl:
            test_batches.append(model(data_torch.to(device)).squeeze(1).cpu())
    yhat = torch.cat(test_batches).numpy()

    # join
    assert yhat.shape[0] == test_ids.shape[0], 'yhat and test_id should have same shape'
    test_df[fold_col] = test_df.id.map(dict(zip(test_ids['id'].values, yhat)))


start fold: 0

start fold: 1
Epoch: 1 	Training Loss: 1.762554 	Validation Loss: 0.428096
Epoch: 2 	Training Loss: 0.288497 	Validation Loss: 0.221795
Epoch: 3 	Training Loss: 0.190965 	Validation Loss: 0.194087
Epoch: 4 	Training Loss: 0.136508 	Validation Loss: 0.123796
Epoch: 5 	Training Loss: 0.109118 	Validation Loss: 0.114164
Epoch: 6 	Training Loss: 0.099961 	Validation Loss: 0.092920
Epoch: 7 	Training Loss: 0.083005 	Validation Loss: 0.161890
Epoch: 8 	Training Loss: 0.071006 	Validation Loss: 0.099755
Epoch: 9 	Training Loss: 0.062941 	Validation Loss: 0.062690
Epoch: 10 	Training Loss: 0.057434 	Validation Loss: 0.061134
Epoch: 11 	Training Loss: 0.052759 	Validation Loss: 0.079954
Epoch: 12 	Training Loss: 0.050826 	Validation Loss: 0.052246
Epoch: 13 	Training Loss: 0.043867 	Validation Loss: 0.055242
Epoch: 14 	Training Loss: 0.036940 	Validation Loss: 0.080921
Epoch: 15 	Training Loss: 0.038753 	Validation Loss: 0.044290
Epoch: 16 	Training Loss: 0.031092 	Validation Lo

Epoch: 32 	Training Loss: 0.014614 	Validation Loss: 0.030639
Epoch: 33 	Training Loss: 0.014970 	Validation Loss: 0.030902
Epoch: 34 	Training Loss: 0.012469 	Validation Loss: 0.031075
Epoch: 35 	Training Loss: 0.011373 	Validation Loss: 0.034365
Epoch: 36 	Training Loss: 0.009821 	Validation Loss: 0.028747
Epoch: 37 	Training Loss: 0.011507 	Validation Loss: 0.032274
Epoch: 38 	Training Loss: 0.011638 	Validation Loss: 0.033440
Epoch: 39 	Training Loss: 0.012474 	Validation Loss: 0.029723
Epoch: 40 	Training Loss: 0.010087 	Validation Loss: 0.030158
Epoch: 41 	Training Loss: 0.009575 	Validation Loss: 0.032488
Epoch: 42 	Training Loss: 0.010853 	Validation Loss: 0.039610
Epoch: 43 	Training Loss: 0.006540 	Validation Loss: 0.026055
Epoch: 44 	Training Loss: 0.004509 	Validation Loss: 0.025361
Epoch: 45 	Training Loss: 0.003549 	Validation Loss: 0.025784
Epoch: 46 	Training Loss: 0.003941 	Validation Loss: 0.025807
Epoch: 47 	Training Loss: 0.004034 	Validation Loss: 0.027689
Epoch: 4

In [18]:
# Collapse the per-fold prediction columns into a single 'yhat' column and
# persist both frames.
#
# Fold columns are picked by name ('fold<k>_<type_>') instead of by position,
# so the result does not depend on the column layout and re-running the cell
# never folds an existing 'yhat' column back into the mean.
# DataFrame.mean skips NaN by default and yields NaN for rows without any
# prediction, without the warning np.nanmean raises on all-NaN rows.
fold_suffix = '_' + type_

test_fold_cols = [c for c in test_df.columns
                  if str(c).startswith('fold') and str(c).endswith(fold_suffix)]
assert test_fold_cols, 'no fold prediction columns found in test_df'
test_df['yhat'] = test_df[test_fold_cols].mean(axis=1)
# NOTE(review): the output name has no '.csv' extension; kept unchanged in
# case something downstream reads this exact path.
test_df.to_csv('../Data/test_oof_'+type_,index=False)

train_fold_cols = [c for c in train_df.columns
                   if str(c).startswith('fold') and str(c).endswith(fold_suffix)]
assert train_fold_cols, 'no fold prediction columns found in train_df'
train_df['yhat'] = train_df[train_fold_cols].mean(axis=1)
train_df.to_csv('../Data/train_oof_'+type_,index=False)

In [21]:
# Recompute the out-of-fold prediction for train rows.
# Columns are selected by name: a positional slice (iloc[:, 6:]) would also
# include a 'yhat' column left over from a previous run of this computation.
# DataFrame.mean skips NaN and returns NaN for rows with no fold prediction.
train_fold_cols = [c for c in train_df.columns
                   if str(c).startswith('fold') and str(c).endswith('_' + type_)]
assert train_fold_cols, 'no fold prediction columns found in train_df'
train_df['yhat'] = train_df[train_fold_cols].mean(axis=1)

  """Entry point for launching an IPython kernel.


In [23]:
# Keep only the train rows that actually received an out-of-fold prediction.
has_prediction = train_df['yhat'].notnull()
train_df2 = train_df[has_prediction]

In [24]:
# (rows, columns) of the frame restricted to rows with a prediction
train_df2.shape

(119253, 12)

In [26]:
# Natural log of the mean absolute error between target and OOF prediction.
abs_err = (train_df2['scalar_coupling_constant'] - train_df2['yhat']).abs()
np.log(abs_err.mean())

-1.8672321232564857

In [27]:
# Mean absolute error between target and OOF prediction (no log).
abs_err = train_df2['scalar_coupling_constant'].sub(train_df2['yhat']).abs()
abs_err.mean()

0.15455084804097682

In [32]:
# Preview only the first rows instead of dumping the whole frame.
test_df.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,fold0_2JHN,fold1_2JHN,fold2_2JHN,fold3_2JHN,fold4_2JHN,yhat
0,4658147,dsgdb9nsd_000004,2,0,2JHC,,,,,,
1,4658148,dsgdb9nsd_000004,2,1,1JHC,,,,,,
2,4658149,dsgdb9nsd_000004,2,3,3JHH,,,,,,
3,4658150,dsgdb9nsd_000004,3,0,1JHC,,,,,,
4,4658151,dsgdb9nsd_000004,3,1,2JHC,,,,,,
5,4658152,dsgdb9nsd_000015,3,0,1JHC,,,,,,
6,4658153,dsgdb9nsd_000015,3,2,3JHC,,,,,,
7,4658154,dsgdb9nsd_000015,3,4,2JHH,,,,,,
8,4658155,dsgdb9nsd_000015,3,5,2JHH,,,,,,
9,4658156,dsgdb9nsd_000015,4,0,1JHC,,,,,,


In [35]:
# Boolean mask over test_df: True for rows whose coupling type is `type_`.
index_ = test_df['type'].eq(type_)

In [33]:
# Existing submission that serves as the base; only rows of `type_` are
# overwritten later.
base_submission_csv = '../Data/test_model2_bigger_0823_-2.596.csv'
submit = pd.read_csv(base_submission_csv)

In [43]:
# Overwrite the base submission's predictions for rows of `type_` with the
# fold-averaged CNN predictions.
# The assignment is positional, so first verify that `submit` and `test_df`
# really are row-aligned and that no missing prediction would be written.
type_preds = test_df.loc[index_, 'yhat']
assert len(submit) == len(test_df), 'submit and test_df must have the same number of rows'
if 'id' in submit.columns:
    assert (submit['id'].values == test_df['id'].values).all(), \
        'submit and test_df rows are not in the same id order'
assert type_preds.notnull().all(), 'missing yhat for some rows of the selected type'
submit.loc[index_.values, 'scalar_coupling_constant'] = type_preds.values

In [45]:
# Write the patched submission; the file name follows `type_` so it stays
# correct when the notebook is run for another coupling type
# (for type_ == '2JHN' the path is the same as before).
submit.to_csv('../Submission/CNN_kfold_{}.csv'.format(type_), index=False)