# Kernel: [Keras Neural Net for CHAMPS](https://www.kaggle.com/todnewman/keras-neural-net-for-champs?scriptVersionId=16600198)
## Competition: [Predicting Molecular Properties](https://www.kaggle.com/c/champs-scalar-coupling)

### Keras Multiple Output Solution

references <br/>
https://www.pyimagesearch.com/2018/06/04/keras-multiple-outputs-and-multiple-losses/ <br/>
https://www.kaggle.com/kmat2019/neural-network-modeling-with-multiple-outputs <br/>

### 특징
- 현재 이 컴피티션에서는 이 방식으로 문제를 푸는게 대세는 아님
- overfitting이 일어나지 않는 것으로 보아, full dataset(public + private)에서 좋은 성과를 거둘 것을 기대

### 향상시킬 방법
- 화학 분자 분야의 전문 지식
- 네트워크 아키텍쳐 수정
- overfitting이 일어나지 않는 선에서의 more epochs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import tensorflow as tf
from keras.layers import Dense, Input, Activation
from keras.layers import BatchNormalization,Add,Dropout
from keras.optimizers import Adam
from keras.models import Model, load_model
from keras import callbacks
from keras import backend as K
from keras.layers.advanced_activations import LeakyReLU
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore",category=DeprecationWarning)
warnings.filterwarnings(action="ignore",category=FutureWarning)
import os

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


## 먼저 데이터를 가져옵니다.

이 커널의 저자는 solution과 EDA work를 분리한 notebook을 선호한다고 합니다.

그래서 EDA를 통해 생성한 데이터들을 읽어옵니다. 

>EDA kernel을 공개한 흔적은 없음

In [2]:
df_train=pd.read_csv('datasets/train.csv')
df_test=pd.read_csv('datasets/test.csv')
df_struct=pd.read_csv('datasets/structures.csv')

#df_train_sub_potential=pd.read_csv('/content/champs/potential_energy.csv')
#df_train_sub_moment=pd.read_csv('../input/dipole_moments.csv')
df_train_sub_charge=pd.read_csv('datasets/mulliken_charges.csv')
df_train_sub_tensor=pd.read_csv('datasets/magnetic_shielding_tensors.csv')

## 메모리 사용 줄이기

작은 cloud instance에서는 실행이 안되기 때문에, 이러한 방법을 사용

> 난 안할래

In [4]:
# def reduce_mem_usage(df, verbose=True):
#     numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
#     start_mem = df.memory_usage().sum() / 1024**2    
#     for col in df.columns:
#         col_type = df[col].dtypes
#         if col_type in numerics:
#             c_min = df[col].min()
#             c_max = df[col].max()
#             if str(col_type)[:3] == 'int':
#                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
#                     df[col] = df[col].astype(np.int8)
#                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
#                     df[col] = df[col].astype(np.int32)
#                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
#                     df[col] = df[col].astype(np.int64)  
#             else:
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
#                     df[col] = df[col].astype(np.float16)
#                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
#                     df[col] = df[col].astype(np.float32)
#                 else:
#                     df[col] = df[col].astype(np.float64)    
#     end_mem = df.memory_usage().sum() / 1024**2
#     if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
#     return df
# print(df_train.shape, df_test.shape, df_struct.shape, df_train_sub_charge.shape, df_train_sub_tensor.shape)
# df_train = reduce_mem_usage(df_train)
# df_test = reduce_mem_usage(df_test)
# df_struct = reduce_mem_usage(df_struct)
# df_train_sub_charge = reduce_mem_usage(df_train_sub_charge)
# df_train_sub_tensor = reduce_mem_usage(df_train_sub_tensor)
# print(df_train.shape, df_test.shape, df_struct.shape, df_train_sub_charge.shape, df_train_sub_tensor.shape)

## master dataframe에 data를 맵핑하기

- csv 파일로 data들이 나눠져있다. 요놈들을 mapping해서 합친다.
- drop duplicates를 수행한다. test dataset이 뱉는 predictions가 정확하지 않게 되기 때문.

> - 생각보다 시간이 좀 걸리는 과정 (분명히 EDA 따로 한댔던거 같은데..)
> - 26분 소요;

> map_atom_info()
> - structures.csv 파일의 정보를 train/test 데이터에 mapping

> show_ram_usage()
> - ram 사용량을 보여준다. 유용한듯!

> atom_idx = [0, 1]을 이용해 for loop
> - x,y,z와 xx~zz를 각 atom에 대한 값으로 label을 rename해준다. (구분이 안될까봐 하는 듯)
> - 그 이후에 map_atom_info() 수행

In [6]:
%%time
''' 
Map atom info from the structures.csv into the train/test files
'''
import psutil
import os

def map_atom_info(df_1,df_2, atom_idx):
    print('Mapping...', df_1.shape, df_2.shape, atom_idx)
    
    df = pd.merge(df_1, df_2.drop_duplicates(subset=['molecule_name', 'atom_index']), how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)

    return df

def show_ram_usage():
    py = psutil.Process(os.getpid())
    print('RAM usage: {} GB'.format(py.memory_info()[0]/2. ** 30))

show_ram_usage()

for atom_idx in [0,1]:
    df_train = map_atom_info(df_train,df_struct, atom_idx)
    df_train = map_atom_info(df_train,df_train_sub_charge, atom_idx)
    df_train = map_atom_info(df_train,df_train_sub_tensor, atom_idx)
    df_train = df_train.rename(columns={'atom': f'atom_{atom_idx}',
                                        'x': f'x_{atom_idx}',
                                        'y': f'y_{atom_idx}',
                                        'z': f'z_{atom_idx}',
                                        'mulliken_charge': f'charge_{atom_idx}',
                                        'XX': f'XX_{atom_idx}',
                                        'YX': f'YX_{atom_idx}',
                                        'ZX': f'ZX_{atom_idx}',
                                        'XY': f'XY_{atom_idx}',
                                        'YY': f'YY_{atom_idx}',
                                        'ZY': f'ZY_{atom_idx}',
                                        'XZ': f'XZ_{atom_idx}',
                                        'YZ': f'YZ_{atom_idx}',
                                        'ZZ': f'ZZ_{atom_idx}',})
    df_test = map_atom_info(df_test,df_struct, atom_idx)
    df_test = df_test.rename(columns={'atom': f'atom_{atom_idx}',
                                'x': f'x_{atom_idx}',
                                'y': f'y_{atom_idx}',
                                'z': f'z_{atom_idx}'})
    #add some features
    
    df_struct['c_x']=df_struct.groupby('molecule_name')['x'].transform('mean')
    df_struct['c_y']=df_struct.groupby('molecule_name')['y'].transform('mean')
    df_struct['c_z']=df_struct.groupby('molecule_name')['z'].transform('mean')
    df_struct['atom_n']=df_struct.groupby('molecule_name')['atom_index'].transform('max')
    
    show_ram_usage()
    print(df_train.shape, df_test.shape)

RAM usage: 1.3939285278320312 GB
Mapping... (4658147, 10) (2358657, 6) 0
Mapping... (4658147, 14) (1533537, 3) 0
Mapping... (4658147, 15) (1533537, 11) 0
Mapping... (2505542, 5) (2358657, 6) 0
RAM usage: 2.7081680297851562 GB
(4658147, 24) (2505542, 9)
Mapping... (4658147, 24) (2358657, 10) 1
Mapping... (4658147, 32) (1533537, 3) 1
Mapping... (4658147, 33) (1533537, 11) 1
Mapping... (2505542, 9) (2358657, 10) 1
RAM usage: 3.6630401611328125 GB
(4658147, 42) (2505542, 17)
CPU times: user 21.7 s, sys: 26min 17s, total: 26min 38s
Wall time: 26min 34s


## 더 복잡한 features 개발 시작하기

> 설명이 없어서 열심히 분석해야한다아...
> 시간도 엄청 소요됨;

> make_features()
> - 원자_1과 원자_0의 x, y, z축의 위치 차이 = 거리 값을 dx, dy, dz로 계산하여 df에 추가
> - distance값은 (dx^2 + dy^2 + dz^2)^(1/2)로 계산하여 df에 추가

> get_dist()
> - 인덱스를 바꾸고, concat을 하는데 이유를 모르겠음;
> - min_distance와 max_istance를 계산

> transform vs apply
  - transform은 각각 구별된/독립적인 Series(columns)에 쓴다.
  ```python
  df.groupby('A').transform(lambda x: (x['C'] - x['D']))
  df.groupby('A').transform(lambda x: (x['C'] - x['D']).mean())
  ```

In [12]:
df_train.keys()

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'atom_x', 'x_x', 'y_x', 'z_x', 'atom_y',
       'x_y', 'y_y', 'z_y', 'charge_0', 'XX_0', 'YX_0', 'ZX_0', 'XY_0', 'YY_0',
       'ZY_0', 'XZ_0', 'YZ_0', 'ZZ_0', 'atom_1', 'x_1', 'y_1', 'z_1', 'c_x',
       'c_y', 'c_z', 'atom_n', 'charge_1', 'XX_1', 'YX_1', 'ZX_1', 'XY_1',
       'YY_1', 'ZY_1', 'XZ_1', 'YZ_1', 'ZZ_1'],
      dtype='object')

In [11]:
%%time
def make_features(df):
    df['dx']=df['x_1']-df['x_0']
    df['dy']=df['y_1']-df['y_0']
    df['dz']=df['z_1']-df['z_0']
    df['distance']=(df['dx']**2+df['dy']**2+df['dz']**2)**(1/2)
    return df

df_train=make_features(df_train)
df_test=make_features(df_test) 
#df_train = reduce_mem_usage(df_train)
#df_test = reduce_mem_usage(df_test)
test_prediction=np.zeros(len(df_test))
show_ram_usage()
print(df_train.shape, df_test.shape)

KeyError: 'x_0'

In [9]:
def get_dist(df):
    df_temp=df.loc[:,["molecule_name","atom_index_0","atom_index_1","distance","x_0","y_0","z_0","x_1","y_1","z_1"]].copy()
    df_temp_=df_temp.copy()
    df_temp_= df_temp_.rename(columns={'atom_index_0': 'atom_index_1',
                                       'atom_index_1': 'atom_index_0',
                                       'x_0': 'x_1',
                                       'y_0': 'y_1',
                                       'z_0': 'z_1',
                                       'x_1': 'x_0',
                                       'y_1': 'y_0',
                                       'z_1': 'z_0'})
    df_temp_all=pd.concat((df_temp,df_temp_),axis=0)

    df_temp_all["min_distance"]=df_temp_all.groupby(['molecule_name', 'atom_index_0'])['distance'].transform('min')
    df_temp_all["max_distance"]=df_temp_all.groupby(['molecule_name', 'atom_index_0'])['distance'].transform('max')
    
    df_temp= df_temp_all[df_temp_all["min_distance"]==df_temp_all["distance"]].copy()
    df_temp=df_temp.drop(['x_0','y_0','z_0','min_distance'], axis=1)
    df_temp= df_temp.rename(columns={'atom_index_0': 'atom_index',
                                         'atom_index_1': 'atom_index_closest',
                                         'distance': 'distance_closest',
                                         'x_1': 'x_closest',
                                         'y_1': 'y_closest',
                                         'z_1': 'z_closest'})
    
    for atom_idx in [0,1]:
        df = map_atom_info(df,df_temp, atom_idx)
        df = df.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                        'distance_closest': f'distance_closest_{atom_idx}',
                                        'x_closest': f'x_closest_{atom_idx}',
                                        'y_closest': f'y_closest_{atom_idx}',
                                        'z_closest': f'z_closest_{atom_idx}'})
        
    df_temp= df_temp_all[df_temp_all["max_distance"]==df_temp_all["distance"]].copy()
    df_temp=df_temp.drop(['x_0','y_0','z_0','max_distance'], axis=1)
    df_temp= df_temp.rename(columns={'atom_index_0': 'atom_index',
                                         'atom_index_1': 'atom_index_farthest',
                                         'distance': 'distance_farthest',
                                         'x_1': 'x_farthest',
                                         'y_1': 'y_farthest',
                                         'z_1': 'z_farthest'})
        
    for atom_idx in [0,1]:
        df = map_atom_info(df,df_temp, atom_idx)
        df = df.rename(columns={'atom_index_farthest': f'atom_index_farthest_{atom_idx}',
                                        'distance_farthest': f'distance_farthest_{atom_idx}',
                                        'x_farthest': f'x_farthest_{atom_idx}',
                                        'y_farthest': f'y_farthest_{atom_idx}',
                                        'z_farthest': f'z_farthest_{atom_idx}'})
    return df
df_test=(get_dist(df_test))    
df_train=(get_dist(df_train)) 

print(df_train.shape, df_test.shape)
show_ram_usage()

KeyError: 'x_0'