In [2]:
import pandas as pd
import numpy as np
import gc

# Make sure automatic garbage collection is switched on for this kernel.
# NOTE(review): CPython enables the collector by default, so this call is
# presumably redundant — confirm before removing.
gc.enable()

# Base directory used by the cells below for every file read and for the
# pickle that is written at the end.
DATA = "~/Data/Molecular"

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    inclusive ``[min, max]`` range contains every value in the column. Float
    columns are cast to float32 when their values fit the float32 range and
    to float64 otherwise. Only columns whose dtype is one of int16, int32,
    int64, float16, float32 or float64 are considered; every other column
    (int8, unsigned, bool, category, object, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place: converted columns are
        reassigned on this very object.
    verbose : bool, default True
        When true, print the final memory footprint in Mb and the percentage
        saved relative to the footprint on entry.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The float check looks at the value range only, so float64 data loses
    precision beyond what float32 can represent. Columns whose min/max is NaN
    (for example an empty column) fail every range comparison: integer
    columns then keep their dtype and float columns become float64.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Walk from narrowest to widest and stop at the first type that fits.
            # Bounds are inclusive so that a value sitting exactly on a limit
            # (e.g. 127 or -128) still qualifies for that type.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out of float32 range (including +/-inf) or NaN min/max.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio: a frame that reports zero bytes has nothing to reduce.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [7]:
# Read the three CSV inputs with explicit narrow dtypes so the frames never
# materialise at the default int64/float64/object widths.
train = pd.read_csv(
    f"{DATA}/train.csv",
    dtype=dict(
        molecule_name='category',
        atom_index_0='int8',
        atom_index_1='int8',
        type='category',
        scalar_coupling_constant='float32',
    ),
)
# Same dtype map as for train.
test = pd.read_csv(
    f"{DATA}/test.csv",
    dtype=dict(
        molecule_name='category',
        atom_index_0='int8',
        atom_index_1='int8',
        type='category',
        scalar_coupling_constant='float32',
    ),
)
structure = pd.read_csv(
    f"{DATA}/structures.csv",
    dtype=dict(
        molecule_name='category',
        atom_index='int8',
        atom='category',
        x='float32',
        y='float32',
        z='float32',
    ),
)

# Stack train (without its scalar_coupling_constant column) on top of test
# under a fresh 0..n-1 index.
# NOTE(review): if train and test do not share identical category sets,
# pd.concat hands the categorical columns back as object dtype — confirm
# whether that matters for memory or for the merge further down.
data = pd.concat(
    [train.drop('scalar_coupling_constant', axis=1), test],
    ignore_index=True,
)

# NOTE(review): unpickling executes arbitrary code; only load this file if it
# comes from a trusted source.
qm9 = pd.read_pickle(f"{DATA}/data.covs.pickle")

In [10]:
# Show the first row of the freshly loaded pickle to inspect its columns.
qm9.head(n=1)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,rc_A,rc_B,rc_C,mu,...,Cv,freqs_min,freqs_max,freqs_mean,linear,mulliken_min,mulliken_max,mulliken_mean,mulliken_atom_0,mulliken_atom_1
0,5174511,dsgdb9nsd_033805,11,7,2JHC,,3.54257,1.50643,1.34544,4.4029,...,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,-0.088328


In [11]:
# Left-join the pickled features onto the stacked train+test rows, keyed on
# (id, molecule_name). The pair-descriptor columns and the target are removed
# from the right-hand side first, so they are not joined back in.
# NOTE(review): this rebinds `qm9`, so the cell cannot be re-run without
# reloading the pickle first. The row count matches `data` only if
# (id, molecule_name) is unique in the pickled frame — confirm.
qm9 = pd.merge(
    data,
    qm9.drop(
        ['atom_index_0', 'atom_index_1', 'type', 'scalar_coupling_constant'],
        axis=1,
    ),
    on=['id', 'molecule_name'],
    how='left',
)

In [14]:
# Show the first row of the merged frame to check the join result.
qm9.head(1)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,rc_A,rc_B,rc_C,mu,alpha,...,Cv,freqs_min,freqs_max,freqs_mean,linear,mulliken_min,mulliken_max,mulliken_mean,mulliken_atom_0,mulliken_atom_1
0,0,dsgdb9nsd_000001,1,0,1JHC,157.7118,157.70997,157.70699,0.0,13.21,...,6.469,1341.307,3151.7078,2182.525478,1.0,-0.535689,0.133923,0.0,0.133921,-0.535689


In [15]:
# Drop the join keys, the pair descriptors and a handful of feature columns,
# then downcast what is left.
# NOTE(review): with 'id' and 'molecule_name' gone, the resulting rows can only
# be matched back to `data` by position — confirm downstream code relies on that.
qm9 = reduce_mem_usage(
    qm9.drop(
        columns=[
            'id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
            'linear', 'U', 'G', 'H', 'mulliken_mean', 'r2', 'U0',
        ]
    )
)

Mem. usage decreased to 519.22 Mb (47.2% reduction)


In [16]:
# Persist the reduced feature frame as a pickle under DATA.
# NOTE(review): with the default compression='infer', the ".gz" suffix should
# make pandas gzip the file; the name gives no hint that it is a pickle — confirm
# readers load it with pd.read_pickle.
qm9.to_pickle(f"{DATA}/qm9.gz")

In [17]:
# Preview the first rows of the frame that was just saved.
qm9.head()

Unnamed: 0,rc_A,rc_B,rc_C,mu,alpha,homo,lumo,gap,zpve,Cv,freqs_min,freqs_max,freqs_mean,mulliken_min,mulliken_max,mulliken_atom_0,mulliken_atom_1
0,157.711807,157.709976,157.706985,0.0,13.21,-0.3877,0.1171,0.5048,0.044749,6.469,1341.307007,3151.707764,2182.525391,-0.535689,0.133923,0.133921,-0.535689
1,157.711807,157.709976,157.706985,0.0,13.21,-0.3877,0.1171,0.5048,0.044749,6.469,1341.307007,3151.707764,2182.525391,-0.535689,0.133923,0.133921,0.133922
2,157.711807,157.709976,157.706985,0.0,13.21,-0.3877,0.1171,0.5048,0.044749,6.469,1341.307007,3151.707764,2182.525391,-0.535689,0.133923,0.133921,0.133923
3,157.711807,157.709976,157.706985,0.0,13.21,-0.3877,0.1171,0.5048,0.044749,6.469,1341.307007,3151.707764,2182.525391,-0.535689,0.133923,0.133921,0.133923
4,157.711807,157.709976,157.706985,0.0,13.21,-0.3877,0.1171,0.5048,0.044749,6.469,1341.307007,3151.707764,2182.525391,-0.535689,0.133923,0.133922,-0.535689
