In [1]:
import numpy as np
import polars as pl
import time

In [9]:
# Load the building-block table, cast `index` to Int32 and keep only the two
# columns used in this notebook.
blocks = (
    pl.read_parquet('../out/train/building_blocks.parquet')
    .with_columns(pl.col('index').cast(pl.Int32))
    .select(['index', 'ecfp_pca'])
)
blocks.head()

index,ecfp_pca
i32,list[f64]
0,"[-0.623631, 1.179455, … 0.042602]"
1,"[-1.059422, 0.214484, … -0.026173]"
2,"[-0.7829, 0.581478, … 0.137127]"
3,"[-0.613241, 0.365505, … 0.10522]"
4,"[3.472075, 0.095289, … -0.086727]"


In [3]:
# Read only the first 100 rows of the molecule table and keep the molecule id
# plus the three building-block index columns.
train = pl.read_parquet('../out/train/mols.parquet', n_rows = 100).select(
    ['molecule_id', 'buildingblock1_index', 'buildingblock2_index', 'buildingblock3_index']
)
train.head()

molecule_id,buildingblock1_index,buildingblock2_index,buildingblock3_index
u32,i32,i32,i32
0,4,2,0
1,4,2,1
2,4,2,2
3,4,2,11
4,4,2,18


In [4]:
def features(dt, blocks):
    """Stack the `ecfp_pca` vectors of the three building blocks side by side.

    For each of the three building-block index columns of `dt`, the matching
    entries of ``blocks['ecfp_pca']`` are selected and stacked row-wise; the
    three resulting matrices are then joined along axis 1.

    NOTE(review): entries are selected by indexing ``blocks['ecfp_pca']`` with
    the index values directly, not by matching the ``index`` column -- this
    presumably relies on ``index`` equalling the row position; confirm.

    Args:
        dt: table holding the three ``buildingblock{1,2,3}_index`` columns.
        blocks: table holding an ``ecfp_pca`` column.

    Returns:
        The array produced by concatenating the three stacked matrices.
    """
    pca_column = blocks['ecfp_pca']
    per_slot = []
    for key in ('buildingblock1_index', 'buildingblock2_index', 'buildingblock3_index'):
        # One matrix per building-block slot: one row per row of `dt`.
        per_slot.append(np.vstack(pca_column[dt[key]]))
    return np.concatenate(per_slot, axis = 1)

# Benchmark: call `features` 10000 times on the sample and report total time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
for _rep in range(10000):
    x = features(train, blocks)
print(f"Elapsed time: {(time.perf_counter() - start_time)/60:.2f} minutes")
print(x[0:5])

Elapsed time: 1.72 minutes
[[ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.23624085e-01
   2.06185508e-02  4.26017057e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -2.33263095e-02
   9.50242620e-02 -2.61726410e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  6.76700498e-04
  -2.34305350e-02  1.37127162e-01]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  1.47651512e-03
  -5.06382091e-04 -1.01064856e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.86572024e-02
   4.98181144e-02 -6.63089832e-02]]


In [5]:
# def features(dt, blocks):
#     iblocks_ecfp_pca = [[list(ix) for ix in blocks['ecfp_pca'][dt[x]]] for x in ['buildingblock1_index', 'buildingblock2_index', 'buildingblock3_index']]
#     iblocks_ecfp_pca = np.concatenate(iblocks_ecfp_pca, axis = 1)
#     return(iblocks_ecfp_pca)

# start_time = time.time()
# x = features(train, blocks)
# print(f"Elapsed time: {(time.time() - start_time)/60:.2f} minutes")
# print(x[0:5])

# Left commented out: this variant took too long to run.

In [6]:
def features(dt, blocks):
    """Build the feature matrix by converting each selection to NumPy first.

    For each building-block index column of `dt`, the selected ``ecfp_pca``
    entries are converted with ``.to_numpy()``, rebuilt as a list of lists,
    turned into a float array, and finally all three arrays are concatenated
    along axis 1.

    Args:
        dt: table holding the three ``buildingblock{1,2,3}_index`` columns.
        blocks: table holding an ``ecfp_pca`` column.

    Returns:
        Float array with the three per-slot matrices joined column-wise.
    """
    index_columns = ('buildingblock1_index', 'buildingblock2_index', 'buildingblock3_index')

    def _slot_matrix(column):
        # Each element of the selection is itself a sequence; list() makes it
        # a plain Python list so np.array can build a regular 2-D array.
        selected = blocks['ecfp_pca'][dt[column]].to_numpy()
        return np.array([list(row) for row in selected]).astype('float')

    matrices = [_slot_matrix(column) for column in index_columns]
    return np.concatenate(matrices, axis = 1)

# Benchmark: call `features` 10000 times on the sample and report total time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
for _rep in range(10000):
    x = features(train, blocks)
print(f"Elapsed time: {(time.perf_counter() - start_time)/60:.2f} minutes")
print(x[0:5])

Elapsed time: 2.93 minutes
[[ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.23624085e-01
   2.06185508e-02  4.26017057e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -2.33263095e-02
   9.50242620e-02 -2.61726410e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  6.76700498e-04
  -2.34305350e-02  1.37127162e-01]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  1.47651512e-03
  -5.06382091e-04 -1.01064856e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.86572024e-02
   4.98181144e-02 -6.63089832e-02]]


In [7]:
def features(dt, blocks):
    """Build the feature matrix by joining `dt` to `blocks` once per slot.

    For each ``buildingblock{1,2,3}_index`` column, `dt` is inner-joined to
    `blocks` on ``index``; the key column is dropped and the joined
    ``ecfp_pca`` column is renamed to ``ecfp_pca{1,2,3}``. The three list
    columns are then concatenated per row and stacked into a NumPy array.

    NOTE(review): the joins are inner joins and request no explicit row
    ordering -- confirm that output rows stay aligned with the rows of `dt`
    and that every index has a match in `blocks`.

    Args:
        dt: frame holding the three building-block index columns.
        blocks: frame holding ``index`` and ``ecfp_pca`` columns.

    Returns:
        Array produced by ``np.vstack`` over the per-row concatenated lists.
    """
    dt = dt.join(blocks, left_on = 'buildingblock1_index', right_on = 'index', how = 'inner').drop('buildingblock1_index').rename({'ecfp_pca': 'ecfp_pca1'})
    dt = dt.join(blocks, left_on = 'buildingblock2_index', right_on = 'index', how = 'inner').drop('buildingblock2_index').rename({'ecfp_pca': 'ecfp_pca2'})
    dt = dt.join(blocks, left_on = 'buildingblock3_index', right_on = 'index', how = 'inner').drop('buildingblock3_index').rename({'ecfp_pca': 'ecfp_pca3'})

    # The key columns were already dropped after each join above. A second
    # drop of the same names is not needed, and raises on polars versions
    # where `drop` is strict about missing columns, so it is omitted here.

    combined = dt['ecfp_pca1'].list.concat(
        dt['ecfp_pca2'].list.concat(
            dt['ecfp_pca3']
    ))

    return np.vstack(combined)

# Benchmark: call `features` 10000 times on the sample and report total time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
for _rep in range(10000):
    x = features(train, blocks)
print(f"Elapsed time: {(time.perf_counter() - start_time)/60:.2f} minutes")
print(x[0:5])

Elapsed time: 1.24 minutes
[[ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.23624085e-01
   2.06185508e-02  4.26017057e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -2.33263095e-02
   9.50242620e-02 -2.61726410e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  6.76700498e-04
  -2.34305350e-02  1.37127162e-01]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  1.47651512e-03
  -5.06382091e-04 -1.01064856e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.86572024e-02
   4.98181144e-02 -6.63089832e-02]]


In [10]:
def features(dt, blocks):
    """Join-based feature builder that drops each key column after its join.

    Each ``buildingblock{1,2,3}_index`` column is inner-joined to `blocks` on
    ``index``, the key column is dropped, and the joined ``ecfp_pca`` column
    is renamed to ``ecfp_pca{1,2,3}``. The three list columns are concatenated
    per row and stacked with ``np.vstack``.

    Args:
        dt: frame holding the three building-block index columns.
        blocks: frame holding ``index`` and ``ecfp_pca`` columns.

    Returns:
        Array produced by ``np.vstack`` over the per-row concatenated lists.
    """
    def _attach(frame, slot):
        # Join one slot, remove its key column and give the PCA column a
        # slot-specific name so the next join does not collide with it.
        key = f'buildingblock{slot}_index'
        joined = frame.join(blocks, left_on = key, right_on = 'index', how = 'inner')
        return joined.drop(key).rename({'ecfp_pca': f'ecfp_pca{slot}'})

    wide = _attach(_attach(_attach(dt, 1), 2), 3)

    tail = wide['ecfp_pca2'].list.concat(wide['ecfp_pca3'])
    combined = wide['ecfp_pca1'].list.concat(tail)
    return np.vstack(combined)

# Benchmark: call `features` 10000 times on the sample and report total time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
for _rep in range(10000):
    x = features(train, blocks)
print(f"Elapsed time: {(time.perf_counter() - start_time)/60:.2f} minutes")
print(x[0:5])

Elapsed time: 1.19 minutes
[[ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.23624085e-01
   2.06185508e-02  4.26017057e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -2.33263095e-02
   9.50242620e-02 -2.61726410e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  6.76700498e-04
  -2.34305350e-02  1.37127162e-01]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  1.47651512e-03
  -5.06382091e-04 -1.01064856e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.86572024e-02
   4.98181144e-02 -6.63089832e-02]]


In [12]:
def features(dt, blocks):
    """Join-based feature builder that keeps the key columns in place.

    `dt` is inner-joined to `blocks` on ``index`` once per building-block
    index column; after each join the new ``ecfp_pca`` column is renamed to
    ``ecfp_pca{1,2,3}``. The three list columns are concatenated per row and
    stacked with ``np.vstack``.

    Args:
        dt: frame holding the three building-block index columns.
        blocks: frame holding ``index`` and ``ecfp_pca`` columns.

    Returns:
        Array produced by ``np.vstack`` over the per-row concatenated lists.
    """
    wide = dt
    for slot in ('1', '2', '3'):
        # Rename right after each join so the next join's `ecfp_pca` column
        # does not clash with the one just added.
        wide = wide.join(
            blocks, left_on = 'buildingblock' + slot + '_index', right_on = 'index', how = 'inner'
        ).rename({'ecfp_pca': 'ecfp_pca' + slot})

    second_and_third = wide['ecfp_pca2'].list.concat(wide['ecfp_pca3'])
    combined = wide['ecfp_pca1'].list.concat(second_and_third)
    return np.vstack(combined)

# Benchmark: call `features` 10000 times on the sample and report total time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
for _rep in range(10000):
    x = features(train, blocks)
print(f"Elapsed time: {(time.perf_counter() - start_time)/60:.2f} minutes")
print(x[0:5])

Elapsed time: 1.14 minutes
[[ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.23624085e-01
   2.06185508e-02  4.26017057e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -2.33263095e-02
   9.50242620e-02 -2.61726410e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  6.76700498e-04
  -2.34305350e-02  1.37127162e-01]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  1.47651512e-03
  -5.06382091e-04 -1.01064856e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.86572024e-02
   4.98181144e-02 -6.63089832e-02]]


In [17]:
def features(dt, blocks):
    """Join-based feature builder that uses join suffixes instead of renames.

    `dt` is inner-joined to `blocks` on ``index`` three times, passing the
    suffixes ``'1'``, ``'2'`` and ``'3'``. The PCA columns are then read back
    as ``ecfp_pca`` (first join), ``ecfp_pca2`` and ``ecfp_pca3``, concatenated
    per row and stacked with ``np.vstack``.

    Args:
        dt: frame holding the three building-block index columns.
        blocks: frame holding ``index`` and ``ecfp_pca`` columns.

    Returns:
        Array produced by ``np.vstack`` over the per-row concatenated lists.
    """
    wide = dt
    for slot in ('1', '2', '3'):
        wide = wide.join(
            blocks,
            left_on = f'buildingblock{slot}_index',
            right_on = 'index',
            how = 'inner',
            suffix = slot,
        )

    # The first join's column is read under its unsuffixed name `ecfp_pca`;
    # only the second and third are read under suffixed names.
    second_and_third = wide['ecfp_pca2'].list.concat(wide['ecfp_pca3'])
    combined = wide['ecfp_pca'].list.concat(second_and_third)
    return np.vstack(combined)

# Benchmark: call `features` 10000 times on the sample and report total time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
for _rep in range(10000):
    x = features(train, blocks)
print(f"Elapsed time: {(time.perf_counter() - start_time)/60:.2f} minutes")
print(x[0:5])

Elapsed time: 1.19 minutes
[[ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.23624085e-01
   2.06185508e-02  4.26017057e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -2.33263095e-02
   9.50242620e-02 -2.61726410e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  6.76700498e-04
  -2.34305350e-02  1.37127162e-01]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  1.47651512e-03
  -5.06382091e-04 -1.01064856e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.86572024e-02
   4.98181144e-02 -6.63089832e-02]]


In [18]:
def features(dt, blocks):
    """Suffix-join feature builder that stacks with ``casting='no'``.

    `dt` is inner-joined to `blocks` on ``index`` three times with suffixes
    ``'1'``, ``'2'`` and ``'3'``. The PCA columns are read back as
    ``ecfp_pca``, ``ecfp_pca2`` and ``ecfp_pca3``, concatenated per row, and
    stacked via ``np.vstack(..., casting='no')``.

    Args:
        dt: frame holding the three building-block index columns.
        blocks: frame holding ``index`` and ``ecfp_pca`` columns.

    Returns:
        Array produced by ``np.vstack`` over the per-row concatenated lists.
    """
    with_first = dt.join(blocks, left_on = 'buildingblock1_index', right_on = 'index', how = 'inner', suffix = '1')
    with_second = with_first.join(blocks, left_on = 'buildingblock2_index', right_on = 'index', how = 'inner', suffix = '2')
    wide = with_second.join(blocks, left_on = 'buildingblock3_index', right_on = 'index', how = 'inner', suffix = '3')

    # The first join's column is read under its unsuffixed name `ecfp_pca`.
    second_and_third = wide['ecfp_pca2'].list.concat(wide['ecfp_pca3'])
    combined = wide['ecfp_pca'].list.concat(second_and_third)

    return np.vstack(combined, casting = 'no')

# Benchmark: call `features` 10000 times on the sample and report total time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
for _rep in range(10000):
    x = features(train, blocks)
print(f"Elapsed time: {(time.perf_counter() - start_time)/60:.2f} minutes")
print(x[0:5])

Elapsed time: 1.17 minutes
[[ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.23624085e-01
   2.06185508e-02  4.26017057e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -2.33263095e-02
   9.50242620e-02 -2.61726410e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  6.76700498e-04
  -2.34305350e-02  1.37127162e-01]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ...  1.47651512e-03
  -5.06382091e-04 -1.01064856e-02]
 [ 3.47207511e+00  9.52888548e-02  1.36100695e+00 ... -1.86572024e-02
   4.98181144e-02 -6.63089832e-02]]
