In [32]:
import pandas as pd
from src.prepare_data import prepare_data
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset

### Analyse the data

Load the raw training data.

In [None]:
# Read the raw FE_atom training CSV and wrap it as a Hugging Face Dataset;
# the bare name on the last line displays the dataset summary.
data = Dataset.from_pandas(pd.read_csv("data/fe_atom/raw/train.csv"))
data

Dataset({
    features: ['mof_name', 'Topology', 'mofkey', 'mofid_v1', 'FE_atom'],
    num_rows: 43554
})

In [52]:
# Show the three fields of interest for the first record.
# Index the row first (data[0][column]) rather than the column first
# (data[column][0]): row-first access fetches a single example instead of
# reading the whole column just to take element 0.
print('mof_name:', data[0]['mof_name'])
print('mofid:', data[0]['mofid_v1'])
print('FE_atom:', data[0]['FE_atom'])

mof_name: SR_zyg_v1-3c_triazine_Ch_v2-3c_pyrrole_Ch_v3-4c_Cu_1_Ch_1TrU_Ch_1DoU_Ch
mofid: [Cu][Cu].[O-]C(=O)[CH][CH]c1ccc(n1C#CC1=NC(=[N]=C([N]1)C#Cn1c([CH][CH]C(=O)[O-])ccc1C=CC(=O)[O-])C#Cn1c(C=CC(=O)[O-])ccc1C=CC(=O)[O-])C=CC(=O)[O-].[O-]C(=O)[CH][CH]c1ccc(n1C#CC1=NC(=[N]=C([N]1)C#Cn1c([CH][CH]C(=O)[O-])ccc1C=CC(=O)[O-])C#Cn1c([CH][CH]C(=O)[O-])ccc1C=CC(=O)[O-])C=CC(=O)[O-].[O-]C(=O)[CH][CH]c1ccc(n1C#CC1=NC(=[N]=C([N]1)C#Cn1c([CH][CH]C(=O)[O-])ccc1C=CC(=O)[O-])C#Cn1c([CH][CH]C(=O)[O-])ccc1C=CC(=O)[O-])[CH][CH]C(=O)[O-].[O-]C(=O)[CH][CH]c1ccc(n1C#CC1=[N]=C(N=C([N]1)C#Cn1c(C=CC(=O)[O-])ccc1C=CC(=O)[O-])C#Cn1c([CH][CH]C(=O)[O-])ccc1C=CC(=O)[O-])C=CC(=O)[O-] MOFid-v1.stp,zyg.cat0;SR_zyg_v1-3c_triazine_Ch_v2-3c_pyrrole_Ch_v3-4c_Cu_1_Ch_1TrU_Ch_1DoU_Ch
FE_atom: 20.5758597175909


Get the token sequence length statistics for **mof_name** and **mofid**.

In [38]:
def get_tokens(sample, tokenizer):
    """Count the tokens produced for a sample's MOF name and MOFid string.

    Args:
        sample: Mapping that provides the 'mof_name' and 'mofid_v1' entries.
        tokenizer: Callable that, given a text, returns a mapping with an
            'input_ids' sequence.

    Returns:
        dict with 'mof_name_len' and 'mofid_len', the number of input ids
        returned for each of the two fields.
    """
    # source column -> name of the length entry derived from it
    length_keys = (
        ('mof_name', 'mof_name_len'),
        ('mofid_v1', 'mofid_len'),
    )
    lengths = {}
    for column, length_key in length_keys:
        lengths[length_key] = len(tokenizer(sample[column])['input_ids'])
    return lengths

In [53]:
# Add the 'mof_name_len' / 'mofid_len' columns to every row using the
# t5-small tokenizer. The "sequence length is longer than the specified
# maximum" warnings are harmless here: get_tokens only counts the ids.
tokenizer = AutoTokenizer.from_pretrained('t5-small')
data_with_lengths = data.map(
    get_tokens,
    fn_kwargs={'tokenizer': tokenizer},
    num_proc=8,
)
data_with_lengths

#0:   0%|          | 0/5445 [00:00<?, ?ex/s]Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors
#0:  15%|█▍        | 811/5445 [00:00<00:04, 1147.65ex/s]Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
#0:  30%|███       | 1648/5445 [00:01<00:03, 1156.48ex/s]
#0:  32%|███▏      | 1764/5445 [00:01<00:03, 1130.49ex/s]Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors
#0:  35%|███▍      | 1890/5445 [00:01<00:03, 1166.50ex/s]
[A
#0:  37%|███▋      | 2007/5445 [00:01<00:03, 1099.70ex/s]
#0:  39%|███▉      | 2122/5445 [00:01<00:02, 1111.44ex/s]
#0:  41%|████      | 2234/5445 [00:01<00:02, 1113.80ex/s]
#0:

Dataset({
    features: ['mof_name', 'Topology', 'mofkey', 'mofid_v1', 'FE_atom', 'mof_name_len', 'mofid_len'],
    num_rows: 43554
})

In [None]:
# Training split: summary statistics (incl. upper percentiles) of the
# token counts for the MOF name and the MOFid string.
data_with_lengths_df = data_with_lengths.to_pandas()
mof_name_len_stats, mofid_len_stats = (
    data_with_lengths_df[column].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
    for column in ('mof_name_len', 'mofid_len')
)
mof_name_len_stats, mofid_len_stats

(count    43554.000000
 mean        48.907517
 std         13.780475
 min         22.000000
 25%         41.000000
 50%         48.000000
 75%         54.000000
 90%         63.000000
 95%         70.000000
 99%        109.000000
 max        154.000000
 Name: mof_name_len, dtype: float64,
 count    43554.000000
 mean       838.998278
 std       1055.160883
 min         57.000000
 25%        251.000000
 50%        432.000000
 75%       1015.000000
 90%       2036.000000
 95%       2818.350000
 99%       5151.470000
 max      16518.000000
 Name: mofid_len, dtype: float64)

In [None]:
# Validation split: same token-length statistics as for the training split.
# NOTE: `data`, `data_with_lengths` and the *_stats names are rebound here,
# so after this cell they no longer refer to the training split.
data = Dataset.from_pandas(pd.read_csv("data/fe_atom/raw/validation.csv"))

data_with_lengths = data.map(
    get_tokens,
    fn_kwargs={'tokenizer': tokenizer},
    num_proc=8,
)
data_with_lengths_df = data_with_lengths.to_pandas()
mof_name_len_stats, mofid_len_stats = (
    data_with_lengths_df[column].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
    for column in ('mof_name_len', 'mofid_len')
)
mof_name_len_stats, mofid_len_stats

#0:   0%|          | 0/681 [00:00<?, ?ex/s]Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors
#0:  10%|▉         | 65/681 [00:00<00:00, 648.91ex/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2577 > 512). Running this sequence through the model will result in indexing errors
#0:  24%|██▍       | 164/681 [00:00<00:00, 841.84ex/s]
[AToken indices sequence length is longer than the specified maximum sequence length for this model (1466 > 512). Running this sequence through the model will result in indexing errors
#0:  39%|███▉      | 265/681 [00:00<00:00, 917.64ex/s]
#0:  55%|█████▍    | 373/681 [00:00<00:00, 980.69ex/s]

[A[AToken indices sequence length is longer than the specified maximum sequence length for this model (1973 > 512). Running this sequence through the model will result in indexing error

(count    5444.000000
 mean       49.044453
 std        14.291391
 min        22.000000
 25%        41.000000
 50%        48.000000
 75%        55.000000
 90%        63.000000
 95%        72.000000
 99%       111.000000
 max       154.000000
 Name: mof_name_len, dtype: float64,
 count     5444.000000
 mean       834.158707
 std       1113.463498
 min         62.000000
 25%        246.000000
 50%        430.000000
 75%        964.250000
 90%       1969.700000
 95%       2731.850000
 99%       5366.090000
 max      16508.000000
 Name: mofid_len, dtype: float64)

In [None]:
# Test split: same token-length statistics as for the other splits.
# NOTE: `data`, `data_with_lengths` and the *_stats names are rebound here,
# so after this cell they refer to the test split.
data = Dataset.from_pandas(pd.read_csv("data/fe_atom/raw/test.csv"))

data_with_lengths = data.map(
    get_tokens,
    fn_kwargs={'tokenizer': tokenizer},
    num_proc=8,
)
data_with_lengths_df = data_with_lengths.to_pandas()
mof_name_len_stats, mofid_len_stats = (
    data_with_lengths_df[column].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
    for column in ('mof_name_len', 'mofid_len')
)
mof_name_len_stats, mofid_len_stats

#0:   0%|          | 0/681 [00:00<?, ?ex/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1803 > 512). Running this sequence through the model will result in indexing errors
#0:  12%|█▏        | 81/681 [00:00<00:00, 807.37ex/s]Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors
#0:  27%|██▋       | 185/681 [00:00<00:00, 939.90ex/s]
[AToken indices sequence length is longer than the specified maximum sequence length for this model (1230 > 512). Running this sequence through the model will result in indexing errors
#0:  41%|████      | 279/681 [00:00<00:00, 919.52ex/s]
[A

#0:  57%|█████▋    | 391/681 [00:00<00:00, 991.58ex/s]Token indices sequence length is longer than the specified maximum sequence length for this model (3351 > 512). Running this sequence through the model will result in indexing errors



(count    5445.000000
 mean       48.963269
 std        13.354673
 min        22.000000
 25%        42.000000
 50%        48.000000
 75%        54.000000
 90%        63.000000
 95%        70.000000
 99%       100.560000
 max       150.000000
 Name: mof_name_len, dtype: float64,
 count     5445.00000
 mean       853.32764
 std       1083.68251
 min         59.00000
 25%        261.00000
 50%        437.00000
 75%       1012.00000
 90%       2046.60000
 95%       2844.00000
 99%       5254.60000
 max      14481.00000
 Name: mofid_len, dtype: float64)

Free Energy data

In [None]:
# fe_atom / test split: run prepare_data from the raw CSV to the mofseq CSV.
# prepare_data lives in src.prepare_data; its processing is not shown here.
max_len = 2000  # forwarded as max_length; adjust this value as needed
input_path, output_path = (
    "data/fe_atom/raw/test.csv",
    "data/fe_atom/mofseq/test.csv",
)
tokenizer = AutoTokenizer.from_pretrained("t5-small")

prepare_data(
    input_path,
    output_path,
    tokenizer,
    max_length=max_len,
)

In [None]:
# fe_atom / validation split: run prepare_data from the raw CSV to the mofseq CSV.
# prepare_data lives in src.prepare_data; its processing is not shown here.
max_len = 2000  # forwarded as max_length; adjust this value as needed
input_path, output_path = (
    "data/fe_atom/raw/validation.csv",
    "data/fe_atom/mofseq/validation.csv",
)
tokenizer = AutoTokenizer.from_pretrained("t5-small")

prepare_data(
    input_path,
    output_path,
    tokenizer,
    max_length=max_len,
)

In [None]:
# fe_atom / train split: run prepare_data from the raw CSV to the mofseq CSV.
# prepare_data lives in src.prepare_data; its processing is not shown here.
max_len = 2000  # forwarded as max_length; adjust this value as needed
input_path, output_path = (
    "data/fe_atom/raw/train.csv",
    "data/fe_atom/mofseq/train.csv",
)
tokenizer = AutoTokenizer.from_pretrained("t5-small")

prepare_data(
    input_path,
    output_path,
    tokenizer,
    max_length=max_len,
)

Strain Energy data

In [None]:
# se_atom / test split: run prepare_data from the raw CSV to the mofseq CSV.
# prepare_data lives in src.prepare_data; its processing is not shown here.
max_len = 2000  # forwarded as max_length; adjust this value as needed
input_path, output_path = (
    "data/se_atom/raw/test.csv",
    "data/se_atom/mofseq/test.csv",
)
tokenizer = AutoTokenizer.from_pretrained("t5-small")

prepare_data(
    input_path,
    output_path,
    tokenizer,
    max_length=max_len,
)


In [None]:
# se_atom / validation split: run prepare_data from the raw CSV to the mofseq CSV.
# prepare_data lives in src.prepare_data; its processing is not shown here.
max_len = 2000  # forwarded as max_length; adjust this value as needed
input_path, output_path = (
    "data/se_atom/raw/validation.csv",
    "data/se_atom/mofseq/validation.csv",
)
tokenizer = AutoTokenizer.from_pretrained("t5-small")

prepare_data(
    input_path,
    output_path,
    tokenizer,
    max_length=max_len,
)

In [None]:
# se_atom / train split: run prepare_data from the raw CSV to the mofseq CSV.
# prepare_data lives in src.prepare_data; its processing is not shown here.
max_len = 2000  # forwarded as max_length; adjust this value as needed
input_path, output_path = (
    "data/se_atom/raw/train.csv",
    "data/se_atom/mofseq/train.csv",
)
tokenizer = AutoTokenizer.from_pretrained("t5-small")

prepare_data(
    input_path,
    output_path,
    tokenizer,
    max_length=max_len,
)