# Inspect datasets for the parameter sharing model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# read this file:
# data/fingerprints/mid_class_train.npy

mid_class_train_fp = np.load('../data/fingerprints/mid_class_train.npy')

In [3]:
mid_class_train_fp.shape

(271394, 4096)

In [4]:
mid_class_train_df = pd.read_parquet('../data/mid_class_train.parquet')

In [5]:
mid_class_train_df.shape

(271394, 14)

In [9]:
set(mid_class_train_df['super class'])

{'1', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9'}

In [12]:
mid_class_train_df['super class'].astype(np.int8)

2         9
3         7
4         1
5         8
7         4
         ..
356901    1
356902    2
356903    9
356904    1
356905    1
Name: super class, Length: 271394, dtype: int8

In [5]:
# orderly-mid-class dataset size:
mid_class_train_path = '../data/mid_class_train.parquet'
mid_class_train_df = pd.read_parquet(mid_class_train_path)
mid_class_test_path = '../data/mid_class_test.parquet'
mid_class_test_df = pd.read_parquet(mid_class_test_path)

print('total df size: ', mid_class_train_df.shape[0] + mid_class_test_df.shape[0])

total df size:  306427


# Check the datasets

In [5]:
train = pd.read_parquet('../data/super_class_cc_train.parquet')
test = pd.read_parquet('../data/super_class_cc_test.parquet')

In [30]:
train['super class'].value_counts()

1     81799
6     61144
2     40993
7     33476
9     29514
4     10423
8      8241
10     8151
5      3991
11      508
Name: super class, dtype: int64

In [32]:
test['super class'].value_counts()

3    28187
Name: super class, dtype: int64

In [6]:
train.shape, test.shape

((278240, 14), (28187, 14))

In [26]:
data_test = (train['agent_000'], train['agent_001'])
data_train = (test['agent_000'], test['agent_001'])
top_n=3

In [27]:
data_train[0].shape

(28187,)

In [28]:
data_train_np = np.array(data_train).transpose()
data_test_np = np.array(data_test).transpose()
data_train_np = np.where(data_train_np == None, "NULL", data_train_np)
data_test_np = np.where(data_test_np == None, "NULL", data_test_np)
data_train_np = np.sort(data_train_np, axis=1)
data_test_np = np.sort(data_test_np, axis=1)

data_train_list = [tuple(row) for row in data_train_np]
data_test_list = [tuple(row) for row in data_test_np]

row_counts = Counter(data_train_list)

# Find the most frequent row and its count
most_frequent_rows = row_counts.most_common(top_n)

# Count the occurrences of the most frequent row in data_train_np
correct_predictions = 0
for row in most_frequent_rows:
    correct_predictions += data_test_list.count(row[0])

print(correct_predictions / len(data_test_list), most_frequent_rows)

0.2194867740080506 [(('Cl', 'NULL'), 2535), (('O=C([O-])[O-]', '[K+]'), 1999), (('[Na+]', '[OH-]'), 1609)]


In [22]:
most_frequent_rows

[(('Cl', 'NULL'), 25097),
 (('O=C([O-])[O-]', '[K+]'), 19579),
 (('[Na+]', '[OH-]'), 16394)]

In [29]:
most_frequent_rows

[(('Cl', 'NULL'), 2535),
 (('O=C([O-])[O-]', '[K+]'), 1999),
 (('[Na+]', '[OH-]'), 1609)]

In [24]:
train.shape

(278240, 14)

In [25]:
24499/278240

0.08804988499137435

In [17]:
test.shape

(28187, 14)

In [18]:
2421/28187

0.08589065881434704