## Filter classes with not enough support

This script expects a .hdf file containing simap2 to eggNOG5 matches. It then excludes all sequences whose eggNOG5 class has a support lower than a set threshold and saves the newly generated dataset.

In [9]:
import os
import pandas

In [10]:
# --- Input: HDF store and key holding the simap2 -> eggNOG5 matches ---
data_key = 'enog5_bact'
data_file = '/cube/proteinUniverse/data/simap2_eggnog_bacteria_singlelabel_keep-False.hdf'

# --- Threshold: minimum number of sequences a class needs to be kept ---
min_support = 10_000

# --- Output: target directory, file name (tagged with the threshold) and key ---
save_key = 'enog5_bact'
save_path = '/proj/cube/deepfam/data/eggNOG5'
save_file = f'simap2_eggnog5_bacteria_singlelabel_keep-False_classsize-{min_support}.hdf'

### Filter data

In [15]:
# Load the table of matches; the eggNOG5 class label is read from its 'enog5' column.
data = pandas.read_hdf(data_file, key=data_key)

# Support of an eggNOG5 class label = number of rows carrying that label.
data_counts = data['enog5'].value_counts()

# Keep only the class labels whose support reaches the configured threshold.
data_counts_mask = data_counts >= min_support
data_support = data_counts[data_counts_mask]

# Row-wise membership test on the label column itself (boolean Series aligned
# with `data`), then select the rows that belong to a sufficiently frequent class.
new_data_mask = data['enog5'].isin(data_support.index)
new_data = data[new_data_mask]

# Invariant: every retained row belongs to exactly one retained class, so the
# number of retained rows must equal the summed support of the retained classes.
assert len(new_data) == data_support.sum(), 'filtered row count does not match class support'

### Save filtered data

In [13]:
# Destination is <save_path>/<save_file>; the filtered frame is stored under
# `save_key`, opening the HDF file in write mode ('w').
path = os.path.join(save_path, save_file)
new_data.to_hdf(
    path,
    key=save_key,
    mode='w',
)

### Print statistics

In [14]:
# Summary of the filtering step: labelled rows and distinct classes, before and after.
# Series.count() on the label column yields a single scalar (non-null labels),
# so each statistic is printed on one line.
print('Original data items: ', data['enog5'].count())
print('Original data classes: ', len(data_counts.index))
print('Filtered data items: ', new_data['enog5'].count())
print('Filtered data classes: ', len(data_support.index))

Original data items:  enog5    13830448
dtype: int64
Original data classes:  206675
Filtered data items:  enog5    20017
dtype: int64
Filtered data classes:  2
