In [1]:
import os
from glob import glob
import pickle 
import pandas as pd
import json
import numpy as np



In [3]:
import RNA

seq_base = "AUGCAUGCU"
dotB = "(.......)"
energy = RNA.energy_of_struct(seq_base, dotB)
energy

6.199999809265137

In [2]:
def discover_files(folder):
    print(f"discovering files in {folder}")

    x_files = glob(os.path.join(folder, "X5-exp-loc-*"))
    ybase_files = glob(os.path.join(folder, "y5-exp-base-*"))
    yloc_files = glob(os.path.join(folder, "y5-exp-loc-*"))

    #pid is the puzzle id
    x_pids = [int(f.split("-")[-1]) for f in x_files]
    ybase_pids = [int(f.split("-")[-1]) for f in ybase_files]
    yloc_pids = [int(f.split("-")[-1]) for f in yloc_files]

    complete_pids = set(x_pids) & set(ybase_pids) & set(yloc_pids)

    print(f"X_files = {len(x_files)}")
    print(f"ybase files = {len(ybase_files)}")
    print(f"yloc files = {len(yloc_files)}")
    print(f"Complete files length  = {len(complete_pids)}")

    #organizing puzzles by puzzle ID

    file_map = {}
    for pid in complete_pids:
        file_map[pid] = {
            "features": os.path.join(folder, f"X5-exp-loc-{pid}"),
            "base_labels": os.path.join(folder, f"y5-exp-base-{pid}"),
            "loc_labels": os.path.join(folder, f"y5-exp-loc-{pid}")
        }

    return file_map, complete_pids

file_path = "../X5"
file_map, pids = discover_files(file_path)


discovering files in ../X5
X_files = 86
ybase files = 86
yloc files = 86
Complete files length  = 86


In [8]:
def load_pickle(filepath):
    try:
        with open(filepath, "rb") as f:
            return pickle.load(f)
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return None
    
def decode_base(one_hot):

    bases = ['A', 'U', 'G', 'C']
    idx = np.argmax(one_hot)
    return bases[idx]

def decode_location(one_hot):
    return np.argmax(one_hot)

def decode_structure(encoded):
    structure_map = {1: '.', 2: '(', 3: ')'}
    return ''.join([structure_map. get(int(x), '.') for x in encoded])

def decode_sequence(encoded):
    seq_map = {1: 'A', 2: 'U', 3: 'G', 4: 'C'}
    return ''.join([seq_map.get(int(x), 'N') for x in encoded])


def extract_features(file_map, pids):
    print("Extracting features")

    feature_info = []

    for pid in pids:
        X = load_pickle(file_map[pid]["features"])
        if X is None:
            continue

        X = np.array(X)
        if pid == 973010:
            print(X.shape)
            [print(f"Ch{i}: {X[60][i]}") for i in range(len(X[0]))]
            
        #print(X.shape)

        info = {
            'pid': pid,
            'num_samples': len(X),
            'num_channels': X[0].shape[0] if len(X) > 0 else 0,
            'puzzle_length': X[0].shape[1] if len(X) > 0 else 0,
        }
        
        # Sample first move for this puzzle
        if len(X) > 0:
            sample = X[0]
            sample2= X[1]
            info['sample_sequence'] = decode_sequence(sample[0])
            info["sample_sequence_next"] = decode_sequence(sample2[0])
            info['sample_current_structure'] = decode_structure(sample[1])
            info['sample_target_structure'] = decode_structure(sample[2])
            info['sample_current_energy'] = float(sample[3][0])
            info['sample_target_energy'] = float(sample[4][0])
            info['sample_location'] = int(decode_location(sample[0]))
        
        feature_info.append(info)

    df = pd.DataFrame(feature_info)
    #print(feature_info[0])
    return df 

df = extract_features(file_map, pids)
#df


Extracting features
(189, 8, 42)
Ch0: [1. 1. 2. 1. 2. 1. 2. 2. 3. 1. 1. 1. 1. 1. 1. 2. 1. 2. 1. 2. 1. 1. 1. 2.
 1. 2. 1. 1. 1. 2. 3. 1. 1. 1. 1. 2. 2. 1. 2. 1. 2. 1.]
Ch1: [1. 2. 2. 2. 2. 2. 2. 2. 1. 1. 1. 1. 1. 3. 3. 3. 3. 3. 3. 3. 1. 1. 2. 2.
 2. 2. 2. 2. 1. 1. 1. 1. 1. 1. 1. 3. 3. 3. 3. 3. 3. 1.]
Ch2: [1. 2. 2. 2. 2. 2. 2. 2. 1. 1. 1. 1. 3. 1. 3. 3. 3. 3. 3. 3. 1. 1. 2. 2.
 2. 2. 2. 2. 1. 2. 1. 1. 1. 1. 3. 3. 3. 3. 3. 3. 3. 1.]
Ch3: [-3.5999999  0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.       ]
Ch4: [1.20000005 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.      

In [11]:
# Add this new cell to compare puzzle IDs

def compare_puzzle_ids(teaching_puzzles_file, x5_folder):

    # Read teaching puzzle IDs
    with open('../movesets/teaching-puzzle-ids.txt') as f:
        content = f.readlines()
        # you may also want to remove whitespace characters like `\n` at the end of each line
        content = [x.strip() for x in content]
        content = [int(x) for x in content]
        progression = [6502966,6502968,6502973,6502976,6502984,6502985,6502993,
                        6502994,6502995,6502996,6502997,6502998,6502999,6503000] # 6502957
        content.extend(progression)

        # content = getPid()
        content.remove(6502966)
        content.remove(6502976)
        content.remove(6502984)

    teaching_pids = set(content)
    
    print(f" Teaching puzzles: {len(teaching_pids)} puzzles")
    
    # Get X5 file puzzle IDs
    x_files = glob(os.path.join(x5_folder, "X5-exp-loc-*"))
    x5_pids = set([int(f.split("-")[-1]) for f in x_files])
    
    print(f" X5 folder: {len(x5_pids)} puzzles")
    
    # Find common puzzles
    common_pids = teaching_pids & x5_pids
    teaching_only = teaching_pids - x5_pids
    x5_only = x5_pids - teaching_pids
    
    print(f"\nCommon puzzles: {len(common_pids)}")
    print(f" Teaching only: {len(teaching_only)}")
    print(f" X5 only: {len(x5_only)}")
    
    print(f"\nCommon puzzle IDs:")
    print(sorted(common_pids))
    
    return {
        'common': sorted(common_pids),
        'teaching_only': sorted(teaching_only),
        'x5_only': sorted(x5_only)
    }

# Run comparison
result = compare_puzzle_ids('movesets/teaching-puzzle-ids.txt', '../X5')

 Teaching puzzles: 85 puzzles
 X5 folder: 86 puzzles

Common puzzles: 83
 Teaching only: 2
 X5 only: 3

Common puzzle IDs:
[470862, 924481, 969610, 969616, 970889, 972995, 973010, 977775, 988108, 1005186, 1021173, 1074756, 2173229, 2438008, 2440447, 2442301, 2586215, 2817913, 2826515, 2836984, 2859701, 2863060, 2904837, 3124709, 3149691, 3174768, 3177248, 3245898, 3251484, 3294054, 3450464, 3450485, 3468526, 3468547, 3475185, 3484458, 3500158, 3522605, 3522647, 3536334, 3536373, 3536444, 3542351, 3633213, 3634228, 3634635, 3682204, 3682239, 3704385, 3704388, 3704391, 3857631, 3912739, 3951241, 3951286, 3951333, 4024177, 4062836, 4255265, 4258598, 4263778, 4265757, 4292000, 4312727, 4561479, 4960718, 5353710, 5359220, 5721020, 5723240, 5797240, 6396259, 6502968, 6502973, 6502985, 6502993, 6502994, 6502995, 6502996, 6502997, 6502998, 6502999, 6503000]


In [12]:
folder = "X5"
x_files = glob(os.path.join(folder, "X5-exp-loc-*"))
ybase_files = glob(os.path.join(folder, "y5-exp-base-*"))
yloc_files = glob(os.path.join(folder, "y5-exp-loc-*"))
#pid is the puzzle id
x_pids = [int(f.split("-")[-1]) for f in x_files]
ybase_pids = [int(f.split("-")[-1]) for f in ybase_files]
yloc_pids = [int(f.split("-")[-1]) for f in yloc_files]
complete_pids = set(x_pids) & set(ybase_pids) & set(yloc_pids)
print(f"X_files = {len(x_files)}")
print(f"ybase files = {len(ybase_files)}")
print(f"yloc files = {len(yloc_files)}")
print(f"Complete files length  = {len(complete_pids)}")
file_map = {}
for pid in complete_pids:
    file_map[pid] = {
        'features': os.path.join(folder, f'X5-exp-loc-{pid}'),
        'base_labels': os.path.join(folder, f'y5-exp-base-{pid}'),
        'loc_labels': os.path.join(folder, f'y5-exp-loc-{pid}')
    }


X_files = 0
ybase files = 0
yloc files = 0
Complete files length  = 0


In [13]:
X = []
for pid in complete_pids:
        x = load_pickle(file_map[pid]["features"])
        
        if x is None:
            continue

        x = np.array(x)
        X.append(x)
    


In [54]:
X[13][0][0].shape

(29,)

In [4]:
X[13][0]

NameError: name 'X' is not defined

In [58]:
max = 0
for i in range(len(X)):
    if len(X[i][0][0]) > max:
        index = i
        max = len(X[i][0][0])

max, index




(350, 81)

In [57]:
for i in range(len(X)):
    print(X[i].shape)

(94, 8, 73)
(154, 8, 40)
(691, 8, 56)
(124, 8, 109)
(1311, 8, 31)
(325, 8, 70)
(35, 8, 31)
(17, 8, 111)
(174, 8, 35)
(2886, 8, 104)
(323, 8, 31)
(74, 8, 29)
(429, 8, 12)
(6, 8, 29)
(31, 8, 54)
(3454, 8, 36)
(1897, 8, 51)
(84, 8, 29)
(2722, 8, 105)
(1097, 8, 38)
(1303, 8, 108)
(3011, 8, 22)
(798, 8, 80)
(1104, 8, 80)
(359, 8, 56)
(2756, 8, 103)
(1306, 8, 104)
(196, 8, 44)
(17, 8, 9)
(22, 8, 116)
(160, 8, 15)
(151, 8, 44)
(17, 8, 8)
(4, 8, 34)
(103, 8, 36)
(199, 8, 73)
(215, 8, 28)
(243, 8, 195)
(142, 8, 149)
(498, 8, 28)
(40, 8, 44)
(189, 8, 42)
(6, 8, 34)
(88, 8, 28)
(35, 8, 55)
(167, 8, 42)
(39, 8, 16)
(12, 8, 7)
(71, 8, 37)
(142, 8, 66)
(6, 8, 57)
(84, 8, 40)
(106, 8, 42)
(25, 8, 28)
(19, 8, 28)
(481, 8, 186)
(25, 8, 46)
(34, 8, 11)
(119, 8, 15)
(56, 8, 36)
(140, 8, 11)
(89, 8, 31)
(26, 8, 156)
(58, 8, 30)
(196, 8, 104)
(445, 8, 42)
(18, 8, 34)
(161, 8, 32)
(235, 8, 51)
(118, 8, 31)
(302, 8, 51)
(37, 8, 39)
(162, 8, 55)
(45, 8, 64)
(205, 8, 13)
(36, 8, 70)
(53, 8, 77)
(76, 8, 12)
(56

In [59]:
X[81].shape

(1034, 8, 350)

In [65]:
Y = []

for pid in complete_pids:
    y = load_pickle(file_map[pid]["loc_labels"])
    if y is None:
        continue

    y = np.array(y)

    Y.append(y)

len(Y)

86

In [66]:
Y[0][0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [62]:
Y[0].shape

(94, 4)