In [1]:
import numpy as np
import pandas as pd
import os 
import pickle
from glob import glob

In [2]:
def get_data(folder):
    """Index the per-pid data files found in ``folder``.

    Three file families are expected, each named ``<prefix>-<pid>`` where
    ``<pid>`` is an integer: ``X5-exp-loc`` (features), ``y5-exp-base``
    (base labels) and ``y5-exp-loc`` (loc labels).

    Args:
        folder: Directory to scan.

    Returns:
        A ``(file_map, complete_pids)`` tuple. ``complete_pids`` is the set of
        pids for which all three files exist; ``file_map`` maps each of those
        pids to a dict with the keys ``"features"``, ``"base_labels"`` and
        ``"loc_labels"`` holding the corresponding file paths.
    """
    print("Getting Data...")

    def _pids_for(prefix):
        # Collect the integer suffix of every file named "<prefix>-<suffix>".
        found = set()
        for path in glob(os.path.join(folder, f"{prefix}-*")):
            suffix = path.split("-")[-1]
            try:
                found.add(int(suffix))
            except ValueError:
                # A stray file (backup, checkpoint, ...) must not abort the scan.
                print(f"Skipping file with non-numeric pid suffix: {path}")
        return found

    # glob() returns an empty list for a missing folder or no matches, so no
    # try/except is needed; the previous one swallowed the error and then
    # failed with a NameError on the undefined file lists.
    complete_pids = (
        _pids_for("X5-exp-loc")
        & _pids_for("y5-exp-base")
        & _pids_for("y5-exp-loc")
    )

    file_map = {
        pid: {
            "features": os.path.join(folder, f"X5-exp-loc-{pid}"),
            "base_labels": os.path.join(folder, f"y5-exp-base-{pid}"),
            "loc_labels": os.path.join(folder, f"y5-exp-loc-{pid}"),
        }
        for pid in complete_pids
    }

    return file_map, complete_pids

# Dataset directory, relative to the notebook's working directory.
folder = "../X5"
# NOTE(review): `map` shadows the built-in map() for the rest of the notebook.
# A later cell passes it to load_data_base(map, pids), so a rename has to be
# applied to every cell at once.
map, pids = get_data(folder)

# Number of pids that have all three files (features, base labels, loc labels).
len(pids)


Getting Data...


86

In [3]:
def load_pickle(file):
    """Unpickle and return the object stored at ``file``.

    Best-effort loader: any failure (missing file, truncated or corrupt
    pickle, ...) is reported on stdout and ``None`` is returned instead of
    raising, so callers must check for ``None``.

    NOTE: unpickling can execute arbitrary code; only use this on files from
    a trusted source.
    """
    loaded = None
    try:
        with open(file, "rb") as handle:
            loaded = pickle.load(handle)
    except Exception as e:
        print(f"Error loading file {file}:  {e}")
    return loaded

def load_data_base(file_map, complete_pids):
    """Load features and base labels for every pid, for the base CNN.

    For each pid the features, base labels and loc labels are loaded with
    ``load_pickle``. The loc label of every sample is appended to that
    sample's feature list, so each returned sample ends with its loc label.

    A pid is dropped (and reported on stdout) when any of its files failed to
    load or when the label counts do not match the number of samples.

    Args:
        file_map: Mapping ``pid -> {"features", "base_labels", "loc_labels"}``
            of file paths.
        complete_pids: Collection of pids to load. It is MUTATED in place:
            dropped pids are removed from it, so the caller's collection
            afterwards holds only the pids that were actually loaded.

    Returns:
        ``(full_X, full_y_base)``: the concatenated samples and base labels
        of all pids that were kept, in iteration order of ``complete_pids``.
    """
    print("Loading Data for base CNN...")

    full_X = []
    full_y_base = []
    drop_pids = []

    for pid in complete_pids:
        paths = file_map[pid]

        feats = load_pickle(paths["features"])
        ybase = load_pickle(paths["base_labels"])
        yloc = load_pickle(paths["loc_labels"])

        # load_pickle returns None on failure; without this guard the len()
        # calls below would raise TypeError and abort the whole load.
        if feats is None or ybase is None or yloc is None:
            print(f"Load error in pid: {pid}")
            drop_pids.append(pid)
            continue

        if len(feats) != len(ybase):
            print(f"Length error in ybase pid: {pid}")
            drop_pids.append(pid)
            continue

        if len(feats) != len(yloc):
            print(f"Length error in yloc pid: {pid}")
            drop_pids.append(pid)
            continue

        # Attach each sample's loc label as the last entry of that sample.
        for sample, loc_label in zip(feats, yloc):
            sample.append(loc_label)

        full_X.extend(feats)
        full_y_base.extend(ybase)

    # Removal happens after the loop so the collection is never modified
    # while it is being iterated.
    for pid in drop_pids:
        complete_pids.remove(pid)

    print(len(drop_pids))

    return full_X, full_y_base

        
# Build the base-CNN dataset. load_data_base also removes the pids it had to
# drop from `pids` in place, so `pids` shrinks as a side effect of this call.
x, y = load_data_base(map, pids)


Loading Data for base CNN...
Length error in ybase pid: 6502966
Length error in ybase pid: 6502976
Length error in ybase pid: 6502984
3


In [4]:
# Sanity check: number of samples, number of labels, and number of pids left.
for collection in (x, y, pids):
    print(len(collection))

29915
29915
83


In [None]:
def load_data_loc(file_map, complete_pids):
    """Load features and loc labels for every pid, for the loc CNN.

    A pid is skipped (and reported on stdout) when one of its files failed to
    load or when the number of loc labels differs from the number of samples.
    Extending both lists with mismatched lengths would silently shift every
    later sample against its label.

    Args:
        file_map: Mapping ``pid -> {"features", "loc_labels", ...}`` of file
            paths.
        complete_pids: Iterable of pids to load. Not modified.

    Returns:
        ``(full_X, full_y_loc)``: the concatenated samples and loc labels of
        all pids that were kept, in iteration order of ``complete_pids``.
    """
    print("Loading Data for loc CNN...")

    full_X = []
    full_y_loc = []

    for pid in complete_pids:
        feats = load_pickle(file_map[pid]["features"])
        yloc = load_pickle(file_map[pid]["loc_labels"])

        # load_pickle returns None on failure; extend(None) would raise.
        if feats is None or yloc is None:
            print(f"Load error in pid: {pid}")
            continue

        if len(feats) != len(yloc):
            print(f"Length error in yloc pid: {pid}")
            continue

        full_X.extend(feats)
        full_y_loc.extend(yloc)

    return full_X, full_y_loc

# The notebook-level names are `map` and `pids` (from the get_data cell);
# `file_map` only exists inside the functions, and both arguments are required.
x_loc, y_loc = load_data_loc(map, pids)

In [6]:
# Length of the first row of every sample; the longest one is the padding target.
puzzle_length = [len(sample[0]) for sample in x]

max_length = max(puzzle_length)
max_length

350

In [7]:
# Right-pad the rows of every sample with zeros up to max_length, in place.
# The last row of each sample is skipped: it is the loc label appended by
# load_data_base (presumably already max_length long - TODO confirm).
# Each row is padded by its own shortfall rather than by row 0's, so rows of
# differing length inside one sample still end up the same length.
for datapoint in x:
    for row in datapoint[:-1]:
        shortfall = max_length - len(row)
        if shortfall > 0:
            row.extend([0] * shortfall)



In [8]:
# Spot-check one padded row (sample 2, row 4); it should equal max_length.
len(x[2][4])


350

In [9]:
# Lengths of ALL rows of ALL samples after padding; the maximum is displayed
# so it can be compared with max_length.
puzzle_length = [len(row) for sample in x for row in sample]

max(puzzle_length)

350

In [10]:
# Convert the nested lists to arrays. Rows of unequal length would not give
# a regular 3-D array, so fail here with a clear message rather than later
# inside the split or the model.
X = np.array(x)
y = np.array(y)

assert X.ndim == 3, f"expected a 3-D feature array, got shape {X.shape}"
assert len(X) == len(y), f"{len(X)} samples but {len(y)} labels"

In [12]:
# Expected layout: (samples, rows per sample, padded row length).
X.shape

(29915, 9, 350)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples, stratified on the base labels.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=69, stratify=y
)

# Halve the held-out part. Note the naming: the first half returned by
# train_test_split becomes the test set and the second half the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=69, stratify=y_temp
)

# Report each split's size and its share of the full dataset.
for split_name, split_X in (("Train", X_train), ("Test", X_test), ("Val", X_val)):
    print(f"{split_name}: {len(split_X)} samples ({len(split_X)/len(X)*100:.1f}%)")


Train: 23932 samples (80.0%)
Test: 2991 samples (10.0%)
Val: 2992 samples (10.0%)
