In [1]:
from src.dataset.Dataset import Dataset
from os.path import join,basename
from os import makedirs
import json
from tqdm import tqdm
from experiments import DataRefiner
from dataset.DataPreprocessor import DataPreprocessor
from labelregions.LabelRegionLoader import LabelRegionLoader
from experiments.CrossValidationTraining import CrossValidationTraining
import experiments.EdgePropabilityCallback as EdgePropabilityCallback

In [2]:
# Root folder that holds one sub-directory per dataset (path is relative to this notebook).
DATA_DIR = "../data"

In [3]:
# One Dataset handle per corpus; each lives in DATA_DIR/<name> and is constructed
# in the order Deco, FusTe, Test.
DECO, FUSTE, TEST = (
    Dataset(join(DATA_DIR, corpus_name), corpus_name)
    for corpus_name in ("Deco", "FusTe", "Test")
)

In [4]:
# Index the datasets by name and select the one evaluated in this notebook.
datasets = {ds.name: ds for ds in (DECO, FUSTE, TEST)}
dataset = datasets['Deco']

# Preprocess the annotations of the selected dataset.
data_preprocessor = DataPreprocessor(DATA_DIR, "preprocessed_annotations_elements.json")
data_preprocessor.preprocess(dataset.name)

In [5]:
# Run the project's refinement step on the selected dataset. What it changes is defined
# in experiments.DataRefiner, which is not visible in this notebook.
DataRefiner.refine(dataset)

In [6]:
# Label region loader with `introduce_noise` disabled. The flag is read again further
# down to name the output directory and is recorded in config.json.
label_region_loader = LabelRegionLoader(introduce_noise=False)

In [7]:
# Default edge-mutation probability callback provided by the project.
# NOTE(review): not referenced by any cell visible in this notebook; the genetic search
# below reads the module-level `edge_mutation_probability_callback` instead. Confirm
# that this is intended.
edge_probability_callback = EdgePropabilityCallback.default_edge_mutation_probability_callback
# Alias for the experiment class. NOTE(review): also unused in the visible cells, which
# call CrossValidationTraining directly.
experiment_class = CrossValidationTraining

In [8]:
import random
import uuid

import numpy

# Seed recorded in config.json and embedded in the output directory name.
# Previously the value 1 was only written down and never applied, so the fold shuffling
# (and anything else drawing from the global RNGs) differed between runs.
SEED = 1
random.seed(SEED)
# Also seed NumPy's legacy global RNG in case project code draws from it.
numpy.random.seed(SEED)

# Number of cross-validation folds.
k = 10
# Number of weight-tuning (training) rounds per fold; recorded in config.json.
weight_tuning_rounds = 10
# Number of genetic-search runs that are averaged per spreadsheet.
search_rounds = 10

# Every run writes into its own directory: <noise flag>_<seed>_<unique run id>.
noise_part = 'noise' if label_region_loader.introduce_noise else 'no_noise'
run_id = uuid.uuid1().hex
dir_name = f"{noise_part}_{SEED}_{run_id}"
out_path = join('../jupy/output/', 'NoImprovement', dataset.name, dir_name)
makedirs(out_path, exist_ok=True)

In [9]:
from random import shuffle
from typing import List, Dict
from numpy import array_split

def get_folds(seed=None) -> List[Dict[str, List]]:
    """Creates folds for cross validation while balancing single and multi table files per fold.

    The single-table and multi-table keys of the module-level ``dataset`` are shuffled
    independently, each split into ``k`` chunks (module-level ``k``), and the i-th chunks
    of both groups together form the test set of fold i. The train set of a fold is every
    dataset key that is not in its test set.

    Args:
        seed: Optional seed for a private random generator. When ``None`` (default) the
            global ``random`` state is used, exactly as before.

    Returns:
        A list of ``k`` dicts, each with a ``"train"`` and a ``"test"`` list of keys.
    """
    # Local import keeps the optional seeding self-contained in this cell.
    from random import Random

    # Shuffle copies: shuffling the dataset's own lists in place would silently change
    # dataset state and make repeated calls depend on how often this cell was run.
    single_table_keys = list(dataset.single_table_keys)
    multi_table_keys = list(dataset.multi_table_keys)

    if seed is None:
        shuffle(single_table_keys)
        shuffle(multi_table_keys)
    else:
        rng = Random(seed)
        rng.shuffle(single_table_keys)
        rng.shuffle(multi_table_keys)

    single_table_chunks = array_split(single_table_keys, k)
    multi_table_chunks = array_split(multi_table_keys, k)

    # De-duplicated keys in dataset order, so the train lists are deterministic
    # (iterating a set of strings depends on the interpreter's hash seed).
    all_keys = list(dict.fromkeys(dataset.keys))

    folds = []
    for s_chunk, m_chunk in zip(single_table_chunks, multi_table_chunks):
        # Combine single and multi table chunks into the test fold; ``tolist`` converts
        # the NumPy scalars produced by ``array_split`` back into plain Python values.
        test_chunk = s_chunk.tolist() + m_chunk.tolist()
        held_out = set(test_chunk)
        # Train fold is all keys without the test fold
        train_chunk = [key for key in all_keys if key not in held_out]
        folds.append({"train": train_chunk, "test": test_chunk})
    return folds

def dump(file_name, data, subdir=""):
    """Dump given data to a JSON file below the run's output directory.

    Args:
        file_name: Name of the JSON file to write.
        data: JSON-serialisable object to store.
        subdir: Optional sub-directory of ``out_path``; created when missing.
    """
    target_dir = join(out_path, subdir)
    makedirs(target_dir, exist_ok=True)
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file must be opened
    # as UTF-8 explicitly; the platform default encoding may not be able to encode them.
    with open(join(target_dir, file_name), "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [10]:
# Build the cross-validation folds once and persist them together with the run settings.
folds = get_folds()

dump(
    "config.json",
    {
        "dataset": dataset.name,
        "noise": label_region_loader.introduce_noise,
        "k": k,
        "weight_tuning_rounds": weight_tuning_rounds,
        "search_rounds": search_rounds,
        "seed": 1,
    },
)
# Folds are stored under their index.
dump("folds.json", dict(enumerate(folds)))


In [11]:
from src.experiments import Analyser
from src.search.GeneticSearchConfiguration import GeneticSearchConfiguration
from src.search.GeneticSearch import GeneticSearch
from src.graph.SpreadSheetGraph import SpreadSheetGraph
from src.search.FitnessRater import FitnessRater, get_initial_weights
from labelregions.BoundingBox import BoundingBox
from scipy.optimize import minimize, Bounds
from graph.Edge import Edge
from typing import Union, Callable
def edge_mutation_probability_callback(x: Edge) -> int:
    """Constant callback: returns 1 for every edge it is given."""
    return 1

def genetic_search_accuracy(
        ground_truth: List[BoundingBox],
        sheet_graph: SpreadSheetGraph,
        rater: FitnessRater,
):
    """Runs the genetic search `search_rounds` times and returns the mean accuracy.

    Each run's table definitions are printed and then scored against `ground_truth`
    with the Jaccard-index based accuracy from `Analyser`.
    """
    configuration = GeneticSearchConfiguration(
        sheet_graph,
        edge_mutation_probability_callback=edge_mutation_probability_callback,
    )
    search = GeneticSearch(sheet_graph, rater, configuration)

    # Run every search first; printing and scoring happen afterwards
    results = []
    for _ in range(search_rounds):
        results.append(search.run())

    print('------------------result---------------------')
    for search_result in results:
        for table_definition in search_result.get_table_definitions():
            print(table_definition)
    print('---------------------------------------------')

    accuracies = []
    for search_result in results:
        accuracies.append(
            Analyser.accuracy_based_on_jacard_index(
                ground_truth, search_result.get_table_definitions()
            )
        )
    return sum(accuracies) / len(accuracies)

def process_fold(
        fold: Dict[str, List],
        fold_num: int,
        weights_path: str = '/Users/lizixuan/Desktop/output/NoImprovement/Deco/no_noise_1_fe301df8eb2111eda40ee0be0345a1e0/fold_9/fold_9_weights.json',
) -> float:
    """Evaluates one fold of the cross validation and returns the accuracy of the fold.

    Instead of training, previously tuned weights are read from ``weights_path``. The file
    must contain a ``"weights_and_errors"`` list whose entries each carry an
    ``"error_rate"``; the entries are combined with
    ``CrossValidationTraining.weighted_average``. Every key in ``fold["test"]`` is then
    evaluated (exhaustive search for graphs with at most 10 nodes, genetic search
    otherwise), and the weights and per-file accuracies are written to the
    ``fold_<fold_num>`` sub-directory of the run output.

    Args:
        fold: Dict with the ``"train"`` and ``"test"`` keys of this fold; only ``"test"``
            is read here.
        fold_num: Index of the fold, used for file names and progress output.
        weights_path: JSON file with the stored training results.
            NOTE(review): the default is an absolute path on one developer machine and
            points at the weights of fold 9 regardless of ``fold_num``. Pass the path that
            belongs to the fold being evaluated, or confirm that reusing fold 9 is intended.

    Returns:
        Mean accuracy over the test files of the fold.

    Raises:
        ZeroDivisionError: if ``fold["test"]`` is empty.
    """
    # Training is skipped in this notebook. To retrain, collect
    # train(fold["train"], fold_num, i) for i in range(weight_tuning_rounds) instead of
    # loading the stored results.

    # Load the weights
    with open(weights_path) as file:
        stored_training_results = json.load(file)
    # Unwrap the list of per-round results. Dumping the whole loaded dict again would nest
    # it one level deeper, so the written file could not be read back by this function.
    weights_and_errors = stored_training_results['weights_and_errors']
    for entry in weights_and_errors:
        print(entry['error_rate'])
    # Average the training results weighted by their error
    weights = CrossValidationTraining.weighted_average(weights_and_errors)

    dump(
        f"fold_{fold_num}_weights.json",
        {"weights_and_errors": weights_and_errors, "weights": weights},
        subdir=f"fold_{fold_num}"
    )
    # Test accuracies on gold standard
    # Disable any noise (this changes the shared, module-level loader)
    label_region_loader.introduce_noise = False
    file_accuracies = {}
    for key in tqdm(fold["test"], desc=f"Test Set Validation of fold {fold_num}"):
        # Get ground truth data
        print('#########################################################################')
        sheet_data = dataset.get_specific_sheetdata(key, label_region_loader)
        sheet_graph = SpreadSheetGraph(sheet_data)
        ground_truth = sheet_graph.get_table_definitions()

        print('------------------Truth---------------------')
        for item in ground_truth:
            print(item)
        print('---------------------------------------------')

        print('*********************************')
        print(f'sheet_data:{sheet_data}')
        print(f'sheet_graph:{sheet_graph}')
        print(f'ground_truth:{ground_truth}')
        print('*********************************')

        # Evaluate the prediction
        rater = FitnessRater(weights)
        # Small graphs are searched exhaustively, larger ones with the genetic search
        if len(sheet_graph.nodes) <= 10:
            accuracy = CrossValidationTraining.exhaustive_search_accuracy(ground_truth, sheet_graph, rater)
        else:
            accuracy = genetic_search_accuracy(ground_truth, sheet_graph, rater)
        file_accuracies[key] = accuracy
        print('#########################################################################')
    # Average fold accuracies of test data
    fold_accuracy = sum(file_accuracies.values()) / len(file_accuracies)
    dump(
        f"fold_{fold_num}_file_accuracies.json",
        {"fold_file_accuracies": file_accuracies, "fold_accuracy": fold_accuracy},
        subdir=f"fold_{fold_num}",
    )
    return fold_accuracy

def train(train_keys: List[str], fold_num: int, training_round: int) -> Dict[str, Union[List[float], float]]:
    """Tunes the rater weights with SLSQP on the given keys.

    Returns a dict with the tuned ``"weights"`` and their ``"error_rate"``, i.e. the share
    of generated alternative partitions that the tuned rater scores lower than the
    original partition of their graph.
    """
    training_subdir = join(f"fold_{fold_num}", "training")

    # Build every graph first, then generate the alternatives in a second pass
    graphs = []
    for key in train_keys:
        graphs.append(SpreadSheetGraph(dataset.get_specific_sheetdata(key, label_region_loader)))

    # Create more alternative partitions on multi table files (10 alternatives per table in file)
    partitions = {
        graph: CrossValidationTraining.generate_alternatives(graph, 10 * len(graph.get_components()))
        for graph in graphs
    }
    dump(
        f"fold_{fold_num}_training_round_{training_round}_input.json",
        {str(graph.sheet_data): alternatives for graph, alternatives in partitions.items()},
        subdir=training_subdir,
    )

    initial_weights = get_initial_weights()
    # The rater is created once outside the optimiser so its caching can be reused
    rater = FitnessRater(initial_weights)
    # Minimise the objective function with SQP
    optimisation = minimize(
        CrossValidationTraining.objective_function,
        initial_weights,
        args=(partitions, rater),
        method="SLSQP",
        bounds=Bounds(0, 1000),
    )
    weights = list(optimisation.x)
    rater.weights = weights

    # Error rate components: how many alternatives score below the original partition
    total_alternative_count = 0
    better_than_original_alternative_count = 0
    for graph, alternatives in partitions.items():
        original_score = rater.rate(graph, graph.edge_toggle_list)
        total_alternative_count += len(alternatives)
        better_than_original_alternative_count += sum(
            1 for alternative in alternatives
            if rater.rate(graph, alternative) < original_score
        )

    dump(
        f"fold_{fold_num}_training_round_{training_round}_result.json",
        {
            "weights": weights,
            "total_alternative_count": total_alternative_count,
            "better_than_original_alternative_count": better_than_original_alternative_count,
        },
        subdir=training_subdir,
    )
    error_rate = better_than_original_alternative_count / total_alternative_count
    return {"weights": weights, "error_rate": error_rate}

In [12]:
# Evaluate every fold. Results are appended one by one so that the accuracies of the
# folds finished so far are still available if the run is interrupted.
fold_accuracies = []
# tqdm wraps the list itself (not the enumerate iterator) so it knows the number of folds
# and can show a real progress bar instead of a bare iteration counter.
for fold_num, fold in enumerate(tqdm(folds, desc="Cross-validation folds")):
    fold_accuracies.append(process_fold(fold, fold_num))

0it [00:00, ?it/s]
Test Set Validation of fold 0:   0%|          | 0/63 [00:00<?, ?it/s][A

0.021444444444444443
0.019222222222222224
0.022555555555555554
0.022555555555555554
0.02188888888888889
0.025444444444444443
0.021
0.023555555555555555
0.021444444444444443
0.022444444444444444
#########################################################################
------------------Truth---------------------
"top": 5
"left": 1
"bottom": 37
"right": 1
---------------------------------------------
*********************************
sheet_data:Sheetdata(JAN01 of ../data/Deco/xls/john_forney__15502__SCHED2001.xlsx
sheet_graph:<src.graph.SpreadSheetGraph.SpreadSheetGraph object at 0x136a8b3a0>
ground_truth:[<labelregions.BoundingBox.BoundingBox object at 0x136a8b370>]
*********************************



Test Set Validation of fold 0:   2%|▏         | 1/63 [01:36<1:39:50, 96.63s/it][A

------------------result---------------------
"top": 5
"left": 1
"bottom": 37
"right": 1
"top": 6
"left": 2
"bottom": 8
"right": 2
"top": 5
"left": 1
"bottom": 37
"right": 1
"top": 6
"left": 2
"bottom": 8
"right": 2
"top": 5
"left": 1
"bottom": 37
"right": 1
"top": 6
"left": 2
"bottom": 8
"right": 2
"top": 5
"left": 1
"bottom": 37
"right": 1
"top": 6
"left": 2
"bottom": 8
"right": 2
"top": 5
"left": 1
"bottom": 37
"right": 1
"top": 6
"left": 2
"bottom": 8
"right": 2
"top": 5
"left": 1
"bottom": 37
"right": 1
"top": 6
"left": 2
"bottom": 8
"right": 2
"top": 5
"left": 1
"bottom": 37
"right": 1
"top": 6
"left": 2
"bottom": 8
"right": 2
"top": 5
"left": 1
"bottom": 37
"right": 1
"top": 6
"left": 2
"bottom": 8
"right": 2
"top": 5
"left": 1
"bottom": 37
"right": 1
"top": 6
"left": 2
"bottom": 8
"right": 2
"top": 5
"left": 1
"bottom": 37
"right": 1
"top": 6
"left": 2
"bottom": 8
"right": 2
---------------------------------------------
##########################################################

Test Set Validation of fold 0:   2%|▏         | 1/63 [06:26<6:39:42, 386.81s/it]
0it [06:26, ?it/s]


KeyboardInterrupt: 