# Remove Fields

This notebook removes the specified fields from all files in the `json` folder. It is intended primarily for creating sample data sets for testing, but we could consider documenting it fully and keeping it in the release version of the module.

## Configuration

Configure a list of fields to remove. By default, the folder containing the json files is the project's `json` directory, but you can configure another folder (relative to the project root). Errors will be logged to a file saved with the name you configure.

In [None]:
# Configuration
# Names of the fields that are deleted from each json document
fields_to_remove = ['features', 'bag_of_words']
json_dir = 'project_data/json' # Relative to the project directory
# Name of the file that read/write errors are appended to
log_file = 'field_removal_log.txt'

## Setup

In [None]:
# Python imports
import json
import os
import ipywidgets
from IPython.display import display, HTML
from ipywidgets import HBox, IntProgress, Label
from pathlib import Path    
from time import time

# Get the project directory
# os.getcwd() is the plain-Python equivalent of the IPython `%pwd` magic, so
# this cell also works when the notebook is exported to a script or linted.
current_dir = os.getcwd()
# The project root is two levels above the notebook's working directory.
project_dir = str(Path(current_dir).parent.parent)
# os.path.join returns its second argument unchanged when that argument is
# already absolute, so re-running this cell does not nest the path further.
json_dir = os.path.join(project_dir, json_dir)

# Job class
class Job():
    """Remove a set of fields from every json file in a folder.

    Each file is read, stripped of the configured fields and written back in
    place. Files that cannot be read, parsed or written are recorded in the
    log file and skipped; they do not stop the job.
    """

    def __init__(self, json_dir='project_data/json', fields_to_remove=('features', 'bag_of_words'),
                 log_file='field_removal_log.txt'):
        """Initialise the job object.

        Args:
            json_dir: Folder whose entries are processed. The folder is
                listed once, when the job is created.
            fields_to_remove: Names of the top-level fields to delete from
                each document.
            log_file: Path of the file that errors are appended to. The job
                only uses the value passed here; it does not read any
                module-level variable.
        """
        self.json_dir = json_dir
        # Copy so the job never shares a list with the caller or the default.
        self.fields_to_remove = list(fields_to_remove)
        self.log_file = log_file
        self.file_list = os.listdir(json_dir)
        self.num_files = len(self.file_list)
        self.this_iter = 0

    def log_error(self, filename, error_type='read'):
        """Append an error entry for `filename` to the job's log file."""
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(filename + ' - ' + error_type + ' error\n')

    def read_file(self, filename):
        """Read a json doc from file and return the parsed value.

        Returns None, after logging a 'read' error, when the file cannot be
        opened, decoded as UTF-8 or parsed as json.
        """
        try:
            with open(os.path.join(self.json_dir, filename), 'r', encoding='utf-8') as f:
                return json.load(f)
        except (OSError, ValueError):
            # ValueError covers both json and unicode decoding failures.
            self.log_error(filename, error_type='read')
            return None

    def remove_fields(self, doc):
        """Remove the configured fields from a dict if they exist.

        The dict is modified in place and returned. Anything that is not a
        dict (for example None or a list) is returned unchanged.
        """
        if not isinstance(doc, dict):
            return doc
        for field in self.fields_to_remove:
            doc.pop(field, None)
        return doc

    def write_file(self, doc, filename):
        """Write a document to a json file, logging a 'write' error on failure."""
        # Serialise before opening: mode 'w' truncates the file, so a failed
        # serialisation must not be allowed to destroy the existing content.
        payload = json.dumps(doc)
        try:
            with open(os.path.join(self.json_dir, filename), 'w', encoding='utf-8') as f:
                f.write(payload)
        except OSError:
            self.log_error(filename, error_type='write')

    def run(self):
        """Process every file in the folder and display progress."""
        timer = Timer()
        pbar = IntProgress(min=0, max=100) # instantiate the progress bar
        percent = ipywidgets.HTML(value='0%')
        display(HBox([Label('Removing fields...'), pbar, percent]))
        # Restart the counter so the job can be run more than once.
        self.this_iter = 0
        for file in self.file_list:
            doc = self.read_file(file)
            # A failed read has already been logged; leave that file untouched.
            if doc is not None:
                self.write_file(self.remove_fields(doc), file)
            # Count the file before computing the percentage so the bar shows
            # the number of files completed.
            self.this_iter += 1
            progress = int(100. * self.this_iter / self.num_files)
            pbar.value = progress
            percent.value = '{0}%'.format(progress)
        # Also covers the empty-folder case, where the loop never runs.
        pbar.value = 100
        percent.value = '100%'
        display(HTML('<p style="color: green;">Done!</p>'))
        display('Time elapsed: %s' % timer.get_time_elapsed())


# Timer class
class Timer:
    """Stopwatch that starts counting when it is created."""

    def __init__(self):
        """Record the creation time as the start of the measurement."""
        self.start = time()

    def get_time_elapsed(self):
        """Return the time since creation as an ``HH:MM:SS`` string.

        Fractional seconds are dropped. The hours field is zero-padded to
        two digits but is not capped.
        """
        elapsed = time() - self.start
        minutes, seconds = divmod(elapsed, 60)
        hours, minutes = divmod(minutes, 60)
        parts = (int(hours), int(minutes), int(seconds))
        return "{:02d}:{:02d}:{:02d}".format(*parts)
    
# Tell the user that the setup cell finished and which cell to run next
display(HTML('<p style="color: green;">Setup complete. Run the next cell to remove the json fields.</p>'))

## Remove Fields

In [None]:
# Run the script
# Pass the configured log file name explicitly so the job receives it as an
# argument instead of depending on a module-level variable.
job = Job(json_dir, fields_to_remove, log_file)
job.run()