import pandas as pd
import gradio as gr
import tools.funcs as funcs
"""
All development in versions 1.3 and before credited to Daniel Mead. Further improvements (by Sean Pedrick-Case) not annotated here - see Github commit messages.
---
V1.3
Added ability to customise frame names and number
V1.1
Added ground truth frame, includes original, un-noised entries. These are no longer present in the original frames
This means there is roughly a 25% chance of an entry being included un-noised.
There is now 0% chance of 0 repeats, 30% of 1, 30% of 2, 20% 3 etc. to account for the removal of ground truth values.
V1.0
Generates 10,000 people
50/50 gender split
10% include titles on the first name (only one first name at a time at the moment)
Birth Dates are all post 1950, randomly distributed
Addresses are randomly generated by faker.
Cities are: 20 fake cities (1), Cardiff (2), Birmingham (3), Manchester (3), Durham (1), and London (6)
Numbers in brackets above indicate weightings. IE 5x more likely to be in London than random city.
Postcodes are randomly generated, and are unrelated to addresses (no way of linking these for realism, currently)
For each repeat, all property have a 20% change to have one type of noise added (via typo library).
Also have a further 20% chance for a second type of noise if one is already added (4% chance total).
Each property has an equal chance and they are non-exclusive. Ie. 0.2^n chance that n properties have errors.
The list of data is then split into 3 randomly sized (20-40% each) frames, which are randomised.
Each item contains a property called group which indicates its ground truth grouping.
"""
#### Defaults
num_people = 1000 # default number of people in each data frame
frames = 2 # default number of data frames
def update_slider_if_value_greater_than_1(slider_1_value, slider_2_value):
    # Keep the overlap and duplicate proportions summing to at most 1: when the
    # slider that just moved pushes the total over 1, cap the other slider at
    # the remainder; otherwise leave it where it is.
    if (slider_1_value + slider_2_value) > 1:
        out_value = round(1 - slider_1_value, 1)
    else:
        out_value = round(slider_2_value, 1)
    return gr.Slider.update(value=out_value)
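# For example, if percentage_overlap is moved to 0.7 while percentage_duplicates
# is at 0.5, the sum (1.2) exceeds 1, so the duplicates slider is reset to
# round(1 - 0.7, 1) = 0.3, keeping the two proportions summing to at most 1.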
def update_dataframe_value(frames_number):
    # Rebuild the "people per frame" table whenever the number-of-frames slider
    # changes: one column per data frame, each initialised to 0.
    # Dictionary where keys are column names (frame numbers) and values are the column data
    data_dict = {frame: [0] for frame in range(1, frames_number + 1)}
    df = pd.DataFrame(data_dict)
    return df
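# For example, update_dataframe_value(3) returns a one-row DataFrame with columns
# 1, 2 and 3, each containing 0 - the starting point for the per-frame people counts:
#    1  2  3
# 0  0  0  0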
# Create the gradio interface
block = gr.Blocks(theme=gr.themes.Base())

with block:
    gr.Markdown(
    """
    # Create dummy datasets
    """)
    with gr.Accordion(label="Number of data frames and people", open=True):
        number_of_frames = gr.Slider(label="Number of data frames", value=frames, minimum=1, maximum=10, step=1)
        number_of_people = gr.Dataframe(label="Number of people in each data frame", col_count=(frames, 'dynamic'), datatype="number", row_count=(1, 'fixed'))

    with gr.Accordion(label="Repeat people in or across data frames. Default = no repeated people", open=False):
        percentage_overlap = gr.Slider(label="What proportion of people (of the smallest data frame) are shared across all data frames?", value=0, minimum=0, maximum=1, step=0.05)
        not_common_appear_once = gr.Dropdown(label="Are people (not included in the group defined above) unique across all data frames? If no, people will be randomly sampled from the master list and may appear multiple times or not at all across all data frames.", value="Yes", choices=["Yes", "No"])
        percentage_duplicates = gr.Slider(label="What proportion of each data frame is made up of duplicate people?", value=0, minimum=0, maximum=0.9, step=0.05)
    with gr.Accordion(label="Add typos or missing values to person details. Default = None", open=False):
        noise_prob = gr.Slider(label="Probability of typos in each field", minimum=0, maximum=1, value=0, step=0.05)
        missing_prob = gr.Slider(label="Probability of each field being blank", minimum=0, maximum=1, value=0, step=0.05)
        random_seed = gr.Number(label="Choose random seed", value=42)

    with gr.Accordion(label="Additional fields to add to each data frame", open=False):
        add_additional_fields = gr.CheckboxGroup(label="Which additional fields are needed?",
                                                 choices=["First name", "Last name", "Date of birth", "Address", "Title", "Full name", "Email", "Phone number"],
                                                 value=["First name", "Last name", "Date of birth", "Address"])
        # email = fake.providers.internet.free_email()
        # phone = fake.providers.phone_number()
        # title = fake.prefix_male()
        # title = fake.prefix_female()
    create_df_btn = gr.Button("Create dummy datasets")

    with gr.Row():
        output_summary = gr.Textbox(label="Output result")
        output_file = gr.File(label="Output file")
    # Updates to components
    percentage_overlap.change(fn=update_slider_if_value_greater_than_1, inputs=[percentage_overlap, percentage_duplicates], outputs=[percentage_duplicates])
    percentage_duplicates.change(fn=update_slider_if_value_greater_than_1, inputs=[percentage_duplicates, percentage_overlap], outputs=[percentage_overlap])
    number_of_frames.change(fn=update_dataframe_value, inputs=[number_of_frames], outputs=[number_of_people])

    create_df_btn.click(fn=funcs.create_fake_df, inputs=[number_of_people, number_of_frames, percentage_overlap, random_seed, not_common_appear_once, percentage_duplicates, noise_prob, missing_prob, add_additional_fields], outputs=[output_file, output_summary], api_name="faker")

block.queue(concurrency_count=1).launch(debug=True)
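# A minimal sketch of calling the "faker" endpoint programmatically, assuming the
# app is running locally on the default port and gradio_client is installed. The
# argument order mirrors the inputs wired to create_df_btn above; the exact
# payload format for the Dataframe input may differ between gradio versions, so
# treat the first argument as an assumption rather than a confirmed format.
#
# from gradio_client import Client
#
# client = Client("http://127.0.0.1:7860/")
# output_file, summary = client.predict(
#     [[1000, 1000]],  # number_of_people: one row, one column per frame
#     2,               # number_of_frames
#     0.5,             # percentage_overlap
#     42,              # random_seed
#     "Yes",           # not_common_appear_once
#     0,               # percentage_duplicates
#     0.05,            # noise_prob
#     0,               # missing_prob
#     ["First name", "Last name", "Date of birth", "Address"],
#     api_name="/faker",
# )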