# In [30]:
import pandas as pd
import random
from datetime import timedelta, datetime

## All information is randomly generated, contains no PII whatsoever

## This dataset simulates common duplicate scenarios:
## names that differ only by accents, legal ids that might contain a hyphen,
## or a spouse who signs up again with only some fields matching the other spouse

# Load the base records into a dataframe; the rest of the script reads its
# 'id', 'name' and 'date' columns
df_random = pd.read_csv('csv/05-random_names.csv')

# Made-up domains that get appended to the generated email local parts
domains = ['gmail.xyz', 'yahoo.123', 'gmx.bbb', 'hotmail.abc', 'outlook.456', '523344123.net']

def generate_random_email(name):
    """Build a random email local part (the text before '@') from a name.

    The name is lower-cased and split on whitespace. A one-word name is
    returned as is. Otherwise only the first and last words are used and
    one of three shapes is picked at random:

    * first initial + last word, e.g. "lstevens"
    * first word + last word + a two-digit number, e.g. "lilystevens48"
    * first three letters of each, joined by "_", e.g. "lil_ste"
    """
    tokens = name.lower().split()
    if len(tokens) == 1:
        return tokens[0]

    given, family = tokens[0], tokens[-1]
    # The number is drawn on every call, even when its shape is not picked
    two_digits = random.randint(10, 99)
    candidates = (
        given[0] + family,
        f"{given}{family}{two_digits}",
        given[:3] + "_" + family[:3],
    )
    return random.choice(candidates)

# Generate email1, email1_b and email2: each one is an independent random
# address derived from the row's name
def _random_address(name):
    # Local part from the name, then a randomly chosen domain
    return generate_random_email(name) + '@' + random.choice(domains)

# Column order matters: it fixes both the dataframe layout and the order in
# which random numbers are consumed
for email_column in ('email1', 'email1_b', 'email2'):
    df_random[email_column] = df_random['name'].apply(_random_address)

# Generate legal
def generate_legal():
    """Return a random legal id string in one of two formats.

    With equal probability the result is either one letter out of
    'Y', 'Z', 'X' followed by 7 digits, or a plain run of 10 digits.
    """
    def _digits(count):
        # `count` independent random decimal digits, concatenated
        return ''.join(str(random.randint(0, 9)) for _ in range(count))

    if not random.choice([True, False]):
        # Format: <10digits>
        return _digits(10)

    # Format: <letter><7digits>
    prefix = random.choice(['Y', 'Z', 'X'])
    return prefix + _digits(7)

# One freshly generated legal id per row (the row contents are not used)
df_random['legal'] = df_random.apply(lambda _row: generate_legal(), axis=1)

# Generate id_b sequentially, continuing right after the largest existing id
last_id = df_random['id'].max()
first_new_id = last_id + 1
df_random['id_b'] = range(first_new_id, first_new_id + len(df_random))

# Function to add accents and possibly a space to names
def add_accents(name):
    """Return a near-duplicate of ``name`` with accents and maybe an extra space.

    Two random perturbations are applied:

    * For names with more than one word, with 50% probability the name is
      rebuilt as ``<first word>`` + two spaces + the remaining words joined
      by single spaces. Rebuilding normalises any other whitespace in the
      name. Otherwise the name is left untouched.
    * One or two lowercase vowels (a, e, i, o, u), chosen at random, are
      replaced by an acute or grave accented form. Names without lowercase
      vowels get no accent.

    Args:
        name: The original name.

    Returns:
        The perturbed name as a new string.
    """
    accented_vowels = {
        'a': ['á', 'à'],
        'e': ['é', 'è'],
        'i': ['í', 'ì'],
        'o': ['ó', 'ò'],
        'u': ['ú', 'ù']
    }

    # Randomly widen the gap after the first name. This must happen before
    # the character list is built, otherwise the spacing is lost and the
    # vowel positions would not line up with the returned text.
    parts = name.split()
    if len(parts) > 1 and random.choice([True, False]):
        name = parts[0] + '  ' + ' '.join(parts[1:])  # Adds an extra space

    name_list = list(name)
    vowel_positions = [i for i, char in enumerate(name_list) if char in accented_vowels]

    # Ensure at least one vowel is accented if possible, two at most
    if vowel_positions:
        how_many = min(random.randint(1, 2), len(vowel_positions))
        for pos in random.sample(vowel_positions, how_many):
            name_list[pos] = random.choice(accented_vowels[name_list[pos]])

    return ''.join(name_list)

# Create name_b: a variant of each name produced by add_accents
df_random['name_b'] = df_random['name'].apply(add_accents)

# Generate date_b
def generate_earlier_date(date_str):
    """Move a 'dd/mm/YYYY' date back by a random 2 to 5 whole years.

    Day and month are kept. If the target year has no such day (29 February
    landing on a non-leap year), the day before is used instead. The result
    is returned in the same 'dd/mm/YYYY' format.
    """
    fmt = '%d/%m/%Y'
    parsed = datetime.strptime(date_str, fmt)
    target_year = parsed.year - random.randint(2, 5)
    try:
        shifted = parsed.replace(year=target_year)
    except ValueError:
        # The day does not exist in the target year (Feb 29): step back one day
        shifted = parsed.replace(year=target_year, day=parsed.day - 1)
    return shifted.strftime(fmt)
# Create date_b: each date moved 2 to 5 years earlier by generate_earlier_date
df_random['date_b'] = df_random['date'].apply(generate_earlier_date)

# Assign email2_b
def assign_email2(email1):
    """Return ``email1`` one time in five on average, otherwise an empty string."""
    # One True among five equally likely options: a 20% chance to copy
    keep_copy = random.choice([True] + [False] * 4)
    if keep_copy:
        return email1
    return ''

# Create email2_b: per row, either a copy of email1 or an empty string (decided by assign_email2)
df_random['email2_b'] = df_random['email1'].apply(assign_email2)

# Assign legal_b
def modify_or_generate_legal(legal):
    """Return either a brand-new legal id or a variant of ``legal``.

    With 50% probability a fresh id from generate_legal() is returned.
    Otherwise the existing id is reused: when it starts with a letter, a
    dash is inserted right after that letter; an id that does not start
    with a letter is returned unchanged.
    """
    if random.choice([True, False]):
        return generate_legal()

    if not legal[0].isalpha():
        return legal
    return f"{legal[0]}-{legal[1:]}"

# Create legal_b: per row, either a new legal id or the existing one
# (with a dash after a leading letter), as decided by modify_or_generate_legal
df_random['legal_b'] = df_random['legal'].apply(modify_or_generate_legal)

# Simulate sign-ups from the same family where a spouse is registered again by
# mistake, because a shared email or a slightly modified legal id went undetected.
# Note: despite its name, num_pairs counts people. It is 30% of the rows,
# rounded down to an even number so that everyone selected can be paired.
share_of_rows = len(df_random) * 0.3
num_pairs = int(share_of_rows // 2 * 2)

# Draw that many distinct row labels, then shuffle them once more
selected_indices = random.sample(list(df_random.index), num_pairs)
random.shuffle(selected_indices)

# Start with every row unpaired so earlier spouse data cannot conflict
df_random['spouse'] = ''

# Walk the selection two labels at a time; the members of pair n are tagged
# 'n-1' and 'n-2'. zip() drops a trailing label that has no partner.
pair_number = 1
for first_idx, second_idx in zip(selected_indices[::2], selected_indices[1::2]):
    df_random.at[first_idx, 'spouse'] = f'{pair_number}-1'
    df_random.at[second_idx, 'spouse'] = f'{pair_number}-2'
    pair_number += 1

def update_spouse_data(df):
    """Give the second member of each pair one field copied from the first.

    Pairs are consecutive entries of the module-level ``selected_indices``
    list. For each pair a coin flip decides whether the second row's
    'email2' becomes the first row's 'email1', or the second row's 'legal'
    becomes the first row's 'legal'. ``df`` is modified in place and nothing
    is returned.
    """
    # zip() over the even/odd slices yields (first, second) labels and skips
    # a trailing label that has no partner
    for first_idx, second_idx in zip(selected_indices[::2], selected_indices[1::2]):
        share_email = random.choice([True, False])
        if share_email:
            df.at[second_idx, 'email2'] = df.at[first_idx, 'email1']
        else:
            df.at[second_idx, 'legal'] = df.at[first_idx, 'legal']

# Copy the shared email / legal id between the paired rows
update_spouse_data(df_random)

# Keep only rows that belong to a pair and order them by pair number
has_spouse = df_random['spouse'].notna() & (df_random['spouse'] != '')
df_with_spouses = df_random[has_spouse].copy()
df_with_spouses['pair_number'] = df_with_spouses['spouse'].apply(
    lambda tag: int(tag.partition('-')[0])  # '12-1' -> 12
)
df_with_spouses_sorted = df_with_spouses.sort_values('pair_number')

# Bare expression kept from the notebook; it references the full dataframe,
# not the sorted pair view built above
df_random

# Rows that are not part of a spouse pair
is_unpaired = df_random['spouse'].isna() | (df_random['spouse'] == '')
unpaired_rows = df_random[is_unpaired]

base_columns = ['id', 'name', 'date', 'email1', 'email2', 'legal']
variant_columns = ['id_b', 'name_b', 'date_b', 'email1_b', 'email2_b', 'legal_b']

# Turn the '_b' columns of the unpaired rows into extra records that share
# the original column names
consolidated_data = unpaired_rows[variant_columns].copy()
consolidated_data.columns = base_columns

# Original records, without the '_b' helper columns
df_original = df_random[base_columns]

# Stack originals and generated duplicates, then sort by id
df_random_consolidated = (
    pd.concat([df_original, consolidated_data], ignore_index=True)
    .sort_values('id')
    .reset_index(drop=True)
)


############


# Shuffle all rows
df_randomized = df_random_consolidated.sample(frac=1).reset_index(drop=True)

# Select the first 5000 rows after randomization
df_first_5000 = df_randomized.head(5000)

# Save those rows to a CSV file; 'utf-8-sig' writes a BOM so Excel detects the encoding
df_first_5000.to_csv('clients_with_duplicates.csv', index=False, sep=',', encoding='utf-8-sig')