In [1]:
import math
import random
from itertools import combinations

import numpy as np

In [None]:
import pandas as pd

In [2]:
# Initialize variables
n_rows = 33
letters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
age_diff_distributions = {
    "fs": ("gaussian", 26.11, 11.24),
    "fd": ("gaussian", 27.88, 11.48),
    "ms": ("gaussian", 22.52, 9.88),
    "md": ("gaussian", 22.53, 10.02),
    "gfgs": ("gaussian", 38.24, 18.47),
    "gfgd": ("gaussian", 37.90, 18.31),
    "gmgs": ("gaussian", 32.78, 17.04),
    "gmgd": ("gaussian", 35.53, 17.63),
    "bb": ("power-law", 1, 1),
    "ss": ("power-law", 1, 1),
    "sibs": ("power-law", 1, 1),
}
gender_constraints = {
    "fs": ("male", "male"),
    "fd": ("male", "female"),
    "ms": ("female", "male"),
    "md": ("female", "female"),
    "gfgs": ("male", "male"),
    "gfgd": ("male", "female"),
    "gmgs": ("female", "male"),
    "gmgd": ("female", "female"),
    "bb": ("male", "male"),
    "ss": ("female", "female"),
    "sibs": ("either", "either"),
}

In [3]:
# Function to generate age difference
def generate_age_diff(distribution, *params):
    if distribution == "gaussian":
        mean, std_dev = params
        return max(0, np.random.normal(mean, std_dev))
    else:
        constant, exponent = params
        return max(0, np.random.pareto(exponent) + constant)

In [32]:
# Create df_final
df_final = pd.DataFrame(columns=["x1", "x2", "age_x1", "age_x2", "gender_x1", "gender_x2", "kinship_type", "age_diff"])

for i in range(n_rows):
    letter = random.choice(letters)
    x1 = f"{letter}{random.randint(1, 100)}"
    x2 = f"{letter}{random.randint(1, 100)}"
    kinship_type = random.choice(list(age_diff_distributions.keys()))
    gender_x1, gender_x2 = gender_constraints[kinship_type]
    if kinship_type == 'sibs':
        gender_x1 = random.choice(["male", "female"])
        gender_x2 = "male" if gender_x1 == "female" else "female"
    distribution, *params = age_diff_distributions[kinship_type]
    age_diff = generate_age_diff(distribution, *params)
    age_x1 = random.randint(1, 100)
    age_x2 = max(0, int(age_x1 - age_diff)) if age_x1 >= age_diff else int(age_x1 + age_diff)
    if random.choice([True, False]):
        x1, x2 = x2, x1
        age_x1, age_x2 = age_x2, age_x1
        gender_x1, gender_x2 = gender_x2, gender_x1
    df_final.loc[i] = [x1, x2, age_x1, age_x2, gender_x1, gender_x2, kinship_type, math.ceil(age_diff)]

df_final

Unnamed: 0,x1,x2,age_x1,age_x2,gender_x1,gender_x2,kinship_type,age_diff
0,X86,X91,46,102,female,female,gmgd,57
1,B38,B46,83,29,male,female,gfgd,54
2,X93,X40,93,62,female,male,ms,31
3,Z97,Z13,38,7,male,male,gfgs,32
4,Z28,Z55,26,82,female,male,gmgs,57
5,B59,B89,24,59,male,female,ms,35
6,T5,T63,50,95,male,male,gfgs,45
7,M84,M10,40,58,male,male,fs,18
8,Q11,Q73,81,88,male,female,sibs,7
9,L51,L2,65,99,male,male,fs,34


In [33]:
# Create df_pairwise
kinship_mapping = {
    "fs": "father-son",
    "fd": "father-daughter",
    "ms": "mother-son",
    "md": "mother-daughter",
    "bb": "brother-brother",
    "ss": "sister-sister",
    "sibs": "siblings",
    "gfgs": "grandfather-grandson",
    "gfgd": "grandfather-granddaughter",
    "gmgs": "grandmother-grandson",
    "gmgd": "grandmother-granddaughter",
}
rows = []
for (idx1, row1), (idx2, row2) in combinations(df_final.iterrows(), 2):
    age_diff = abs(row1["age_x1"] - row2["age_x1"])
    if row1["kinship_type"] == "sibs":
        first_term = "siblings"
    else:
        first_term = kinship_mapping[row1["kinship_type"]].split("-")[0]
    if row2["kinship_type"] == "sibs":
        second_term = "siblings"
    else:
        second_term = kinship_mapping[row2["kinship_type"]].split("-")[-1]
    new_kinship_type = first_term + "-" + second_term
    rows.append(
        {
            "x1": row1["x1"],
            "x2": row2["x1"],
            "age_x1": row1["age_x1"],
            "age_x2": row2["age_x1"],
            "gender_x1": row1["gender_x1"],
            "gender_x2": row2["gender_x1"],
            "age_diff": age_diff,
            "kinship_type": new_kinship_type,
        }
    )
df_pairwise = pd.DataFrame(rows)

In [34]:
# Add is_kin and age_diff columns
df_final["is_kin"] = 1
df_pairwise["is_kin"] = 0
df_pairwise.rename(columns={"new_kinship_type": "kinship_type"}, inplace=True)

In [35]:
# Merge to create df_merged
df_merged = pd.concat([df_final, df_pairwise], ignore_index=True)

In [42]:
df_merged.kinship_type.sort_values().unique()

array(['bb', 'brother-granddaughter', 'brother-siblings',
       'father-brother', 'father-daughter', 'father-granddaughter',
       'father-grandson', 'father-siblings', 'father-sister',
       'father-son', 'fd', 'fs', 'gfgd', 'gfgs', 'gmgd', 'gmgs',
       'grandfather-brother', 'grandfather-daughter',
       'grandfather-granddaughter', 'grandfather-grandson',
       'grandfather-siblings', 'grandfather-sister', 'grandfather-son',
       'grandmother-brother', 'grandmother-daughter',
       'grandmother-granddaughter', 'grandmother-grandson',
       'grandmother-siblings', 'grandmother-sister', 'grandmother-son',
       'md', 'mother-brother', 'mother-daughter', 'mother-granddaughter',
       'mother-grandson', 'mother-siblings', 'mother-sister',
       'mother-son', 'ms', 'siblings-brother', 'siblings-daughter',
       'siblings-granddaughter', 'siblings-grandson', 'siblings-siblings',
       'siblings-sister', 'siblings-son', 'sibs', 'sister-brother',
       'sister-daughter', 's