In [1]:
import math
import random
from itertools import combinations

import numpy as np

In [None]:
import pandas as pd

In [2]:
# Initialize variables
n_rows = 33
letters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
age_diff_distributions = {
    "fs": ("gaussian", 26.11, 11.24),
    "fd": ("gaussian", 27.88, 11.48),
    "ms": ("gaussian", 22.52, 9.88),
    "md": ("gaussian", 22.53, 10.02),
    "gfgs": ("gaussian", 38.24, 18.47),
    "gfgd": ("gaussian", 37.90, 18.31),
    "gmgs": ("gaussian", 32.78, 17.04),
    "gmgd": ("gaussian", 35.53, 17.63),
    "bb": ("power-law", 1, 1),
    "ss": ("power-law", 1, 1),
    "sibs": ("power-law", 1, 1),
}
gender_constraints = {
    "fs": ("male", "male"),
    "fd": ("male", "female"),
    "ms": ("female", "male"),
    "md": ("female", "female"),
    "gfgs": ("male", "male"),
    "gfgd": ("male", "female"),
    "gmgs": ("female", "male"),
    "gmgd": ("female", "female"),
    "bb": ("male", "male"),
    "ss": ("female", "female"),
    "sibs": ("either", "either"),
}

In [3]:
# Function to generate age difference
def generate_age_diff(distribution, *params):
    if distribution == "gaussian":
        mean, std_dev = params
        return max(0, np.random.normal(mean, std_dev))
    else:
        constant, exponent = params
        return max(0, np.random.pareto(exponent) + constant)

In [22]:
# Create df_final
df_final = pd.DataFrame(columns=["x1", "x2", "age_x1", "age_x2", "gender_x1", "gender_x2", "kinship_type", "age_diff"])

for i in range(n_rows):
    letter = random.choice(letters)
    x1 = f"{letter}{random.randint(1, 100)}"
    x2 = f"{letter}{random.randint(1, 100)}"
    kinship_type = random.choice(list(age_diff_distributions.keys()))
    gender_x1, gender_x2 = gender_constraints[kinship_type]
    if gender_x1 == "either":
        gender_x1 = random.choice(["male", "female"])
    if gender_x2 == "either":
        gender_x2 = random.choice(["male", "female"])
    distribution, *params = age_diff_distributions[kinship_type]
    age_diff = generate_age_diff(distribution, *params)
    age_x1 = random.randint(1, 100)
    age_x2 = max(0, int(age_x1 - age_diff)) if age_x1 >= age_diff else int(age_x1 + age_diff)
    if random.choice([True, False]):
        x1, x2 = x2, x1
        age_x1, age_x2 = age_x2, age_x1
        gender_x1, gender_x2 = gender_x2, gender_x1
    df_final.loc[i] = [x1, x2, age_x1, age_x2, gender_x1, gender_x2, kinship_type, math.ceil(age_diff)]

df_final

Unnamed: 0,x1,x2,age_x1,age_x2,gender_x1,gender_x2,kinship_type,age_diff
0,Y40,Y40,46,68,female,female,md,22
1,T75,T40,18,37,female,female,md,20
2,K59,K59,42,40,male,male,sibs,2
3,Y2,Y70,31,1,male,female,gmgs,31
4,K29,K49,35,48,female,female,md,13
5,D47,D62,80,15,female,male,gfgd,66
6,T61,T40,53,14,male,male,gfgs,39
7,L25,L20,39,6,male,male,fs,34
8,N48,N56,62,60,male,male,bb,2
9,C77,C76,53,23,female,female,md,31


In [23]:
# Create df_pairwise
kinship_mapping = {
    "fs": "father-son",
    "fd": "father-daughter",
    "ms": "mother-son",
    "md": "mother-daughter",
    "bb": "brother-brother",
    "ss": "sister-sister",
    "sibs": "siblings",
    "gfgs": "grandfather-grandson",
    "gfgd": "grandfather-granddaughter",
    "gmgs": "grandmother-grandson",
    "gmgd": "grandmother-granddaughter",
}
rows = []
for (idx1, row1), (idx2, row2) in combinations(df_final.iterrows(), 2):
    age_diff = abs(row1["age_x1"] - row2["age_x1"])
    if row1["kinship_type"] == "sibs":
        first_term = "siblings"
    else:
        first_term = kinship_mapping[row1["kinship_type"]].split("-")[0]
    if row2["kinship_type"] == "sibs":
        second_term = "siblings"
    else:
        second_term = kinship_mapping[row2["kinship_type"]].split("-")[-1]
    new_kinship_type = first_term + "-" + second_term
    rows.append(
        {
            "x1": row1["x1"],
            "x2": row2["x1"],
            "age_x1": row1["age_x1"],
            "age_x2": row2["age_x1"],
            "gender_x1": row1["gender_x1"],
            "gender_x2": row2["gender_x1"],
            "age_diff": age_diff,
            "kinship_type": new_kinship_type,
        }
    )
df_pairwise = pd.DataFrame(rows)

In [24]:
# Add is_kin and age_diff columns
df_final["is_kin"] = 1
df_pairwise["is_kin"] = 0
df_pairwise.rename(columns={"new_kinship_type": "kinship_type"}, inplace=True)

In [25]:
# Merge to create df_merged
df_merged = pd.concat([df_final, df_pairwise], ignore_index=True)

In [26]:
df_merged

Unnamed: 0,x1,x2,age_x1,age_x2,gender_x1,gender_x2,kinship_type,age_diff,is_kin
0,Y40,Y40,46,68,female,female,md,22,1
1,T75,T40,18,37,female,female,md,20,1
2,K59,K59,42,40,male,male,sibs,2,1
3,Y2,Y70,31,1,male,female,gmgs,31,1
4,K29,K49,35,48,female,female,md,13,1
...,...,...,...,...,...,...,...,...,...
556,X85,X53,10,74,female,female,sister-siblings,64,0
557,X85,N83,10,14,female,male,sister-granddaughter,4,0
558,A65,X53,31,74,female,female,father-siblings,43,0
559,A65,N83,31,14,female,male,father-granddaughter,17,0


In [27]:
df_merged.sample(n=10)

Unnamed: 0,x1,x2,age_x1,age_x2,gender_x1,gender_x2,kinship_type,age_diff,is_kin
540,F1,R70,13,28,female,male,grandmother-son,15,0
477,Z40,R70,46,28,female,male,mother-son,18,0
555,X85,A65,10,31,female,female,sister-daughter,21,0
327,N76,A65,46,31,male,female,siblings-daughter,15,0
38,Y40,T61,46,53,female,male,mother-grandson,7,0
127,Y2,D47,31,80,male,female,grandmother-granddaughter,49,0
167,K29,N90,35,28,female,female,mother-son,7,0
280,N48,K55,62,35,male,male,brother-siblings,27,0
332,D63,G94,15,85,female,male,grandmother-daughter,70,0
97,K59,K29,42,35,male,female,siblings-daughter,7,0


In [28]:
df_merged[df_merged.x1 == "F1"]

Unnamed: 0,x1,x2,age_x1,age_x2,gender_x1,gender_x2,kinship_type,age_diff,is_kin
26,F1,F19,13,1,female,female,gmgd,13,1
540,F1,R70,13,28,female,male,grandmother-son,15,0
541,F1,K55,13,35,female,male,grandmother-siblings,22,0
542,F1,X85,13,10,female,female,grandmother-sister,3,0
543,F1,A65,13,31,female,female,grandmother-daughter,18,0
544,F1,X53,13,74,female,female,grandmother-siblings,61,0
545,F1,N83,13,14,female,male,grandmother-granddaughter,1,0
