In [1]:
import numpy as np
import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt

# Creation of variables

### Sex
1 represents male; 0 represents female

In [2]:
# Define el tamaño del conjunto de datos
dataset_size = 10000

# Define el porcentaje real de hombres en la población
porcentaje_hombres_real = 0.505  # Por ejemplo, el 50.5% son hombres

# Calcula el número de hombres en el conjunto de datos sintético
num_hombres = int(dataset_size * porcentaje_hombres_real)

# Crea una columna "sexo" con la proporción correcta
hombres = np.ones(num_hombres)  # 1 representa "hombre"
mujeres = np.zeros(dataset_size - num_hombres)  # 0 representa "mujer"

# Combina las dos listas para obtener la columna completa de "sexo"
columna_sexo = np.concatenate([hombres, mujeres])

# Mezcla los valores para que no estén en orden
np.random.shuffle(columna_sexo)

# Crea un DataFrame con la columna "sexo" y cualquier otra información que desees agregar
data = pd.DataFrame({'Sex': columna_sexo})
data['Sex'] = data['Sex'].astype(int)

### Smoker
1 represents smoker; 0 represents non smoker

In [3]:
# Define el porcentaje de fumadores para hombres y mujeres
porcentaje_fumadores_hombres = 0.367
porcentaje_fumadores_mujeres = 0.078

data['Smoker'] = np.where(data['Sex'] == 1, np.random.choice([0, 1], size=dataset_size, p=[1 - porcentaje_fumadores_hombres, porcentaje_fumadores_hombres]),
                          np.random.choice([0, 1], size=dataset_size, p=[1 - porcentaje_fumadores_mujeres, porcentaje_fumadores_mujeres]))

### PCR
1 represents PCR; 0 not

In [4]:
# Define el porcentaje de PCR para hombres y mujeres
porcentaje_PCR_hombres = 0.024
porcentaje_PCR_mujeres = 0.029


data['Result PCR mycoplasma'] = np.where(data['Sex'] == 1, np.random.choice([0, 1], size=dataset_size, p=[1 - porcentaje_PCR_hombres, porcentaje_PCR_hombres]),
                          np.random.choice([0, 1], size=dataset_size, p=[1 - porcentaje_PCR_mujeres, porcentaje_PCR_mujeres]))

print(data.head())

   Sex  Smoker  Result PCR mycoplasma
0    1       1                      0
1    1       0                      0
2    0       0                      0
3    1       1                      0
4    0       0                      0


### Age

In [5]:
# Define el tamaño del conjunto de datos
tamaño_dataset = 10000

# Set the random seed for reproducibility
np.random.seed(42)

# Mean and variance for the Gaussian distribution
mean = 65
variance = 15

# Generate random values from a Gaussian distribution
gaussian_values = np.random.normal(loc=mean, scale=np.sqrt(variance), size=dataset_size)

# Add a positive offset to skew the distribution to the right
offset = 5
skewed_values = (gaussian_values + offset).astype(int)

data['Age'] = skewed_values

In [6]:
np.min(skewed_values)

54

### Militar Service
1 represents the individual went to the militar service; 0 the individual did not.

In [7]:
data['Militar service'] = np.where(data['Sex'] == 1, np.random.choice([1, 0], size=dataset_size, p=[0.6, 0.4]),  # sex=1
                            np.random.choice([1, 0], size=tamaño_dataset, p=[0.4, 0.6]))  # sex=0

print(data.head())

   Sex  Smoker  Result PCR mycoplasma  Age  Militar service
0    1       1                      0   71                1
1    1       0                      0   69                1
2    0       0                      0   72                0
3    1       1                      0   75                1
4    0       0                      0   69                1


### Genetic

In [8]:
porcentaje_C9orf72 = 4.1E-6
porcentaje_SOD1 = 7.5E-7
porcentaje_TARDBP = 2E-7
porcentaje_FUS = 1.5E-7

# Calculate the number of individuals with each genetic variant
num_C9orf72 = int(tamaño_dataset * porcentaje_C9orf72)
num_SOD1 = int(tamaño_dataset * porcentaje_SOD1)
num_TARDBP = int(tamaño_dataset * porcentaje_TARDBP)
num_FUS = int(tamaño_dataset * porcentaje_FUS)

# Create lists for each genetic variant
C9orf72 = np.ones(num_C9orf72)
SOD1 = np.ones(num_SOD1)
TARDBP = np.ones(num_TARDBP)
FUS = np.ones(num_FUS)

# Create lists for individuals without each genetic variant
no_C9orf72 = np.zeros(dataset_size - num_C9orf72)
no_SOD1 = np.zeros(dataset_size - num_SOD1)
no_TARDBP = np.zeros(dataset_size - num_TARDBP)
no_FUS = np.zeros(dataset_size - num_FUS)

# Combine the lists to obtain columns for each genetic variant
columna_C9orf72 = np.concatenate([C9orf72, no_C9orf72])
columna_SOD1 = np.concatenate([SOD1, no_SOD1])
columna_TARDBP = np.concatenate([TARDBP, no_TARDBP])
columna_FUS = np.concatenate([FUS, no_FUS])

np.random.shuffle(columna_C9orf72)
np.random.shuffle(columna_FUS)
np.random.shuffle(columna_TARDBP)
np.random.shuffle(columna_SOD1)

# Add columns for each gene to the 'data' DataFrame
data['C9orf72'] = columna_C9orf72.astype(int)
data['SOD1'] = columna_SOD1.astype(int)
data['TARDBP'] = columna_TARDBP.astype(int)
data['FUS'] = columna_FUS.astype(int)

In [9]:
print(data.head())

   Sex  Smoker  Result PCR mycoplasma  Age  Militar service  C9orf72  SOD1  \
0    1       1                      0   71                1        0     0   
1    1       0                      0   69                1        0     0   
2    0       0                      0   72                0        0     0   
3    1       1                      0   75                1        0     0   
4    0       0                      0   69                1        0     0   

   TARDBP  FUS  
0       0    0  
1       0    0  
2       0    0  
3       0    0  
4       0    0  


### Profession

In [10]:
professions = ['mechanic', 'painter', 'constructor','cashier', 'doctor', 'janitor','engineer', 'accountant', 'waiter', 'teacher', 'policeman', 'fireman', 'manager', 'electrician', 'lawyer']
professionsWithOR = ['mechanic', 'painter', 'constructor']

data['Professions'] = [random.choice(professions) for _ in range(dataset_size)]

data['Mechanic'] = data['Professions'].apply(lambda x: 1 if x == 'mechanic' else 0)
data['Painter'] = data['Professions'].apply(lambda x: 1 if x == 'painter' else 0)
data['Constructor'] = data['Professions'].apply(lambda x: 1 if x == 'constructor' else 0)

### Ethnity

In [11]:
ethnicities = ['caucasian', 'black', 'mestizo', 'indian', 'asian']
ethnicityWithOR = ['caucasian', 'black', 'mestizo', 'indian']

data['Ethnicity'] = [random.choice(ethnicities) for _ in range(dataset_size)]

data['Caucasian'] = data['Ethnicity'].apply(lambda x: 1 if x == 'caucasian' else 0)
data['Black'] = data['Ethnicity'].apply(lambda x: 1 if x == 'Black' else 0)
data['Mestizo'] = data['Ethnicity'].apply(lambda x: 1 if x == 'mestizo' else 0)
data['Indian'] = data['Ethnicity'].apply(lambda x: 1 if x == 'indian' else 0)

## Results

In [12]:
data.head()

Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,Mechanic,Painter,Constructor,Ethnicity,Caucasian,Black,Mestizo,Indian
0,1,1,0,71,1,0,0,0,0,fireman,0,0,0,mestizo,0,0,1,0
1,1,0,0,69,1,0,0,0,0,janitor,0,0,0,asian,0,0,0,0
2,0,0,0,72,0,0,0,0,0,waiter,0,0,0,mestizo,0,0,1,0
3,1,1,0,75,1,0,0,0,0,mechanic,1,0,0,caucasian,1,0,0,0
4,0,0,0,69,1,0,0,0,0,accountant,0,0,0,mestizo,0,0,1,0


# Probability column

In [13]:
import math
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data['Age standarized'] = scaler.fit_transform(data[['Age']])
data['Probability'] = 0.138895 * data['Sex'] + 0.0198026 * data['Age standarized'] + 0.22314 * data['Smoker'] + 0.83724 * data['Militar service'] + 2.10010 * data['Result PCR mycoplasma'] + 0.7178398 * data['Mechanic'] + 0.86288995 * data['Painter'] + 0.77473 * data['Constructor'] + 1.071584 * data['Caucasian'] - 3.21887 * data['Black'] - 2.9957 * data['Mestizo'] - 3.91202 * data['Indian'] + 1.28 * data['C9orf72'] + 2.036012 * data['SOD1'] + 1.652497 * data['TARDBP'] + 1.86097 * data['FUS']

In [14]:
data['Probability'] = data['Probability'].apply(lambda x: math.exp(x))
data.head()

Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,Mechanic,Painter,Constructor,Ethnicity,Caucasian,Black,Mestizo,Indian,Age standarized,Probability
0,1,1,0,71,1,0,0,0,0,fireman,0,0,0,mestizo,0,0,1,0,0.548387,0.167702
1,1,0,0,69,1,0,0,0,0,janitor,0,0,0,asian,0,0,0,0,0.483871,2.679732
2,0,0,0,72,0,0,0,0,0,waiter,0,0,0,mestizo,0,0,1,0,0.580645,0.05058
3,1,1,0,75,1,0,0,0,0,mechanic,1,0,0,caucasian,1,0,0,0,0.677419,20.128032
4,0,0,0,69,1,0,0,0,0,accountant,0,0,0,mestizo,0,0,1,0,0.483871,0.116615


In [15]:
data['Probability'] = data['Probability'].apply(lambda x: (x/(1+x)))
data.head()

Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,Mechanic,Painter,Constructor,Ethnicity,Caucasian,Black,Mestizo,Indian,Age standarized,Probability
0,1,1,0,71,1,0,0,0,0,fireman,0,0,0,mestizo,0,0,1,0,0.548387,0.143617
1,1,0,0,69,1,0,0,0,0,janitor,0,0,0,asian,0,0,0,0,0.483871,0.728241
2,0,0,0,72,0,0,0,0,0,waiter,0,0,0,mestizo,0,0,1,0,0.580645,0.048145
3,1,1,0,75,1,0,0,0,0,mechanic,1,0,0,caucasian,1,0,0,0,0.677419,0.95267
4,0,0,0,69,1,0,0,0,0,accountant,0,0,0,mestizo,0,0,1,0,0.483871,0.104436


In [16]:
prevalence = (data['Probability'] > 0.6).sum() / len(data)
print(prevalence)

0.4428


In [17]:
data = data.drop(['Age standarized', 'Mechanic', 'Painter', 'Constructor', 'Caucasian', 'Black', 'Mestizo', 'Indian'], axis=1)

Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,Ethnicity,Probability
0,1,1,0,71,1,0,0,0,0,fireman,mestizo,0.143617
1,1,0,0,69,1,0,0,0,0,janitor,asian,0.728241
2,0,0,0,72,0,0,0,0,0,waiter,mestizo,0.048145
3,1,1,0,75,1,0,0,0,0,mechanic,caucasian,0.952670
4,0,0,0,69,1,0,0,0,0,accountant,mestizo,0.104436
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,75,1,0,0,0,0,policeman,asian,0.700704
9996,1,1,0,62,0,0,0,0,0,accountant,black,0.590769
9997,0,0,0,67,0,0,0,0,0,mechanic,mestizo,0.093676
9998,1,0,0,71,0,0,0,0,0,manager,caucasian,0.772299


# Additional variables

### Body Mass Index

In [18]:
def generate_bmi(row):
    age = row['Age']
    sex = row['Sex']

    if 17 <= age <= 49:
        if sex == 1:  # Men
            return np.random.uniform(low=23, high=30.5)
        elif sex == 0:  # Women
            return np.random.uniform(low=22, high=28.5)
    elif 50 <= age <= 69:
        if sex == 1:  # Men
            return np.random.uniform(low=24, high=30.75)
        elif sex == 0:  # Women
            return np.random.uniform(low=22, high=29.5)
    elif 70 <= age <= 79:
        if sex == 1:  # Men
            return np.random.uniform(low=23.75, high=30.25)
        elif sex == 0:  # Women
            return np.random.uniform(low=23, high=30)
    elif 80 <= age <= 98:
        if sex == 1:  # Men
            return np.random.uniform(low=23, high=28)
        elif sex == 0:  # Women
            return np.random.uniform(low=21.75, high=27.25)

# Create 'BMI' column based on the generated values
data['BMI'] = data.apply(generate_bmi, axis=1)
data.head()

Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,...,Painter,Constructor,Ethnicity,Caucasian,Black,Mestizo,Indian,Age standarized,Probability,BMI
0,1,1,0,71,1,0,0,0,0,fireman,...,0,0,mestizo,0,0,1,0,0.548387,0.143617,26.425894
1,1,0,0,69,1,0,0,0,0,janitor,...,0,0,asian,0,0,0,0,0.483871,0.728241,26.228107
2,0,0,0,72,0,0,0,0,0,waiter,...,0,0,mestizo,0,0,1,0,0.580645,0.048145,23.091505
3,1,1,0,75,1,0,0,0,0,mechanic,...,0,0,caucasian,1,0,0,0,0.677419,0.95267,27.553869
4,0,0,0,69,1,0,0,0,0,accountant,...,0,0,mestizo,0,0,1,0,0.483871,0.104436,24.485666


### Physical Activity

In [19]:
# Define the physical activity rates for men and women
physical_activity_rate_men = 1 / 4
physical_activity_rate_women = 1 / 3

# Create a new column 'Adequate Physical Activity' based on the physical activity rates
data['Adequate Physical Activity'] = np.where(data['Sex'] == 1, np.random.choice([0, 1], size=dataset_size, p=[1 - physical_activity_rate_men, physical_activity_rate_men]),
                                              np.random.choice([0, 1], size=dataset_size, p=[1 - physical_activity_rate_women, physical_activity_rate_women]))

data.head()


Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,...,Constructor,Ethnicity,Caucasian,Black,Mestizo,Indian,Age standarized,Probability,BMI,Adequate Physical Activity
0,1,1,0,71,1,0,0,0,0,fireman,...,0,mestizo,0,0,1,0,0.548387,0.143617,26.425894,0
1,1,0,0,69,1,0,0,0,0,janitor,...,0,asian,0,0,0,0,0.483871,0.728241,26.228107,0
2,0,0,0,72,0,0,0,0,0,waiter,...,0,mestizo,0,0,1,0,0.580645,0.048145,23.091505,0
3,1,1,0,75,1,0,0,0,0,mechanic,...,0,caucasian,1,0,0,0,0.677419,0.95267,27.553869,0
4,0,0,0,69,1,0,0,0,0,accountant,...,0,mestizo,0,0,1,0,0.483871,0.104436,24.485666,1


### Excessive alcohol consumption

In [20]:
# Define excessive alcohol consumption rates by age and sex
alcohol_rates = {
    '45-54': 0.082,
    '55-64': 0.118,
    '65-74': 0.156,
    '75+': 0.16
}

alcohol_rate_female = 0.041
alcohol_rate_male = 0.13

# Create a new column 'Excessive Alcohol Consumption' based on age and sex-specific rates
data['Excessive Alcohol Consumption'] = np.where(
    ((data['Age'] >= 45) & (data['Age'] <= 54) & (data['Sex'] == 1) & (np.random.rand(dataset_size) < alcohol_rates['45-54'])) |
    ((data['Age'] >= 55) & (data['Age'] <= 64) & (data['Sex'] == 1) & (np.random.rand(dataset_size) < alcohol_rates['55-64'])) |
    ((data['Age'] >= 65) & (data['Age'] <= 74) & (data['Sex'] == 1) & (np.random.rand(dataset_size) < alcohol_rates['65-74'])) |
    ((data['Age'] >= 75) & (data['Sex'] == 1) & (np.random.rand(dataset_size) < alcohol_rates['75+'])) |
    ((data['Sex'] == 0) & (np.random.rand(dataset_size) < alcohol_rate_female)) |
    ((data['Sex'] == 1) & (np.random.rand(dataset_size) < alcohol_rate_male)),
    1, 0 # If the conditions are met, assign '1' to the 'Excessive Alcohol Consumption' column; otherwise, assigns '0'.
)

data.head()



Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,...,Ethnicity,Caucasian,Black,Mestizo,Indian,Age standarized,Probability,BMI,Adequate Physical Activity,Excessive Alcohol Consumption
0,1,1,0,71,1,0,0,0,0,fireman,...,mestizo,0,0,1,0,0.548387,0.143617,26.425894,0,0
1,1,0,0,69,1,0,0,0,0,janitor,...,asian,0,0,0,0,0.483871,0.728241,26.228107,0,1
2,0,0,0,72,0,0,0,0,0,waiter,...,mestizo,0,0,1,0,0.580645,0.048145,23.091505,0,0
3,1,1,0,75,1,0,0,0,0,mechanic,...,caucasian,1,0,0,0,0.677419,0.95267,27.553869,0,0
4,0,0,0,69,1,0,0,0,0,accountant,...,mestizo,0,0,1,0,0.483871,0.104436,24.485666,1,0


### Visited tropical countries prior

In [21]:
tropical_visit_rates = {
    'caucasian' : 0.237,
    'black' : 0.162,
    'mestizo' : 0.561,
    'indian' : 0.793,
    'asian' : 0.417
}

data['Visited tropical countries'] = np.where(
    ((data['Ethnicity'] == 'caucasian') & (np.random.rand(dataset_size) < tropical_visit_rates['caucasian'])) |
    ((data['Ethnicity'] == 'black') & (np.random.rand(dataset_size) < tropical_visit_rates['black'])) |
    ((data['Ethnicity'] == 'mestizo') & (np.random.rand(dataset_size) < tropical_visit_rates['mestizo'])) |
    ((data['Ethnicity'] == 'indian') & (np.random.rand(dataset_size) < tropical_visit_rates['indian'])) |
    ((data['Ethnicity'] == 'asian') & (np.random.rand(dataset_size) < tropical_visit_rates['asian'])),
    1, 0 # If the conditions are met, assign '1' to the 'Excessive Alcohol Consumption' column; otherwise, assigns '0'.
)

data.head()

Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,...,Caucasian,Black,Mestizo,Indian,Age standarized,Probability,BMI,Adequate Physical Activity,Excessive Alcohol Consumption,Visited tropical countries
0,1,1,0,71,1,0,0,0,0,fireman,...,0,0,1,0,0.548387,0.143617,26.425894,0,0,1
1,1,0,0,69,1,0,0,0,0,janitor,...,0,0,0,0,0.483871,0.728241,26.228107,0,1,0
2,0,0,0,72,0,0,0,0,0,waiter,...,0,0,1,0,0.580645,0.048145,23.091505,0,0,0
3,1,1,0,75,1,0,0,0,0,mechanic,...,1,0,0,0,0.677419,0.95267,27.553869,0,0,0
4,0,0,0,69,1,0,0,0,0,accountant,...,0,0,1,0,0.483871,0.104436,24.485666,1,0,1


### Contact sports

In [22]:
sports_rate_female = 0.046
sports_rate_male = 0.11

# Create a new column 'Excessive Alcohol Consumption' based on age and sex-specific rates
data['Contact sports'] = np.where(
    ((data['Sex'] == 0) & (np.random.rand(dataset_size) < sports_rate_female)) |
    ((data['Sex'] == 1) & (np.random.rand(dataset_size) < sports_rate_male)),
    1, 0 # If the conditions are met, assign '1' to the 'Excessive Alcohol Consumption' column; otherwise, assigns '0'.
)

data.head()

Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,...,Black,Mestizo,Indian,Age standarized,Probability,BMI,Adequate Physical Activity,Excessive Alcohol Consumption,Visited tropical countries,Contact sports
0,1,1,0,71,1,0,0,0,0,fireman,...,0,1,0,0.548387,0.143617,26.425894,0,0,1,0
1,1,0,0,69,1,0,0,0,0,janitor,...,0,0,0,0.483871,0.728241,26.228107,0,1,0,1
2,0,0,0,72,0,0,0,0,0,waiter,...,0,1,0,0.580645,0.048145,23.091505,0,0,0,0
3,1,1,0,75,1,0,0,0,0,mechanic,...,0,0,0,0.677419,0.95267,27.553869,0,0,0,0
4,0,0,0,69,1,0,0,0,0,accountant,...,0,1,0,0.483871,0.104436,24.485666,1,0,1,0


### Sleep less than 6 hours on average

In [23]:
sleep_rates = {
    'caucasian' : 0.492,
    'black' : 0.417,
    'mestizo' : 0.272,
    'indian' : 0.392,
    'asian' : 0.628
}

data['Sleep less than 6h'] = np.where(
    ((data['Ethnicity'] == 'caucasian') & (np.random.rand(dataset_size) < sleep_rates['caucasian'])) |
    ((data['Ethnicity'] == 'black') & (np.random.rand(dataset_size) < sleep_rates['black'])) |
    ((data['Ethnicity'] == 'mestizo') & (np.random.rand(dataset_size) < sleep_rates['mestizo'])) |
    ((data['Ethnicity'] == 'indian') & (np.random.rand(dataset_size) < sleep_rates['indian'])) |
    ((data['Ethnicity'] == 'asian') & (np.random.rand(dataset_size) < sleep_rates['asian'])),
    1, 0 # If the conditions are met, assign '1' to the 'Excessive Alcohol Consumption' column; otherwise, assigns '0'.
)

data.head()

Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,...,Mestizo,Indian,Age standarized,Probability,BMI,Adequate Physical Activity,Excessive Alcohol Consumption,Visited tropical countries,Contact sports,Sleep less than 6h
0,1,1,0,71,1,0,0,0,0,fireman,...,1,0,0.548387,0.143617,26.425894,0,0,1,0,0
1,1,0,0,69,1,0,0,0,0,janitor,...,0,0,0.483871,0.728241,26.228107,0,1,0,1,1
2,0,0,0,72,0,0,0,0,0,waiter,...,1,0,0.580645,0.048145,23.091505,0,0,0,0,1
3,1,1,0,75,1,0,0,0,0,mechanic,...,0,0,0.677419,0.95267,27.553869,0,0,0,0,0
4,0,0,0,69,1,0,0,0,0,accountant,...,1,0,0.483871,0.104436,24.485666,1,0,1,0,1


### Cholesterol levels

In [24]:
data['Cholesterol Levels'] = np.random.normal(loc=180, scale=30, size=len(data))
data.head()

Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,...,Indian,Age standarized,Probability,BMI,Adequate Physical Activity,Excessive Alcohol Consumption,Visited tropical countries,Contact sports,Sleep less than 6h,Cholesterol Levels
0,1,1,0,71,1,0,0,0,0,fireman,...,0,0.548387,0.143617,26.425894,0,0,1,0,0,172.821398
1,1,0,0,69,1,0,0,0,0,janitor,...,0,0.483871,0.728241,26.228107,0,1,0,1,1,150.933073
2,0,0,0,72,0,0,0,0,0,waiter,...,0,0.580645,0.048145,23.091505,0,0,0,0,1,152.290039
3,1,1,0,75,1,0,0,0,0,mechanic,...,0,0.677419,0.95267,27.553869,0,0,0,0,0,135.949019
4,0,0,0,69,1,0,0,0,0,accountant,...,0,0.483871,0.104436,24.485666,1,0,1,0,1,198.204362


### Hypertension

In [25]:
hypertension_rates = {
    'low_cholesterol': 0.1,
    'high_cholesterol': 0.8
}

data['Hypertension'] = np.where(
    data['Cholesterol Levels'] < 200,  # You can adjust this threshold based on your dataset
    np.random.rand(len(data)) < hypertension_rates['low_cholesterol'],
    np.random.rand(len(data)) < hypertension_rates['high_cholesterol']
)

# Display the DataFrame
print(data.head())

data.head()


   Sex  Smoker  Result PCR mycoplasma  Age  Militar service  C9orf72  SOD1  \
0    1       1                      0   71                1        0     0   
1    1       0                      0   69                1        0     0   
2    0       0                      0   72                0        0     0   
3    1       1                      0   75                1        0     0   
4    0       0                      0   69                1        0     0   

   TARDBP  FUS Professions  ...  Age standarized  Probability        BMI  \
0       0    0     fireman  ...         0.548387     0.143617  26.425894   
1       0    0     janitor  ...         0.483871     0.728241  26.228107   
2       0    0      waiter  ...         0.580645     0.048145  23.091505   
3       0    0    mechanic  ...         0.677419     0.952670  27.553869   
4       0    0  accountant  ...         0.483871     0.104436  24.485666   

  Adequate Physical Activity  Excessive Alcohol Consumption  \
0          

Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,...,Age standarized,Probability,BMI,Adequate Physical Activity,Excessive Alcohol Consumption,Visited tropical countries,Contact sports,Sleep less than 6h,Cholesterol Levels,Hypertension
0,1,1,0,71,1,0,0,0,0,fireman,...,0.548387,0.143617,26.425894,0,0,1,0,0,172.821398,False
1,1,0,0,69,1,0,0,0,0,janitor,...,0.483871,0.728241,26.228107,0,1,0,1,1,150.933073,False
2,0,0,0,72,0,0,0,0,0,waiter,...,0.580645,0.048145,23.091505,0,0,0,0,1,152.290039,False
3,1,1,0,75,1,0,0,0,0,mechanic,...,0.677419,0.95267,27.553869,0,0,0,0,0,135.949019,False
4,0,0,0,69,1,0,0,0,0,accountant,...,0.483871,0.104436,24.485666,1,0,1,0,1,198.204362,False


### Immune Depression

In [26]:
immune_depression_rates = {
    '18-30': 0.05,
    '31-45': 0.1,
    '46-60': 0.15,
    '61+': 0.2
}

data['Immune Depression'] = np.where(
    (data['Age'] >= 18) & (data['Age'] <= 30) & (np.random.rand(len(data)) < immune_depression_rates['18-30']) |
    (data['Age'] >= 31) & (data['Age'] <= 45) & (np.random.rand(len(data)) < immune_depression_rates['31-45']) |
    (data['Age'] >= 46) & (data['Age'] <= 60) & (np.random.rand(len(data)) < immune_depression_rates['46-60']) |
    (data['Age'] >= 61) & (np.random.rand(len(data)) < immune_depression_rates['61+']),
    1, 0  # If the conditions are met, assign '1' to the 'Immune Depression' column; otherwise, assign '0'.
)

data.head()

Unnamed: 0,Sex,Smoker,Result PCR mycoplasma,Age,Militar service,C9orf72,SOD1,TARDBP,FUS,Professions,...,Probability,BMI,Adequate Physical Activity,Excessive Alcohol Consumption,Visited tropical countries,Contact sports,Sleep less than 6h,Cholesterol Levels,Hypertension,Immune Depression
0,1,1,0,71,1,0,0,0,0,fireman,...,0.143617,26.425894,0,0,1,0,0,172.821398,False,0
1,1,0,0,69,1,0,0,0,0,janitor,...,0.728241,26.228107,0,1,0,1,1,150.933073,False,0
2,0,0,0,72,0,0,0,0,0,waiter,...,0.048145,23.091505,0,0,0,0,1,152.290039,False,0
3,1,1,0,75,1,0,0,0,0,mechanic,...,0.95267,27.553869,0,0,0,0,0,135.949019,False,0
4,0,0,0,69,1,0,0,0,0,accountant,...,0.104436,24.485666,1,0,1,0,1,198.204362,False,0


**Randomly organize the columns**


In [28]:
shuffled_data = data.sample(frac=1, axis=1, random_state=42)  # Setting a random_state for reproducibility
shuffled_data.head()

Unnamed: 0,Hypertension,Mestizo,Constructor,Excessive Alcohol Consumption,FUS,Professions,Adequate Physical Activity,Sex,Cholesterol Levels,Ethnicity,...,Visited tropical countries,Sleep less than 6h,Age standarized,Immune Depression,BMI,TARDBP,Mechanic,Caucasian,Probability,SOD1
0,False,1,0,0,0,fireman,0,1,172.821398,mestizo,...,1,0,0.548387,0,26.425894,0,0,0,0.143617,0
1,False,0,0,1,0,janitor,0,1,150.933073,asian,...,0,1,0.483871,0,26.228107,0,0,0,0.728241,0
2,False,1,0,0,0,waiter,0,0,152.290039,mestizo,...,0,1,0.580645,0,23.091505,0,0,0,0.048145,0
3,False,0,0,0,0,mechanic,0,1,135.949019,caucasian,...,0,0,0.677419,0,27.553869,0,1,1,0.95267,0
4,False,1,0,0,0,accountant,1,0,198.204362,mestizo,...,1,1,0.483871,0,24.485666,0,0,0,0.104436,0


In [29]:
# Save dataset
shuffled_data.to_excel('dataset.xlsx', index=False)
