In [12]:
import random
import pandas as pd
import numpy as np

# Number of synthetic records to generate.
N_SAMPLES = 2_000

# Categorical vocabularies the generator samples from.
pain_types = [
    "Sharp",
    "Dull",
    "Burning",
    "Throbbing",
]
# Pain locations are currently disabled; kept here for reference.
# pain_locations = ["Head", "Back", "Stomach", "Chest", "Arm", "Leg", "Neck", "Everywhere"]
radiate = [
    "Yes",
    "No",
]
# Onset labels, ordered from most recent to oldest.
durations = [
    "Today",
    "Yesterday",
    "Several days ago",
    "Last week",
    "More than a week ago",
]

def severity_label(score):
    """Translate a numeric severity score into its class name.

    Scores of 2 or less map to "Mild", exactly 3 to "Moderate", exactly 4
    to "Severe", and every other value to "Very Severe".
    """
    if score <= 2:
        return "Mild"
    # Only the exact values 3 and 4 have their own label; anything else
    # above 2 falls through to the top class.
    named_levels = {3: "Moderate", 4: "Severe"}
    return named_levels.get(score, "Very Severe")

def choose_severity_int_vote(pain_type, radiates, duration, self_score, sleep_score, activity_score, mood_score):
    """Pick a 1-5 severity score by tallying weighted votes from each input.

    Every severity level starts with one vote. Each factor then adds votes
    to the level(s) it points towards, and the level holding the most votes
    wins. When several levels tie, the lowest of them is returned.

    Args:
        pain_type: Pain descriptor. "Burning", "Sharp", "Dull" and
            "Throbbing" add votes; any other value adds none.
        radiates: "Yes" adds votes to levels 4 and 5; any other value adds
            none.
        duration: Onset label. "Today", "Yesterday", "Several days ago",
            "Last week" and "More than a week ago" add votes; any other
            value adds none.
        self_score: Self-reported severity, an integer from 1 to 5. This is
            the most heavily weighted input.
        sleep_score: Functional score where lower means worse (presumably on
            a 1-5 scale; not validated here).
        activity_score: Same convention as ``sleep_score``.
        mood_score: Same convention as ``sleep_score``.

    Returns:
        The winning severity level as a plain ``int`` between 1 and 5.

    Raises:
        ValueError: If ``self_score`` is outside 1-5. Without this check a
            value of 0 or below would wrap around to a negative list index
            and silently cast votes for the wrong level.
    """
    if not 1 <= self_score <= 5:
        raise ValueError(f"self_score must be between 1 and 5, got {self_score!r}")

    # One baseline vote per level; index i holds the votes for severity i + 1.
    weights = [1, 1, 1, 1, 1]

    # 1. Self-reported score: strong influence on its own level, plus a push
    #    towards the matching end of the scale for extreme answers.
    weights[self_score - 1] += 6
    if self_score >= 4:
        weights[3] += 2
        weights[4] += 3
    elif self_score <= 2:
        weights[0] += 3
        weights[1] += 2

    # 2. Pain type -> (level index, votes).
    pain_type_votes = {
        "Burning": (4, 4),
        "Sharp": (3, 3),
        "Dull": (2, 3),
        "Throbbing": (2, 2),
    }
    if pain_type in pain_type_votes:
        level, votes = pain_type_votes[pain_type]
        weights[level] += votes

    # 3. Duration -> (level index, votes); older onsets vote for higher levels.
    duration_votes = {
        "More than a week ago": (4, 3),
        "Last week": (3, 2),
        "Several days ago": (2, 2),
        "Yesterday": (1, 1),
        "Today": (0, 1),
    }
    if duration in duration_votes:
        level, votes = duration_votes[duration]
        weights[level] += votes

    # 4. Radiating pain
    if radiates == "Yes":
        weights[4] += 2
        weights[3] += 1

    # 5. Functional impact (the lower the score, the worse the condition)
    functional_impact = (6 - sleep_score) + (6 - activity_score) + (6 - mood_score)
    if functional_impact >= 12:
        weights[4] += 3
    elif functional_impact >= 8:
        weights[3] += 2
    elif functional_impact >= 5:
        weights[2] += 1
    else:
        weights[0] += 1

    # Final severity is the first index with the most votes, plus one.
    # list.index returns the first match, so ties resolve to the lowest level.
    return weights.index(max(weights)) + 1

def generate_correlated_scores(severity_base):
    """Draw sleep, activity and mood scores that track the given severity.

    Higher severities sample from the low end of the score range and lower
    severities from the high end. Any ``severity_base`` other than 2, 3, 4
    or 5 uses the mildest profile.

    Returns:
        A ``(sleep_score, activity_score, mood_score)`` tuple.
    """
    # Each entry is (candidate scores, sampling weights).
    moderate = ([2, 3, 4], [0.3, 0.5, 0.2])
    mild = ([3, 4, 5], [0.4, 0.4, 0.2])
    mildest = ([4, 5], [0.3, 0.7])

    # Per-severity specs in the order: sleep, activity, mood.
    profiles = {
        5: (
            ([1, 2], [0.8, 0.2]),
            ([1, 2], [0.7, 0.3]),
            ([1, 2], [0.7, 0.3]),
        ),
        4: (
            ([1, 2, 3], [0.5, 0.4, 0.1]),
            ([1, 2, 3], [0.4, 0.5, 0.1]),
            ([1, 2, 3], [0.4, 0.4, 0.2]),
        ),
        3: (moderate, moderate, moderate),
        2: (mild, mild, mild),
    }
    specs = profiles.get(severity_base, (mildest, mildest, mildest))

    # Draw in a fixed order (sleep, activity, mood) so the sequence of
    # random numbers consumed stays the same.
    return tuple(
        random.choices(candidates, weights=odds)[0]
        for candidates, odds in specs
    )

def generate_structured_data(n_samples=2000):
    """Generate synthetic pain-assessment records with a fixed severity mix.

    Records are produced level by level (severity 1 through 5). For each
    record the categorical answers and the self score are sampled from a
    profile tied to the target severity, the functional scores come from
    ``generate_correlated_scores``, and ``choose_severity_int_vote`` then
    scores the record. The voted score is kept only when it is adjacent to
    the target, and then only about 30% of the time; otherwise the target
    severity is used as the label.

    The function samples from the module-level ``pain_types`` and
    ``durations`` lists and uses the global ``random`` state.

    Args:
        n_samples: Total number of records to generate. Per-level counts
            follow the 20/25/25/20/10 split; any records lost to rounding
            down are handed to the levels with the largest fractional
            remainder so that the total matches ``int(n_samples)``.

    Returns:
        A list of dicts with the keys ``pain_type``, ``radiates``,
        ``duration``, ``self_score``, ``sleep_score``, ``activity_score``,
        ``mood_score``, ``severity_score`` and ``severity_class``.
    """
    # Target share of each severity level in the generated dataset.
    severity_distribution = {
        1: 0.20,
        2: 0.25,
        3: 0.25,
        4: 0.20,
        5: 0.10,
    }

    # Largest-remainder allocation: round every quota down, then give the
    # leftover records to the levels that lost the most to truncation.
    # Plain truncation alone would return fewer rows than requested whenever
    # the shares do not divide n_samples evenly (e.g. 9 rows for 10).
    quotas = {level: n_samples * share for level, share in severity_distribution.items()}
    counts = {level: int(quota) for level, quota in quotas.items()}
    shortfall = max(int(n_samples) - sum(counts.values()), 0)
    by_remainder = sorted(quotas, key=lambda level: quotas[level] - counts[level], reverse=True)
    for level in by_remainder[:shortfall]:
        counts[level] += 1

    # Sampling profile per target severity:
    # (pain_types weights, durations weights, ["Yes", "No"] weights,
    #  candidate self scores).
    sampling_profiles = {
        1: ([0.1, 0.6, 0.15, 0.15], [0.5, 0.25, 0.15, 0.08, 0.02], [0.15, 0.85], [1, 2]),
        2: ([0.15, 0.5, 0.2, 0.15], [0.3, 0.25, 0.25, 0.15, 0.05], [0.3, 0.7], [2, 3]),
        3: ([0.2, 0.35, 0.25, 0.2], [0.15, 0.2, 0.3, 0.2, 0.15], [0.5, 0.5], [3, 4]),
        4: ([0.3, 0.15, 0.4, 0.15], [0.05, 0.08, 0.12, 0.35, 0.4], [0.75, 0.25], [4, 5]),
        5: ([0.4, 0.05, 0.5, 0.05], [0.01, 0.02, 0.05, 0.25, 0.67], [0.9, 0.1], [5]),
    }

    data = []
    for target_severity in range(1, 6):
        pain_weights, duration_weights, radiate_weights, self_options = sampling_profiles[target_severity]

        for _ in range(counts[target_severity]):
            pain_type = random.choices(pain_types, weights=pain_weights)[0]
            duration = random.choices(durations, weights=duration_weights)[0]
            radiates = random.choices(["Yes", "No"], weights=radiate_weights)[0]
            if len(self_options) > 1:
                # The first option (the target level itself) is favoured 70/30.
                self_score = random.choices(self_options, weights=[0.7, 0.3])[0]
            else:
                # Severity 5 always self-reports 5; no random draw is made.
                self_score = self_options[0]

            sleep_score, activity_score, mood_score = generate_correlated_scores(target_severity)

            severity_score = choose_severity_int_vote(
                pain_type, radiates, duration, self_score,
                sleep_score, activity_score, mood_score
            )

            # Keep labels close to the target: a vote more than one level
            # away is discarded, and an adjacent vote is replaced by the
            # target 70% of the time.
            if abs(severity_score - target_severity) > 1:
                severity_score = target_severity
            elif severity_score != target_severity and random.random() < 0.7:
                severity_score = target_severity

            data.append({
                "pain_type": pain_type,
                "radiates": radiates,
                "duration": duration,
                "self_score": self_score,
                "sleep_score": sleep_score,
                "activity_score": activity_score,
                "mood_score": mood_score,
                "severity_score": severity_score,
                "severity_class": severity_label(severity_score),
            })

    return data

# Build the synthetic records, wrap them in a DataFrame and persist as CSV.
data = generate_structured_data(n_samples=N_SAMPLES)
df = pd.DataFrame(data)
df.to_csv(
    f"../data/synthetic_users_dataset_{N_SAMPLES}.csv",
    index=False,
)
# Preview the first rows (displayed when this is the last expression of a cell).
df.head()

Unnamed: 0,pain_type,radiates,duration,self_score,sleep_score,activity_score,mood_score,severity_score,severity_class
0,Dull,No,Yesterday,1,5,5,5,1,Mild
1,Dull,Yes,Yesterday,1,4,5,4,1,Mild
2,Dull,Yes,Today,2,4,5,5,2,Mild
3,Dull,No,Yesterday,2,4,4,4,1,Mild
4,Dull,No,Yesterday,2,5,5,5,1,Mild


In [13]:
# Report the dataset size, then the label and category frequencies.
print(f"Generated {len(data)} samples.")
# Severity counts are listed in level order rather than by frequency.
print(df["severity_score"].value_counts().sort_index())
print(
    df["pain_type"].value_counts(),
    df["duration"].value_counts(),
    sep="\n",
)

Generated 2000 samples.
severity_score
1    367
2    477
3    506
4    373
5    277
Name: count, dtype: int64
pain_type
Dull         735
Burning      557
Sharp        395
Throbbing    313
Name: count, dtype: int64
duration
Today                   465
Several days ago        402
Last week               398
More than a week ago    391
Yesterday               344
Name: count, dtype: int64
