In [2]:
import random
import pandas as pd

In [3]:
def _get_analysis_score(difference_analysis: dict, wer: float) -> dict:
    weights = {
        "clarity": {"wer": 0.9, "syllables": 0.1},
        "speed": {
            "speech_rate": 0.7, 
            "speaking_duration": 0.15, 
            "total_duration": 0.15
        },
        "articulation": {"articulation_rate": 0.8, "syllables": 0.2},
        "rythm": {"ratio": 0.7, "pauses": 0.3},
    }

    def compute_score(criteria_weight: dict) -> int:
        score = 0
        for metric, weight in criteria_weight.items():
            diff = wer if metric == "wer" else abs(difference_analysis.get(metric, 0))
            diff = min(diff, 1)
            metric_score = (1 - diff) * weight
            score += metric_score
        return max(0, min(10, round(score * 10)))

    clarity = compute_score(weights["clarity"])
    speed = compute_score(weights["speed"])
    articulation = compute_score(weights["articulation"])
    rythm = compute_score(weights["rythm"])
    total = round((clarity + speed + articulation + rythm) * 2.5)

    return total

def classify_score(score: int) -> int:
    """Translate a total score into a class label.

    Returns:
        0 (beginner) when ``score <= 40``, 1 (intermediate) when
        ``score <= 79``, otherwise 2 (advanced).
    """
    # Inclusive upper bound of every class except the top one, ascending.
    upper_bounds = (40, 79)
    for level, bound in enumerate(upper_bounds):
        if score <= bound:
            return level
    # Above every bound: the top class (advanced).
    return len(upper_bounds)

def generate_instance() -> dict:
    """Draw one random sample and attach its class label.

    Every feature is an independent ``random.random()`` draw rounded to one
    decimal place. The label is obtained by scoring the sample with
    ``_get_analysis_score`` and mapping the score through ``classify_score``.

    Returns:
        A dict holding the eight feature values plus a ``"label"`` entry.
    """
    # Order matters: it fixes both the sequence of RNG draws and the column
    # order of any DataFrame built from these dicts.
    feature_names = (
        "number_of_syllables",
        "number_of_pauses",
        "speech_rate",
        "articulation_rate",
        "speaking_duration",
        "total_duration",
        "ratio",
        "wer",
    )
    sample = {name: round(random.random(), 1) for name in feature_names}

    # The scorer uses shorter keys for two of the features and receives
    # `wer` as a separate argument rather than inside the mapping.
    scorer_keys = {"number_of_syllables": "syllables", "number_of_pauses": "pauses"}
    differences = {
        scorer_keys.get(name, name): value
        for name, value in sample.items()
        if name != "wer"
    }

    score = _get_analysis_score(differences, sample["wer"])
    sample["label"] = classify_score(score)
    return sample

def generate_balanced_dataset(n: int = 600, seed: "int | None" = None) -> pd.DataFrame:
    """Build a synthetic dataset with the same number of rows per label.

    Instances are drawn one at a time with ``generate_instance`` and kept only
    while their class (0, 1 or 2) is still below its quota; draws for classes
    that are already full are discarded.

    Args:
        n: Requested dataset size. Each class receives ``n // 3`` rows, so the
            result has ``3 * (n // 3)`` rows: fewer than ``n`` when ``n`` is
            not a multiple of 3, and no rows at all when ``n < 3``.
        seed: Optional seed. When given, the global ``random`` module is
            re-seeded with it before sampling so the same dataset is produced
            on every call; note that this affects every other user of the
            global generator. The default ``None`` leaves the generator
            state untouched.

    Returns:
        A DataFrame with one row per kept instance, in generation order. Rows
        are not shuffled, so classes that fill up last are concentrated
        toward the end.
    """
    if seed is not None:
        # generate_instance draws from the module-level generator, so that is
        # the one that has to be seeded.
        random.seed(seed)

    target_per_class = n // 3
    data = []
    counts = {0: 0, 1: 0, 2: 0}

    # Keep sampling until every class has reached its quota.
    while any(v < target_per_class for v in counts.values()):
        inst = generate_instance()
        label = inst["label"]
        if counts[label] < target_per_class:
            data.append(inst)
            counts[label] += 1

    return pd.DataFrame(data)

In [4]:

# Seed the global RNG first so that Restart & Run All regenerates the same rows.
random.seed(42)
df = generate_balanced_dataset(900)



In [5]:
# Sanity check: every label should appear the same number of times.
print(df["label"].value_counts())

label
1    300
0    300
2    300
Name: count, dtype: int64


In [10]:
# Preview the first five generated rows.
df.head(5)

Unnamed: 0,number_of_syllables,number_of_pauses,speech_rate,articulation_rate,speaking_duration,total_duration,ratio,wer,label
0,0.9,0.3,0.9,0.5,0.6,0.2,0.2,0.6,1
1,1.0,0.6,0.4,0.6,0.6,0.1,0.5,0.3,1
2,0.4,0.2,0.6,0.3,0.2,0.2,0.8,0.2,1
3,0.4,0.6,0.7,0.2,0.6,0.6,0.5,0.5,1
4,0.9,0.4,0.4,0.3,0.3,0.9,0.7,0.5,1


In [7]:
# Save the dataset as dataset.csv (path is relative to the working directory),
# leaving out the index column.
df.to_csv("dataset.csv", index=False)