In [None]:
import numpy as np
import matplotlib.pyplot as plt

from src.data_generator import generate_data_from_csv
from src.train_tree import train_tree
from src.interpret_tree import interpret_tree

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.tree import plot_tree

In [None]:
# Build the synthetic dataset from the English pronoun table.
# NOTE(review): accuracy_rate presumably controls how many generated labels are
# left uncorrupted and n the number of samples — confirm in src.data_generator.
feature_names, morph_list, X, y, error_rate = generate_data_from_csv(
    "data/English_pronouns.csv", n=10000, accuracy_rate=0.95
)

# Report what the generator returned, one labelled line per item.
for caption, payload in (
    ("Feature names:", feature_names),
    ("Unique morphs:", morph_list),
    ("Proportion of errors in synthetic data:", error_rate),
):
    print(caption, payload)

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the
# labels and seeded so a fresh-kernel re-run yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# Use the trainer that the imports cell actually brings into scope
# (`from src.train_tree import train_tree`); the previously used name
# `train_and_evaluate_decision_tree` is not defined anywhere in this notebook
# and raised NameError on Restart & Run All.
# NOTE(review): min_imp_dec presumably maps to the tree's minimum impurity
# decrease — confirm against src.train_tree.
clf = train_tree(X_train, y_train, min_imp_dec=0.01)

# Score the fitted classifier on the held-out split.
y_pred = clf.predict(X_test)
print("\nTest macro F1 score:", f1_score(y_test, y_pred, average="macro"))
print("\nClassification report:\n", classification_report(y_test, y_pred))

In [None]:
# Draw the fitted tree on an explicit, large Axes so node labels stay legible.
fig, ax = plt.subplots(figsize=(20, 20))

# Take the class labels from the fitted classifier rather than from the full
# label vector: clf.classes_ is the ordering the tree's value arrays use.
# Converting to a plain list of str keeps plot_tree happy across scikit-learn
# versions (some validate class_names as a list) and for non-string labels.
class_labels = [str(label) for label in clf.classes_]

plot_tree(
    clf,
    feature_names=feature_names,
    class_names=class_labels,
    filled=True,
    rounded=True,
    fontsize=8,
    impurity=True,
    ax=ax,
)
plt.show()

In [None]:
# Use the interpreter that the imports cell actually brings into scope
# (`from src.interpret_tree import interpret_tree`); the previously used name
# `interpret_decision_tree` is not defined anywhere in this notebook and
# raised NameError on Restart & Run All.
specifications = interpret_tree(clf, feature_names)

# Print each extracted rule on its own line.
print("Extracted morphological rules:")
for spec in specifications:
    print(spec)