# Sentence Segmentation - Rule-based and statistical methods

In [1]:
from sentence_segmentation_rules import rules
from utils import load, dump

In [2]:
# Load the playground gold file, apply `rules` to it and write the result
# to the output directory (load/dump print a confirmation line each).
file = load('data/pa153_playground_gold_all.txt')
dump('output/all_version_gold.txt', rules(file))

Loaded data/pa153_playground_gold_all.txt.
Written results to output/all_version_gold.txt


In [3]:
# Load the 2025 test file; this rebinds `file`, which the next cell consumes.
file = load('data/pa153_2025_test_all.txt')

Loaded data/pa153_2025_test_all.txt.


In [4]:
# Apply `rules` to the currently loaded `file` and write the output.
dump('output/rule_version_2.txt', rules(file))

Written results to output/rule_version_2.txt


In [5]:
from statistical_methods import get_labels, create_dataset, extract_features
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# Read the playground test file and the playground gold file as UTF-8 text.
with open('data/pa153_playground_test_all.txt', 'r', encoding='utf-8') as f:
    raw = f.read()
with open('data/pa153_playground_gold_all.txt', 'r', encoding='utf-8') as f:
    gold = f.read()

# Derive labels from the raw/gold pair (semantics live in statistical_methods.get_labels).
boundary_labels = get_labels(raw, gold)

# Build the table used for training below; it carries feature columns plus a `label` column.
df = create_dataset(raw, boundary_labels)

In [6]:
# Preview the dataset (the notebook display truncates rows and columns).
df

Unnamed: 0,next_is_upper,next_is_lower,next_is_number,next_is_quote,prev_is_upper,prev_is_number,prev_is_abbrev_or_initial,prev_word_preceded_by_open_paren,prev_prev_is_abbrev,prev_char_is_digit,...,prev_char_is_closing_paren,prev_char_is_percent,prev_char_is_space,next_char_is_period,next_char_is_punct,next_char_is_closer,next_char_is_closing_paren,is_ellipsis,label,next_is_uper
0,False,True,False,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,0,
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,1,
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,1,
3,True,False,False,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,0,
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,True,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1634,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,1,
1635,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,1,
1636,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,1,
1637,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,1,


In [7]:
# Convert every row (minus the target) into a dict so DictVectorizer can encode it.
# NOTE(review): the displayed frame has a `next_is_uper` column that appears empty
# next to `next_is_upper` - presumably a typo where the dataset is built; confirm.
X_dict = df.drop('label', axis=1).to_dict(orient='records')
y = df['label']

# sparse=False makes the vectorizer return a dense array.
vectorizer = DictVectorizer(sparse=False,)
X_encoded = vectorizer.fit_transform(X_dict)

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.1, random_state=42)

# class_weight gives class 0 a weight of 40 versus 1 for class 1.
clf = RandomForestClassifier(n_estimators=100, class_weight={0: 40, 1: 1}, random_state=42)
clf.fit(X_train, y_train)

# Report per-class precision/recall/F1 on the held-out rows.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.97      0.91        37
           1       0.99      0.95      0.97       127

    accuracy                           0.96       164
   macro avg       0.92      0.96      0.94       164
weighted avg       0.96      0.96      0.96       164



In [8]:
def segment_text(raw_text, model, vectorizer):
    """
    Split raw_text into one segment per line using a boundary classifier.

    Every '.', '?' or '!' in raw_text is a candidate boundary. Features for
    all candidates are extracted with extract_features, encoded with
    vectorizer and classified in a single model.predict call (instead of one
    call per candidate). The text is cut after each candidate predicted as 1;
    a quote character directly following the punctuation stays with the
    segment it follows.

    Args:
        raw_text: Text to segment.
        model: Fitted classifier exposing predict(); a prediction of 1 marks
            a boundary.
        vectorizer: Fitted vectorizer exposing transform() for a list of
            feature dicts.

    Returns:
        The segments joined by newlines, each stripped of surrounding
        whitespace, with blank lines removed.
    """
    terminators = ('.', '?', '!')
    # NOTE(review): '„' is normally an opening quote and the closing '“'
    # (U+201C) is not listed - confirm against the data before changing.
    trailing_quotes = ('"', "'", '”', "„")

    text_len = len(raw_text)
    candidates = [i for i, char in enumerate(raw_text) if char in terminators]

    pieces = []
    last_cut_idx = 0

    # Skip the model entirely when there is nothing to classify: predicting on
    # an empty batch is not guaranteed to be supported.
    if candidates:
        feats = [extract_features(raw_text, i) for i in candidates]
        predictions = model.predict(vectorizer.transform(feats))

        for i, prediction in zip(candidates, predictions):
            if prediction != 1:
                continue

            cut_point = i + 1
            if cut_point < text_len and raw_text[cut_point] in trailing_quotes:
                cut_point += 1

            pieces.append(raw_text[last_cut_idx:cut_point])
            last_cut_idx = cut_point

    # Keep whatever follows the last predicted boundary.
    if last_cut_idx < text_len:
        pieces.append(raw_text[last_cut_idx:])

    # Newlines already present in raw_text also act as separators here.
    lines = (line.strip() for line in "\n".join(pieces).splitlines())
    return "\n".join(line for line in lines if line)


In [9]:
# Segment the 2025 test file with the trained classifier and save the result.
with open('data/pa153_2025_test_all.txt', 'r', encoding='utf-8') as f:
    raw_content = f.read()

# Uses `clf` and `vectorizer` fitted in the training cell.
final_output = segment_text(raw_content, clf, vectorizer)

with open('output/stats_version_4.txt', 'w', encoding='utf-8') as f:
    f.write(final_output)

In [10]:
# Segment the playground test file (the same text the dataset was built from)
# with the trained classifier and save the result.
with open('data/pa153_playground_test_all.txt', 'r', encoding='utf-8') as f:
    raw_content = f.read()

# Rebinds `raw_content` / `final_output` from the previous cell.
final_output = segment_text(raw_content, clf, vectorizer)

with open('output/stats_playground.txt', 'w', encoding='utf-8') as f:
    f.write(final_output)

In [11]:
import pandas as pd
from IPython.display import display

def inspect_sentence(text, model, vectorizer):
    """
    Display the model's decision for each potential boundary in the text.

    For every '.', '?' or '!' the features from extract_features are encoded
    with vectorizer, classified with model, and shown in one table row
    together with the surrounding text and the predicted probabilities.

    Args:
        text: Text to inspect.
        model: Fitted classifier exposing predict() and predict_proba();
            column 0 of predict_proba is reported as "not boundary" and
            column 1 as "boundary".
        vectorizer: Fitted vectorizer exposing transform() and feature_names_.

    Returns:
        None. The table is rendered with display(); a message is printed
        instead when the text has no candidate punctuation.
    """
    context_width = 15
    inspection_data = []

    for i, char in enumerate(text):
        if char in ['.', '?', '!']:
            features = extract_features(text, i)

            # Transform features and make a prediction
            features_encoded = vectorizer.transform([features])
            prediction = model.predict(features_encoded)[0]
            probabilities = model.predict_proba(features_encoded)[0]

            # Clamp the left edge at 0: a negative slice start would wrap
            # around to the end of the text and yield an empty or wrong
            # context for candidates in the first few characters.
            left_context = text[max(0, i - context_width):i]
            right_context = text[i + 1:i + context_width]

            # Store data for display
            data_point = {
                'index': i,
                'char': char,
                'context': f"...{left_context}`{char}`{right_context}...",
                'prediction': 'Boundary' if prediction == 1 else 'Not a Boundary',
                'prob_not_boundary': probabilities[0],
                'prob_boundary': probabilities[1],
            }
            data_point.update(features)
            inspection_data.append(data_point)

    if not inspection_data:
        print("No potential sentence boundaries found.")
        return

    df = pd.DataFrame(inspection_data)

    feature_names = sorted(vectorizer.feature_names_)
    core_cols = ['index', 'char', 'context', 'prediction', 'prob_boundary', 'prob_not_boundary']

    # Features known to the vectorizer but absent from the extracted dicts
    # still get a (None-filled) column so the table layout is stable.
    for col in feature_names:
        if col not in df.columns:
            df[col] = None

    display_cols = core_cols + [f for f in feature_names if f in df.columns and f not in core_cols]

    # Lift pandas' truncation limits so every row and column is shown.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
        display(df[display_cols])

# inspect_sentence("")

Unnamed: 0,index,char,context,prediction,prob_boundary,prob_not_boundary,is_ellipsis,next_char_is_closer,next_char_is_closing_paren,next_char_is_period,next_char_is_punct,next_is_lower,next_is_number,next_is_quote,next_is_uper,next_is_upper,prev_char_is_closer,prev_char_is_closing_paren,prev_char_is_digit,prev_char_is_percent,prev_char_is_period,prev_char_is_punct,prev_char_is_space,prev_is_abbrev_or_initial,prev_is_number,prev_is_upper,prev_prev_is_abbrev,prev_word_preceded_by_open_paren
0,86,.,...kami jako první`.` Podpůrný a ga...,Boundary,0.845817,0.154183,False,False,False,False,False,False,False,False,,True,False,False,False,False,False,False,False,False,False,False,False,False
1,190,.,"...ářským záložnám`.` Myslím, že na...",Boundary,0.845817,0.154183,False,False,False,False,False,False,False,False,,True,False,False,False,False,False,False,False,False,False,False,False,False
2,292,.,...sto irelevantní`.` Přitom mlž M....,Boundary,0.845817,0.154183,False,False,False,False,False,False,False,False,,True,False,False,False,False,False,False,False,False,False,False,False,False
3,306,.,...í. Přitom mlž M`.` arenaria byl ...,Not a Boundary,0.0,1.0,False,False,False,False,False,True,False,False,,False,False,False,False,False,False,False,False,True,False,True,False,False
4,415,.,... 2 milióny let)`.` Ladislav Ková...,Boundary,1.0,0.0,False,False,False,False,False,False,False,False,,True,False,True,False,False,False,False,False,False,False,False,False,False
5,548,.,...je na 2500 Romů`.` Liberální spo...,Boundary,0.845817,0.154183,False,False,False,False,False,False,False,False,,True,False,False,False,False,False,False,False,False,False,False,False,False
6,628,.,... být nepochybné`.` Domácí sice v...,Boundary,0.845817,0.154183,False,False,False,False,False,False,False,False,,True,False,False,False,False,False,False,False,False,False,False,False,False
7,717,.,... otočili na 4:5`.` V témže čísle...,Boundary,0.609797,0.390203,False,False,False,False,False,False,False,False,,True,False,False,True,False,False,False,False,False,True,False,False,False
8,803,.,"...ilné"" živočichy`.` Právě s techn...",Boundary,0.845817,0.154183,False,False,False,False,False,False,False,False,,True,False,False,False,False,False,False,False,False,False,False,False,False
9,862,.,... zatím na štíru`.` Předseda Nezá...,Boundary,0.823269,0.176731,False,False,False,False,False,False,False,False,,True,False,False,False,False,False,False,False,False,False,False,True,False
