# Sentence Segmentation - Rule-based and statistical methods

In [1]:
from sentence_segmentation_rules import rules
from utils import load, dump

In [2]:
# Run the rule-based segmenter on the playground gold file and save its output.
# NOTE(review): the input here is the *gold* file, whereas In [5] reads
# data/pa153_playground_test_all.txt as the raw text - confirm this is the intended input.
file = load('data/pa153_playground_gold_all.txt')
dump('output/all_version_gold.txt', rules(file))

Loaded data/pa153_playground_gold_all.txt.
Written results to output/all_version_gold.txt


In [3]:
# Load the 2025 test set; `file` is rebound here and consumed by the following cell.
file = load('data/pa153_2025_test_all.txt')

Loaded data/pa153_2025_test_all.txt.


In [4]:
# Apply the rule-based segmenter to the currently loaded `file` and write the result.
dump('output/all_version_2.txt', rules(file))

Written results to output/all_version_2.txt


In [5]:
from statistical_methods import get_labels, create_dataset, extract_features
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# Read the playground raw text and its gold counterpart in one go.
with open('data/pa153_playground_test_all.txt', 'r', encoding='utf-8') as raw_file, \
        open('data/pa153_playground_gold_all.txt', 'r', encoding='utf-8') as gold_file:
    raw = raw_file.read()
    gold = gold_file.read()

# Derive labels from the raw/gold pair, then build the table used for training.
boundary_labels = get_labels(raw, gold)
df = create_dataset(raw, boundary_labels)

In [6]:
# Fixed seed so that the split, the forest and the report below are reproducible
# after Restart Kernel -> Run All.
SEED = 42

# One feature dict per row of df; the 'label' column is the target.
X_dict = df.drop('label', axis=1).to_dict(orient='records')
y = df['label']

# DictVectorizer one-hot encodes string-valued features (e.g. punct_type)
# and passes numeric/boolean ones through as numbers.
vectorizer = DictVectorizer(sparse=False)
X_encoded = vectorizer.fit_transform(X_dict)

# Stratify so that the boundary / non-boundary ratio is the same in both splits;
# the classes are imbalanced, which is also why class_weight='balanced' is used below.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=SEED, stratify=y
)

clf = RandomForestClassifier(
    n_estimators=100, class_weight='balanced', random_state=SEED
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96        90
           1       0.98      0.98      0.98       238

    accuracy                           0.98       328
   macro avg       0.97      0.97      0.97       328
weighted avg       0.98      0.98      0.98       328



In [7]:
# Display the feature/label table returned by create_dataset.
df

Unnamed: 0,next_is_upper,next_is_lower,next_is_number,next_is_quote,prev_word_len,prev_is_upper,prev_is_title,prev_is_initial,prev_char_is_period,prev_char_is_punct,next_char_is_period,next_char_is_punct,next_next_char_is_period,punct_type,label
0,False,True,False,False,2,False,False,False,False,False,False,False,False,.,0
1,True,False,False,False,6,False,False,False,False,False,False,False,False,.,1
2,True,False,False,False,14,False,False,False,False,False,False,False,False,.,1
3,True,False,False,False,1,True,False,True,False,False,False,False,False,.,0
4,True,False,False,False,7,False,False,False,False,False,True,False,True,.,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1634,True,False,False,False,9,False,False,False,False,False,False,False,False,.,1
1635,True,False,False,False,8,False,False,False,False,False,False,False,False,.,1
1636,True,False,False,False,5,False,False,False,False,False,False,False,False,.,1
1637,True,False,False,False,7,False,False,False,False,False,False,False,False,.,1


In [9]:
def segment_text(raw_text, model, vectorizer):
    """
    Split raw_text into sentences, one per line, using a trained boundary classifier.

    Every '.' in raw_text is a candidate boundary. Features for each candidate
    come from extract_features(raw_text, index); they are encoded with
    vectorizer and classified by model. Where the model predicts 1, the text
    is cut right after the period (or after a closing quote that directly
    follows the period).

    Args:
        raw_text: The text to segment.
        model: Fitted classifier exposing predict(); a prediction of 1 marks
            a sentence boundary.
        vectorizer: Fitted vectorizer exposing transform() for the features
            produced by extract_features.

    Returns:
        The whitespace-stripped sentences joined by a newline character.
        No empty trailing line is produced when the text ends in whitespace.
    """
    # Only periods are candidates: treating '?' and '!' as candidates reduced precision.
    candidate_positions = [i for i, char in enumerate(raw_text) if char == '.']
    if not candidate_positions:
        # Nothing to classify (also avoids calling predict() on an empty batch).
        return raw_text.strip()

    # Features depend only on raw_text and the position, so all candidates can be
    # encoded and classified in a single batch instead of one predict() per period.
    candidate_feats = [extract_features(raw_text, i) for i in candidate_positions]
    predictions = model.predict(vectorizer.transform(candidate_feats))

    closing_quotes = ('"', "'", '”')
    segments = []
    last_cut_idx = 0

    for i, prediction in zip(candidate_positions, predictions):
        if prediction != 1:
            continue

        cut_point = i + 1  # cut after the period
        # Keep a closing quote that directly follows the period in this sentence.
        if cut_point < len(raw_text) and raw_text[cut_point] in closing_quotes:
            cut_point += 1

        segments.append(raw_text[last_cut_idx:cut_point].strip())
        last_cut_idx = cut_point

    # Whatever follows the last boundary; skipped when it is only whitespace so
    # that the result does not end with a spurious empty line.
    tail = raw_text[last_cut_idx:].strip()
    if tail:
        segments.append(tail)

    return "\n".join(segments)

# Segment the 2025 test set with the trained classifier and store the result.
with open('data/pa153_2025_test_all.txt', encoding='utf-8', mode='r') as f:
    raw_content = f.read()

final_output = segment_text(
    raw_text=raw_content,
    model=clf,
    vectorizer=vectorizer,
)

with open('output/stats_version_2.txt', encoding='utf-8', mode='w') as f:
    f.write(final_output)