In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
from tqdm import tqdm

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
with open('styledata/train/formal', encoding='utf-8') as f:
    formal_train = [line.strip() for line in f if line.strip()]
    
with open('styledata/test/formal', encoding='utf-8') as f:
    formal_test = [line.strip() for line in f if line.strip()]

with open('styledata/train/informal', encoding='utf-8') as f:
    informal_train = [line.strip() for line in f if line.strip()]
    
with open('styledata/test/informal', encoding='utf-8') as f:
    informal_test = [line.strip() for line in f if line.strip()]

X = formal_train + informal_train
y = [0] * len(formal_train) + [1] * len(informal_train) # 0 is formal, 1 is informal

X_test = formal_test + informal_test
y_test = [0] * len(formal_test) + [1] * len(informal_test)

In [4]:
X

["The In-Laws movie isn't a holiday movie, but it's okay.",
 "I don't think that page gave me viruses.",
 'I watch it everyday, my favorite charachter is Inuasha.',
 'Funbrain.com and runescape.com are great for family fun.',
 "He was on the Late Night show with Conan O'Brien and seemed gay.",
 'Mel Gibson is a strong believed of God.',
 'My exams are not over yet because I still have one left.',
 'Sky Kids 3D: game over.',
 'His mom was a wafer so long.',
 'I am entirely off or does that help.',
 'Her name is Jessica.',
 'www.us.imdb.com is probably one of your best bets.',
 'Titanic is my favorite movie and I cry when I see it.',
 'It gives us something to do.',
 'He is debonair, or perhaps more than that.',
 'Many people enjoy the MLB.',
 'I do not possess that model, but I am partial to my Creative Zen Micro',
 'I viewed it and I believe it is a quality program.',
 'She is a poor vocalist.',
 'He is very attractive.',
 'This is an example of something someone, who is age 20 and mar

In [5]:
clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('lr', LogisticRegression(max_iter=1000))
])
clf.fit(X, y)

In [5]:
sentences = [
    "I am writing to inform you about the recent developments.",
    "That is so lame, bro.",
    "Could you please clarify your previous statement?",
    "Hey, can you explain what you meant?",
]
for s in sentences:
    clf.predict([s])
    print(f"sentence: '{s}' => {'formal' if clf.predict([s])[0] == 0 else 'informal'}; formal probability {clf.predict_proba([s])[0][0]}")

sentence: 'I am writing to inform you about the recent developments.' => formal; formal probability 0.9722046530599819
sentence: 'That is so lame, bro.' => informal; formal probability 0.4003224125846151
sentence: 'Could you please clarify your previous statement?' => formal; formal probability 0.9335817071456372
sentence: 'Hey, can you explain what you meant?' => informal; formal probability 0.4709779927178809


In [7]:
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.87      0.80      1082
           1       0.89      0.77      0.82      1416

    accuracy                           0.81      2498
   macro avg       0.81      0.82      0.81      2498
weighted avg       0.82      0.81      0.81      2498



In [18]:
# def filter(sentence, digit_ratio=0.1, symbol_ratio=0.15, min_length=5):
#     if len(sentence) < min_length:
#         return False

#     digit_count = sum(c.isdigit() for c in sentence)
#     symbol_count = sum(not c.isalnum() and not c.isspace() for c in sentence)
#     digit_ratio = digit_count / len(sentence)
#     symbol_ratio = symbol_count / len(sentence)
    
#     if digit_ratio > 0.1 or symbol_ratio > 0.1:
#         return False

#     return True
    
import re

DIGIT_REGEX = re.compile(r'\d')
SYMBOL_REGEX = re.compile(r'[^\w\s]')  # matches non-alphanumeric, non-whitespace
CITATION_REGEX = re.compile(r'\[.*?\]|\(.*?\)|\{.*?\}')

def fast_filter(sentence, max_digit_ratio=0.1, max_symbol_ratio=0.1, min_length=5):
    total = len(sentence)
    if total < min_length:
        return False

    digit_ratio = len(DIGIT_REGEX.findall(sentence)) / total
    symbol_ratio = len(SYMBOL_REGEX.findall(sentence)) / total

    if digit_ratio > max_digit_ratio or symbol_ratio > max_symbol_ratio:
        return False
    if CITATION_REGEX.search(sentence):
        return False

    return True

In [9]:
with open('./wikipedia-data/wikisent2.txt', encoding='utf-8') as f:
    wikipedia_sentences = [line.strip() for line in f if line.strip()]
print(f"Total sentences: {len(wikipedia_sentences)}")

Total sentences: 7871825


In [None]:
# Test the first 100000 sentences with fast_filter
test_sentences = wikipedia_sentences[:100000]
test_filtered = [s for s in tqdm(test_sentences) if fast_filter(s)]
print(f"Filtered sentences ratio for first 100k datapoints: {len(test_filtered) / len(test_sentences)}")

100%|██████████| 100000/100000 [00:00<00:00, 213220.65it/s]

Filtered sentences ratio for first 100k datapoints: 0.76333





In [20]:
filtered_sentences = [s for s in tqdm(wikipedia_sentences) if fast_filter(s)]
print(f"Filtered sentences: {len(filtered_sentences)}")
print(f"Filtered sentences ratio: {len(filtered_sentences)/len(wikipedia_sentences)}")

100%|██████████| 7871825/7871825 [00:34<00:00, 226289.64it/s]

Filtered sentences: 6427106
Filtered sentences ratio: 0.8164696242612101





In [21]:
high_formality_sentences = [
    s for s in tqdm(filtered_sentences)
    if clf.predict_proba([s])[0][0] > 0.9  # high probability of being formal
]
print(f"High-formality sentences: {len(high_formality_sentences)}")

100%|██████████| 6427106/6427106 [1:09:32<00:00, 1540.39it/s]

High-formality sentences: 1282221





In [22]:
with open("wikipedia-data/high_formality_sentences.txt", "w", encoding="utf-8") as f:
    for sentence in high_formality_sentences:
        f.write(sentence + "\n")
print("saved high formailty sentences to file")

saved high formailty sentences to file


In [6]:
with open("wikipedia-data/high_formality_sentences.txt", encoding="utf-8") as f:
    high_formality_sentences = [line.strip() for line in f if line.strip()]

sample_sentences = list(np.random.choice(high_formality_sentences, size=5000, replace=False))

In [8]:
import time
import random
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
client = OpenAI()

example_prompt = (
    "Make this sentence informal:\n"
    'Formal: "I am writing to request a refund."\n'
    'Informal: "Hey, I just wanna get my money back."\n'
    "---\n"
)
def clean_response(text):
    text = text.strip()
    if text.lower().startswith("informal:"):
        return text[len("informal:"):].strip()
    return text

def query(sentence, retries=5):
    prompt = example_prompt + f'Formal: "{sentence}"\nInformal:'
    for attempt in range(retries):
        try:
            response = client.responses.create(
                model="gpt-4o-mini",
                input=prompt
            )
            return sentence, clean_response(response.output_text)
        except Exception as e:
            wait = 2 ** attempt + random.uniform(0, 1)
            print(f"Retrying in {wait:.1f}s due to error: {e}")
            time.sleep(wait)
    return sentence, ""

# Run synchronously with progress bar
results = []
for sentence in tqdm(sample_sentences):
    results.append(query(sentence))

# Save to dataframe
import pandas as pd
df = pd.DataFrame(results, columns=["formal", "informal"])

100%|██████████| 5000/5000 [1:36:13<00:00,  1.15s/it]  


In [20]:
df.to_json("formal_informal_pairs.jsonl", orient="records", lines=True, force_ascii=False)
import json

with open("formal_informal_pairs.jsonl", "w", encoding="utf-8") as f:
    for formal, informal in results:
        if isinstance(formal, str) and isinstance(informal, str):
            obj = {"formal": formal.strip(), "informal": informal.strip()}
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
