In [1]:
import os
from pathlib import Path

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Load the train / validation / test splits of each corpus. Every group is
# read in (train, val, test) order and unpacked into its own DataFrame.

# CNN / DailyMail news articles
df_train_cnn_article, df_val_cnn_article, df_test_cnn_article = (
    pd.read_parquet(parquet_file)
    for parquet_file in (
        "../Article/cnn_daily_news/cnn_data/train_article_cnn_5000.parquet",
        "../Article/cnn_daily_news/cnn_data/val_article_cnn_1000.parquet",
        "../Article/cnn_daily_news/cnn_data/test_article_cnn_1000.parquet",
    )
)

# arXiv research papers
df_train_arxiv_paper, df_val_arxiv_paper, df_test_arxiv_paper = (
    pd.read_parquet(parquet_file)
    for parquet_file in (
        "../arxiv_research/arxiv_data/train_article_arxiv_5000.parquet",
        "../arxiv_research/arxiv_data/val_article_arxiv_1000.parquet",
        "../arxiv_research/arxiv_data/test_article_arxiv_1000.parquet",
    )
)

# Enron e-mails
df_train_email, df_val_email, df_test_email = (
    pd.read_parquet(parquet_file)
    for parquet_file in (
        "../enron_email/enron_train_5000.parquet",
        "../enron_email/enron_val_1000.parquet",
        "../enron_email/enron_test_1000.parquet",
    )
)

# Reddit posts
df_train_reddit_post, df_val_reddit_post, df_test_reddit_post = (
    pd.read_parquet(parquet_file)
    for parquet_file in (
        "../reddit/reddit_train_5000.parquet",
        "../reddit/reddit_val_1000.parquet",
        "../reddit/reddit_test_1000.parquet",
    )
)

In [None]:
# Hugging Face access token for the gated Llama weights. It is read from the
# HF_TOKEN environment variable rather than pasted into the notebook, so the
# secret never ends up in a saved or committed file. An unset/empty variable
# becomes None, which lets the Hub client fall back to a locally stored login.
token_hf = os.environ.get("HF_TOKEN") or None
model_id = "meta-llama/Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id, token=token_hf)
# The summarizers pass a pad id to generate(); reuse EOS when the tokenizer
# does not define a dedicated padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# bfloat16 weights, placed automatically across the available devices, with the
# downloaded checkpoint cached next to the notebook.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=token_hf,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    cache_dir="./llama_model",
)

1


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
def summarize_llama3_article(text: str) -> str:
    """Summarize a news article with the notebook's Llama 3.1 chat model.

    Builds a system + user chat prompt asking for a journalistic briefing,
    samples a completion from the notebook-level ``model`` and returns only the
    newly generated text.

    Args:
        text: Article body to summarize. Non-string values are converted with
            ``str()`` (the same coercion the prompt f-string would apply).

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped.

    Notes:
        Uses the notebook globals ``tokenizer`` and ``model``. Sampling is on
        (``do_sample=True``) and no seed is set here, so repeated calls may
        return different summaries.
    """
    max_prompt_tokens = 32000
    # Headroom for the system prompt, the instructions and the chat-template
    # markers that get wrapped around the article.
    instruction_reserve = 512

    if not isinstance(text, str):
        text = str(text)

    # Clip over-long articles *before* templating. Truncating the finished
    # prompt from the right would drop the trailing assistant header, so the
    # model would continue the article instead of answering.
    article_budget = max_prompt_tokens - instruction_reserve
    article_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    if len(article_ids) > article_budget:
        text = tokenizer.decode(article_ids[:article_budget])

    # NOTE(review): the instruction asks for "six to seven" sentences and then
    # forbids anything other than seven; the wording is left untouched so newly
    # generated summaries stay comparable with existing ones.
    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert news editor. Your goal is to produce a clear, objective, "
                "journalistic-style briefing. You must strictly follow formatting rules."
            ),
        },
        {
            "role": "user",
            "content": (
                "Summarize the following article in EXACTLY six to seven complete sentences. "
                "This is mandatory. Do not generate fewer or more than seven sentences.\n\n"
                "Requirements:\n"
                "- Journalistic briefing tone (Inverted Pyramid style)\n"
                "- No bullet points, no lists\n"
                "- No citations or references\n"
                "- No markdown formatting\n"
                "- Do not copy any sentence from the article; rephrase everything\n\n"
                f"Article:\n{text}"
            ),
        },
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        # The rendered chat template already contains the special tokens
        # (including BOS); adding them again would duplicate BOS.
        add_special_tokens=False,
        # Safety net only; the article was already clipped above.
        truncation=True,
        max_length=max_prompt_tokens,
    ).to(model.device)

    # Stop on the end-of-turn token; fall back to the tokenizer's EOS id when
    # the lookup returns a falsy value.
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>") or tokenizer.eos_token_id

    outputs = model.generate(
        **inputs,
        max_new_tokens=800,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        eos_token_id=eot_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Skip the prompt positions so only the newly generated tokens are decoded.
    prompt_len = inputs["input_ids"].shape[1]
    gen_ids = outputs[0, prompt_len:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()



In [12]:


# Summarize every CNN training article and store the results in one CSV file.
out_path = Path("train_article_cnn_summaries_llama31_v2.csv")
rows = []

for idx, article in enumerate(df_train_cnn_article["article"]):
    # Lightweight progress marker: print the running index every 100 articles.
    if idx % 100 == 0:
        print(idx)
    rows.append(
        dict(
            doc_id=idx,  # position of the article within the column
            text=article,
            summary=summarize_llama3_article(article),
            dataset="cnn_daily_news",
            model="meta-llama/Llama-3.1-8B-Instruct",
        )
    )

# Fix the column order explicitly so the CSV layout does not depend on dict order.
df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary", "dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
Saved 5000 rows to train_article_cnn_summaries_llama31_v2.csv


In [13]:
# Summarize every CNN validation article and store the results in one CSV file.
out_path = Path("val_article_cnn_summaries_llama31_v2.csv")

# One record per article; doc_id is the article's position in the column.
rows = [
    {
        "doc_id": position,
        "text": article,
        "summary": summarize_llama3_article(article),
        "dataset": "cnn_daily_news",
        "model": "meta-llama/Llama-3.1-8B-Instruct",
    }
    for position, article in enumerate(df_val_cnn_article["article"])
]

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary", "dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 1000 rows to val_article_cnn_summaries_llama31_v2.csv


In [14]:
# Summarize every CNN test article and store the results in one CSV file.
out_path = Path("test_article_cnn_summaries_llama31_v2.csv")

# One record per article; doc_id is the article's position in the column.
rows = [
    {
        "doc_id": position,
        "text": article,
        "summary": summarize_llama3_article(article),
        "dataset": "cnn_daily_news",
        "model": "meta-llama/Llama-3.1-8B-Instruct",
    }
    for position, article in enumerate(df_test_cnn_article["article"])
]

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary", "dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 1000 rows to test_article_cnn_summaries_llama31_v2.csv


In [None]:
def summarize_llama3_research(text: str) -> str:
    """Summarize a research article with the notebook's Llama 3.1 chat model.

    Builds a system + user chat prompt asking for an abstract-style summary,
    samples a completion from the notebook-level ``model`` and returns only the
    newly generated text.

    Args:
        text: Article body to summarize. Non-string values are converted with
            ``str()`` (the same coercion the prompt f-string would apply).

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped.

    Notes:
        Uses the notebook globals ``tokenizer`` and ``model``. Sampling is on
        (``do_sample=True``) and no seed is set here, so repeated calls may
        return different summaries.
    """
    max_prompt_tokens = 32000
    # Headroom for the system prompt, the instructions and the chat-template
    # markers that get wrapped around the article.
    instruction_reserve = 512

    if not isinstance(text, str):
        text = str(text)

    # Clip over-long papers *before* templating. Truncating the finished prompt
    # from the right would drop the trailing assistant header, so the model
    # would continue the paper instead of answering.
    article_budget = max_prompt_tokens - instruction_reserve
    article_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    if len(article_ids) > article_budget:
        text = tokenizer.decode(article_ids[:article_budget])

    # NOTE(review): the instruction asks for "six to seven" sentences and then
    # forbids anything other than seven; the wording is left untouched so newly
    # generated summaries stay comparable with existing ones.
    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert summarizer. Your goal is to produce a clear, factual, "
                "abstract-style summary. You must strictly follow formatting rules."
            ),
        },
        {
            "role": "user",
            "content": (
                "Summarize the following article in EXACTLY six to seven complete sentences. "
                "This is mandatory. Do not generate fewer or more than seven sentences.\n\n"
                "Requirements:\n"
                "- Abstract-style academic writing\n"
                "- No bullet points, no lists\n"
                "- No citations or references\n"
                "- No markdown formatting\n"
                "- Do not copy any sentence from the article; rephrase everything\n\n"
                f"Article:\n{text}"
            ),
        },
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        # The rendered chat template already contains the special tokens
        # (including BOS); adding them again would duplicate BOS.
        add_special_tokens=False,
        # Safety net only; the article was already clipped above.
        truncation=True,
        max_length=max_prompt_tokens,
    ).to(model.device)

    # Stop on the end-of-turn token; fall back to the tokenizer's EOS id when
    # the lookup returns a falsy value.
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>") or tokenizer.eos_token_id

    outputs = model.generate(
        **inputs,
        max_new_tokens=800,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        eos_token_id=eot_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Skip the prompt positions so only the newly generated tokens are decoded.
    prompt_len = inputs["input_ids"].shape[1]
    gen_ids = outputs[0, prompt_len:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()





In [9]:


# Summarize every arXiv training paper and store the results in one CSV file.
out_path = Path("train_article_arxiv_summaries_llama31.csv")

# One record per paper; doc_id is the paper's position in the column.
rows = [
    {
        "doc_id": position,
        "text": paper,
        "summary": summarize_llama3_research(paper),
        "dataset": "arxiv",
        "model": "meta-llama/Llama-3.1-8B-Instruct",
    }
    for position, paper in enumerate(df_train_arxiv_paper["article"])
]

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary", "dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 5000 rows to train_article_arxiv_summaries_llama31.csv


In [11]:
# Summarize every arXiv validation paper and store the results in one CSV file.
out_path = Path("val_article_arxiv_summaries_llama31.csv")

# One record per paper; doc_id is the paper's position in the column.
rows = [
    {
        "doc_id": position,
        "text": paper,
        "summary": summarize_llama3_research(paper),
        "dataset": "arxiv",
        "model": "meta-llama/Llama-3.1-8B-Instruct",
    }
    for position, paper in enumerate(df_val_arxiv_paper["article"])
]

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary", "dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 1000 rows to val_article_arxiv_summaries_llama31.csv


In [20]:
# Summarize every arXiv test paper and store the results in one CSV file.
out_path = Path("test_article_arxiv_summaries_llama31.csv")

# One record per paper; doc_id is the paper's position in the column.
rows = [
    {
        "doc_id": position,
        "text": paper,
        "summary": summarize_llama3_research(paper),
        "dataset": "arxiv",
        "model": "meta-llama/Llama-3.1-8B-Instruct",
    }
    for position, paper in enumerate(df_test_arxiv_paper["article"])
]

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary", "dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 1000 rows to test_article_arxiv_summaries_llama31.csv


In [8]:
def summarize_reddit_llama3(text: str) -> str:
    """Summarize a Reddit post with the notebook's Llama 3.1 chat model.

    Builds a system + user chat prompt asking for a neutral, conversational
    summary, samples a completion from the notebook-level ``model`` and returns
    only the newly generated text.

    Args:
        text: Post body to summarize. Non-string values are converted with
            ``str()`` (the same coercion the prompt f-string would apply).

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped.

    Notes:
        Uses the notebook globals ``tokenizer`` and ``model``. Sampling is on
        (``do_sample=True``) and no seed is set here, so repeated calls may
        return different summaries.
    """
    max_prompt_tokens = 32000
    # Headroom for the system prompt, the instructions and the chat-template
    # markers that get wrapped around the post.
    instruction_reserve = 512

    if not isinstance(text, str):
        text = str(text)

    # Clip over-long posts *before* templating. Truncating the finished prompt
    # from the right would drop the trailing assistant header, so the model
    # would continue the post instead of answering.
    post_budget = max_prompt_tokens - instruction_reserve
    post_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    if len(post_ids) > post_budget:
        text = tokenizer.decode(post_ids[:post_budget])

    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert at summarizing Reddit posts. Your goal is to provide a clear, "
                "neutral summary of the user's post while keeping the tone natural and conversational. "
                "Capture the key situation, context, motivations, and main concerns. Avoid adding "
                "opinions or judgments."
            ),
        },
        {
            "role": "user",
            "content": (
                "Summarize the following Reddit post in 4–6 sentences. "
                "Keep the summary concise, neutral, and easy to understand.\n\n"
                "Requirements:\n"
                "- Use plain, natural language\n"
                "- Reflect the poster's situation accurately\n"
                "- Include the key events, motivations, and concerns\n"
                "- No bullet points, no lists\n"
                "- No advising, judging, or moralizing\n"
                "- Do not copy text; fully rephrase everything\n\n"
                f"Reddit Post:\n{text}"
            ),
        },
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        # The rendered chat template already contains the special tokens
        # (including BOS); adding them again would duplicate BOS.
        add_special_tokens=False,
        # Safety net only; the post was already clipped above.
        truncation=True,
        max_length=max_prompt_tokens,
    ).to(model.device)

    # Stop on the end-of-turn token; fall back to the tokenizer's EOS id when
    # the lookup returns a falsy value.
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>") or tokenizer.eos_token_id

    outputs = model.generate(
        **inputs,
        max_new_tokens=800,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        eos_token_id=eot_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Skip the prompt positions so only the newly generated tokens are decoded.
    prompt_len = inputs["input_ids"].shape[1]
    gen_ids = outputs[0, prompt_len:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()





In [15]:
# Summarize every Reddit training post and store the results in one CSV file.
out_path = Path("train_reddit_summaries_llama31.csv")

# One record per post; doc_id is the post's position in the column.
rows = [
    {
        "doc_id": position,
        "text": post,
        "summary": summarize_reddit_llama3(post),
        "dataset": "reddit",
        "model": "meta-llama/Llama-3.1-8B-Instruct",
    }
    for position, post in enumerate(df_train_reddit_post["text"])
]

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary", "dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 5000 rows to train_reddit_summaries_llama31.csv


In [16]:
# Summarize every Reddit validation post and store the results in one CSV file.
out_path = Path("val_reddit_summaries_llama31.csv")

# One record per post; doc_id is the post's position in the column.
rows = [
    {
        "doc_id": position,
        "text": post,
        "summary": summarize_reddit_llama3(post),
        "dataset": "reddit",
        "model": "meta-llama/Llama-3.1-8B-Instruct",
    }
    for position, post in enumerate(df_val_reddit_post["text"])
]

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary", "dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 1000 rows to val_reddit_summaries_llama31.csv


In [17]:
# Summarize every Reddit test post and store the results in one CSV file.
out_path = Path("test_reddit_summaries_llama31.csv")

# One record per post; doc_id is the post's position in the column.
rows = [
    {
        "doc_id": position,
        "text": post,
        "summary": summarize_reddit_llama3(post),
        "dataset": "reddit",
        "model": "meta-llama/Llama-3.1-8B-Instruct",
    }
    for position, post in enumerate(df_test_reddit_post["text"])
]

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary", "dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 1000 rows to test_reddit_summaries_llama31.csv


In [None]:
def summarize_email_llama3(text: str) -> str:
    """Summarize a business e-mail with the notebook's Llama 3.1 chat model.

    Builds a system + user chat prompt asking for a concise, professional
    summary without introductory phrases, samples a completion from the
    notebook-level ``model`` and returns only the newly generated text.

    Args:
        text: E-mail content to summarize. Non-string values are converted with
            ``str()`` (the same coercion the prompt f-string would apply).

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped.

    Notes:
        Uses the notebook globals ``tokenizer`` and ``model``. Sampling is on
        (``do_sample=True``) and no seed is set here, so repeated calls may
        return different summaries.
    """
    max_prompt_tokens = 32000
    # Headroom for the system prompt, the instructions and the chat-template
    # markers that get wrapped around the e-mail.
    instruction_reserve = 512

    if not isinstance(text, str):
        text = str(text)

    # Clip over-long e-mails *before* templating. Truncating the finished
    # prompt from the right would drop the trailing assistant header, so the
    # model would continue the e-mail instead of answering.
    email_budget = max_prompt_tokens - instruction_reserve
    email_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    if len(email_ids) > email_budget:
        text = tokenizer.decode(email_ids[:email_budget])

    messages = [
        {
            "role": "system",
            "content": (
                "You summarize business emails in a concise, neutral, and professional tone. "
                "Your summaries must begin immediately with the content, without any introductory phrases, "
                "explanations, or meta-commentary."
            ),
        },
        {
            "role": "user",
            "content": (
                "Write a summary of the following email in 3–5 complete sentences.\n\n"
                "Requirements:\n"
                "- Begin the summary immediately with the content; do NOT write phrases such as "
                "  'Here is a summary', 'The email says', or 'This message discusses'\n"
                "- Professional tone\n"
                "- Capture the core purpose, key points, and any actions or decisions\n"
                "- No bullet points or lists\n"
                "- No greetings or signatures\n"
                "- No quoting, copying, or references to the fact that this is an email\n"
                "- Ignore headers, footers, and forward chains unless essential\n\n"
                f"Email:\n{text}"
            ),
        },
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        # The rendered chat template already contains the special tokens
        # (including BOS); adding them again would duplicate BOS.
        add_special_tokens=False,
        # Safety net only; the e-mail was already clipped above.
        truncation=True,
        max_length=max_prompt_tokens,
    ).to(model.device)

    # Stop on the end-of-turn token; fall back to the tokenizer's EOS id when
    # the lookup returns a falsy value.
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>") or tokenizer.eos_token_id

    outputs = model.generate(
        **inputs,
        max_new_tokens=800,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        eos_token_id=eot_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Skip the prompt positions so only the newly generated tokens are decoded.
    prompt_len = inputs["input_ids"].shape[1]
    gen_ids = outputs[0, prompt_len:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()





In [13]:
# Summarize every training e-mail and store the results in one CSV file.
out_path = Path("train_email_summaries_llama31.csv")

# One record per e-mail; doc_id is the e-mail's position in the column.
rows = [
    {
        "doc_id": position,
        "text": email,
        "summary": summarize_email_llama3(email),
        "dataset": "email",
        "model": "meta-llama/Llama-3.1-8B-Instruct",
    }
    for position, email in enumerate(df_train_email["text"])
]

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary", "dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 5000 rows to train_email_summaries_llama31.csv


In [12]:
# Summarize every validation e-mail and store the results in one CSV file.
out_path = Path("val_email_summaries_llama31.csv")

# One record per e-mail; doc_id is the e-mail's position in the column.
rows = [
    {
        "doc_id": position,
        "text": email,
        "summary": summarize_email_llama3(email),
        "dataset": "email",
        "model": "meta-llama/Llama-3.1-8B-Instruct",
    }
    for position, email in enumerate(df_val_email["text"])
]

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary", "dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 1000 rows to val_email_summaries_llama31.csv


In [13]:
# Summarize every test e-mail and store the results in one CSV file.
out_path = Path("test_email_summaries_llama31.csv")

# One record per e-mail; doc_id is the e-mail's position in the column.
rows = [
    {
        "doc_id": position,
        "text": email,
        "summary": summarize_email_llama3(email),
        "dataset": "email",
        "model": "meta-llama/Llama-3.1-8B-Instruct",
    }
    for position, email in enumerate(df_test_email["text"])
]

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary", "dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 1000 rows to test_email_summaries_llama31.csv
