In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
from pathlib import Path

In [None]:
df_train_cnn_article = pd.read_parquet("../Article/cnn_daily_news/cnn_data/train_article_cnn_5000.parquet")
df_val_cnn_article = pd.read_parquet("../Article/cnn_daily_news/cnn_data/val_article_cnn_1000.parquet")
df_test_cnn_article = pd.read_parquet("../Article/cnn_daily_news/cnn_data/test_article_cnn_1000.parquet")
df_train_arxiv_paper = pd.read_parquet("../arxiv_research/arxiv_data/train_article_arxiv_5000.parquet")
df_val_arxiv_paper = pd.read_parquet("../arxiv_research/arxiv_data/val_article_arxiv_1000.parquet")
df_test_arxiv_paper = pd.read_parquet("../arxiv_research/arxiv_data/test_article_arxiv_1000.parquet")
df_train_email = pd.read_parquet("../enron_email/enron_train_5000.parquet")
df_val_email = pd.read_parquet("../enron_email/enron_val_1000.parquet")
df_test_email = pd.read_parquet("../enron_email/enron_test_1000.parquet")
df_train_reddit_post = pd.read_parquet("../reddit/reddit_train_5000.parquet")
df_val_reddit_post = pd.read_parquet("../reddit/reddit_val_1000.parquet")
df_test_reddit_post = pd.read_parquet("../reddit/reddit_test_1000.parquet")

In [7]:

model_id = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    cache_dir="./mistral_model"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def summarize_mistral_article(text: str) -> str:

    messages = [
    {
        "role": "system",
        "content": (
            "You are an expert news editor. Your goal is to produce a clear, objective, "
            "journalistic-style briefing. You must strictly follow formatting rules."
        )
    },
    {
        "role": "user",
        "content": (
            "Summarize the following article in EXACTLY six to seven complete sentences. "
            "This is mandatory. Do not generate fewer or more than seven sentences.\n\n"
            "Requirements:\n"
            "- Journalistic briefing tone (Inverted Pyramid style)\n" 
            "- No bullet points, no lists\n"
            "- No citations or references\n"
            "- No markdown formatting\n"
            "- Do not copy any sentence from the article; rephrase everything\n\n"
            f"Article:\n{text}"
        )
    }
]



    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=32000,
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=800,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    gen_ids = outputs[0, inputs["input_ids"].shape[1]:]
    summary = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    del inputs, outputs, gen_ids
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  
        
    return summary





In [None]:


out_path = Path("train_article_cnn_summaries_mistral_v03_v2.csv")
rows = []

for i, text in enumerate(df_train_cnn_article["article"]):
    if( i % 100 ==0):
        print(i)
    summary = summarize_mistral_article(text)   
    rows.append({
        "doc_id": i,                   
        "text": text,
        "summary": summary,
        "dataset": "cnn_daily_news",
        "model": "mistralai/Mistral-7B-Instruct-v0.3"
    })

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary","dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

0


In [None]:
out_path = Path("val_article_cnn_summaries_mistral_v03_v2.csv")
rows = []

for i, text in enumerate(df_val_cnn_article["article"]):
    if( i % 100 ==0):
        print(i)
    summary = summarize_mistral_article(text)   
    rows.append({
        "doc_id": i,                   
        "text": text,
        "summary": summary,
        "dataset": "cnn_daily_news",
        "model": "mistralai/Mistral-7B-Instruct-v0.3"
    })

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary","dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

In [None]:
out_path = Path("test_article_cnn_summaries_mistral_v03_v2.csv")
rows = []

for i, text in enumerate(df_test_cnn_article["article"]):
    if( i % 100 ==0):
        print(i)
    summary = summarize_mistral_article(text)   
    rows.append({
        "doc_id": i,                   
        "text": text,
        "summary": summary,
        "dataset": "cnn_daily_news",
        "model": "mistralai/Mistral-7B-Instruct-v0.3"
    })

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary","dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

In [None]:
def summarize_mistral_research(text: str) -> str:


    messages = [
    {
        "role": "system",
        "content": (
            "You are an expert summarizer. Your goal is to produce a clear, factual, "
            "abstract-style summary. You must strictly follow formatting rules."
        )
    },
    {
        "role": "user",
        "content": (
            "Summarize the following article in EXACTLY six to seven complete sentences. "
            "This is mandatory. Do not generate fewer or more than seven sentences.\n\n"
            "Requirements:\n"
            "- Abstract-style academic writing\n"
            "- No bullet points, no lists\n"
            "- No citations or references\n"
            "- No markdown formatting\n"
            "- Do not copy any sentence from the article; rephrase everything\n\n"
            f"Article:\n{text}"
        )
    }
]



    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=32000,
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=800,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    gen_ids = outputs[0, inputs["input_ids"].shape[1]:]
    summary = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
    return summary






In [None]:


out_path = Path("train_article_arxiv_summaries_mistral_v03.csv")
rows = []

for i, text in enumerate(df_train_arxiv_paper["article"]):
    if( i % 100 ==0):
        print(i)
    summary = summarize_mistral_research(text)   
    rows.append({
        "doc_id": i,                   
        "text": text,
        "summary": summary,
        "dataset": "arxiv",
        "model": "mistralai/Mistral-7B-Instruct-v0.3"
    })

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary","dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

In [27]:
out_path = Path("val_article_arxiv_summaries_mistral_v03.csv")
rows = []

for i, text in enumerate(df_val_arxiv_paper["article"]):
    if( i % 100 ==0):
        print(i)
    summary = summarize_mistral_research(text)   
    rows.append({
        "doc_id": i,                   
        "text": text,
        "summary": summary,
        "dataset": "arxiv",
        "model": "mistralai/Mistral-7B-Instruct-v0.3"
    })

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary","dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (32768). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Saved 1000 rows to val_article_arxiv_summaries_mistral_v03.csv


In [28]:
out_path = Path("test_article_arxiv_summaries_mistral_v03.csv")
rows = []

for i, text in enumerate(df_test_arxiv_paper["article"]):
    if( i % 100 ==0):
        print(i)
    summary = summarize_mistral_research(text)   
    rows.append({
        "doc_id": i,                   
        "text": text,
        "summary": summary,
        "dataset": "arxiv",
        "model": "mistralai/Mistral-7B-Instruct-v0.3"
    })

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary","dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 1000 rows to test_article_arxiv_summaries_mistral_v03.csv


In [None]:
def summarize_reddit_mistral(text: str) -> str:

    messages = [
    {
        "role": "system",
        "content": (
            "You are an expert at summarizing Reddit posts. Your goal is to provide a clear, "
            "neutral summary of the user's post while keeping the tone natural and conversational. "
            "Capture the key situation, context, motivations, and main concerns. Avoid adding "
            "opinions or judgments."
        )
    },
    {
        "role": "user",
        "content": (
            "Summarize the following Reddit post in 4–6 sentences. "
            "Keep the summary concise, neutral, and easy to understand.\n\n"
            "Requirements:\n"
            "- Use plain, natural language\n"
            "- Reflect the poster's situation accurately\n"
            "- Include the key events, motivations, and concerns\n"
            "- No bullet points, no lists\n"
            "- No advising, judging, or moralizing\n"
            "- Do not copy text; fully rephrase everything\n\n"
            f"Reddit Post:\n{text}"
        )
    }

]



    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=32000,
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=800,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    gen_ids = outputs[0, inputs["input_ids"].shape[1]:]
    summary = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
    return summary





In [19]:
out_path = Path("train_reddit_summaries_mistral_v03.csv")
rows = []

for i, text in enumerate(df_train_reddit_post["text"]):
    if( i % 100 ==0):
        print(i)
    summary = summarize_reddit_mistral(text)   
    rows.append({
        "doc_id": i,                   
        "text": text,
        "summary": summary,
        "dataset": "reddit",
        "model": "mistralai/Mistral-7B-Instruct-v0.3"
    })

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary","dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 5000 rows to train_reddit_summaries_mistral_v03.csv


In [29]:
out_path = Path("val_reddit_summaries_mistral_v03.csv")
rows = []

for i, text in enumerate(df_val_reddit_post["text"]):
    if( i % 100 ==0):
        print(i)
    summary = summarize_reddit_mistral(text)   
    rows.append({
        "doc_id": i,                   
        "text": text,
        "summary": summary,
        "dataset": "reddit",
        "model": "mistralai/Mistral-7B-Instruct-v0.3"
    })

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary","dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 1000 rows to val_reddit_summaries_mistral_v03.csv


In [30]:
out_path = Path("test_reddit_summaries_mistral_v03.csv")
rows = []

for i, text in enumerate(df_test_reddit_post["text"]):
    if( i % 100 ==0):
        print(i)
    summary = summarize_reddit_mistral(text)   
    rows.append({
        "doc_id": i,                   
        "text": text,
        "summary": summary,
        "dataset": "reddit",
        "model": "mistralai/Mistral-7B-Instruct-v0.3"
    })

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary","dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 1000 rows to test_reddit_summaries_mistral_v03.csv


In [None]:
def summarize_email_mistral(text: str) -> str:\

    messages = [
        {
            "role": "system",
            "content": (
                "You summarize business emails in a concise, neutral, and professional tone. "
                "Your summaries must begin immediately with the content, without any introductory phrases, "
                "explanations, or meta-commentary."
            )
        },
        {
            "role": "user",
            "content": (
                "Write a summary of the following email in 3–5 complete sentences.\n\n"
                "Requirements:\n"
                "- Begin the summary immediately with the content; do NOT write phrases such as "
                "  'Here is a summary', 'The email says', or 'This message discusses'\n"
                "- Professional tone\n"
                "- Capture the core purpose, key points, and any actions or decisions\n"
                "- No bullet points or lists\n"
                "- No greetings or signatures\n"
                "- No quoting, copying, or references to the fact that this is an email\n"
                "- Ignore headers, footers, and forward chains unless essential\n\n"
                f"Email:\n{text}"
            )
        }

]



    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=32000,
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=800,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    gen_ids = outputs[0, inputs["input_ids"].shape[1]:]
    summary = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
    return summary





In [18]:
out_path = Path("train_email_summaries_mistral_v03.csv")
rows = []

for i, text in enumerate(df_train_email["text"]):
    if( i % 100 ==0):
        print(i)
    summary = summarize_email_mistral(text)   
    rows.append({
        "doc_id": i,                   
        "text": text,
        "summary": summary,
        "dataset": "email",
        "model": "mistralai/Mistral-7B-Instruct-v0.3"
    })

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary","dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 5000 rows to train_email_summaries_mistral_v03.csv


In [31]:
out_path = Path("val_email_summaries_mistral_v03.csv")
rows = []

for i, text in enumerate(df_val_email["text"]):
    if( i % 100 ==0):
        print(i)
    summary = summarize_email_mistral(text)   
    rows.append({
        "doc_id": i,                   
        "text": text,
        "summary": summary,
        "dataset": "email",
        "model": "mistralai/Mistral-7B-Instruct-v0.3"
    })

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary","dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

Saved 1000 rows to val_email_summaries_mistral_v03.csv


In [17]:
out_path = Path("test_email_summaries_mistral_v03.csv")
rows = []

for i, text in enumerate(df_test_email["text"]):
    if( i % 100 ==0):
        print(i)
    summary = summarize_email_mistral(text)   
    rows.append({
        "doc_id": i,                   
        "text": text,
        "summary": summary,
        "dataset": "email",
        "model": "mistralai/Mistral-7B-Instruct-v0.3"
    })

df_out = pd.DataFrame(rows, columns=["doc_id", "text", "summary","dataset", "model"])
df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows to {out_path}")

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (32768). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Saved 1000 rows to test_email_summaries_mistral_v03.csv
