In [1]:
"""
basic requirements for project
includes file reading
"""

import pandas as pd
# Load the news dataset (read as latin1, like the other cells in this notebook).
data_own_model = pd.read_csv(r"C:\Users\Zhun Kai\Downloads\archive\news_summary.csv", encoding='latin1')

# Normalise the columns used downstream: missing values become '' and every value is a str.
data_own_model['text'] = data_own_model['text'].fillna('').astype(str)
# Clean 'headlines' from its own values; it was previously overwritten with the 'ctext' column.
data_own_model['headlines'] = data_own_model['headlines'].fillna('').astype(str)

import pandas as pd
from rouge_score import rouge_scorer
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
"""Custom version 1, TF-IDF alone
"""

# nltk.download('punkt')
# nltk.download('stopwords')

# Text preprocessing
def preprocess_text(text):
    """Clean a document and split it into stopword-free sentences.

    The text is lowercased, every character other than word characters,
    whitespace, periods and hyphens is removed, and the result is split into
    sentences with NLTK. English stopwords are then dropped from each sentence.

    Args:
        text: the raw document string.

    Returns:
        A list with one space-joined string per detected sentence.
    """
    lowered = text.lower()
    stripped = re.sub(r'[^\w\s\.\-]', '', lowered)
    raw_sentences = sent_tokenize(stripped)
    stop_set = set(stopwords.words('english'))

    cleaned = []
    for sentence in raw_sentences:
        kept_words = [word for word in sentence.split() if word not in stop_set]
        cleaned.append(" ".join(kept_words))
    return cleaned

def score_sentences(sentences):
    """Score each sentence by the sum of its TF-IDF term weights.

    A fresh TfidfVectorizer (English stop words removed) is fitted on the
    given sentences only, so weights are relative to this one document.

    Args:
        sentences: list of sentence strings.

    Returns:
        1-D numpy array with one score per sentence; an empty array when
        `sentences` is empty.
    """
    # Fitting a vectorizer on zero documents raises, so short-circuit here.
    if len(sentences) == 0:
        return np.zeros(0)

    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(sentences)

    # Row sums of the sentence-by-term matrix, flattened to a 1-D array.
    return tfidf_matrix.sum(axis=1).A1

# Extract top sentences based on scores
def extract_summary(sentences, sentence_scores, n=3):
    """Join the `n` highest-scoring sentences, highest score first.

    Args:
        sentences: list of sentence strings.
        sentence_scores: one numeric score per sentence, aligned with `sentences`.
        n: maximum number of sentences to keep.

    Returns:
        The selected sentences joined by single spaces; '' when `n` is not
        positive or there are no sentences.
    """
    # A slice of `[-0:]` would select every sentence, so handle n <= 0 explicitly.
    if n <= 0:
        return ''
    top_indices = np.argsort(sentence_scores)[-n:][::-1]  # Top n scores, best first
    return ' '.join(sentences[i] for i in top_indices)

# Summarize document
def summarize_document(document, n=3):
    """Build an extractive summary of `document` from at most `n` sentences.

    The document is preprocessed into sentences, each sentence is scored, and
    the best-scoring ones are joined into the returned string.
    """
    cleaned_sentences = preprocess_text(document)
    scores = score_sentences(cleaned_sentences)
    summary = extract_summary(cleaned_sentences, scores, n)
    return summary

# ROUGE evaluation
def evaluate_summaries(reference_summaries, generated_summaries):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over paired texts.

    Args:
        reference_summaries: iterable of reference strings.
        generated_summaries: iterable of generated strings, paired with the
            references by position (extra items on either side are ignored
            by `zip`).

    Returns:
        Dict with keys 'rouge1', 'rouge2', 'rougeL' mapping to the mean
        F-measure; every value is 0.0 when there are no pairs.
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    results = {metric: [] for metric in metrics}

    for ref, gen in zip(reference_summaries, generated_summaries):
        scores = scorer.score(ref, gen)
        for metric in metrics:
            results[metric].append(scores[metric].fmeasure)

    # An empty input used to raise ZeroDivisionError; report 0.0 instead.
    return {
        metric: (sum(values) / len(values) if values else 0.0)
        for metric, values in results.items()
    }

# Process dataset and summarize
def process_dataset(data):
    """Summarise the 'text' of every row and collect the matching 'headlines'.

    Args:
        data: DataFrame with 'text' and 'headlines' columns.

    Returns:
        Tuple (generated_summaries, reference_summaries): two lists in row order.
    """
    generated_summaries, reference_summaries = [], []

    progress = tqdm(data.iterrows(), total=len(data), desc="Processing Data")
    for _, record in progress:
        source_text = record['text']
        headline = record['headlines']

        # Summaries are limited to two sentences in this experiment.
        generated_summaries.append(summarize_document(source_text, n=2))
        reference_summaries.append(headline)

    return generated_summaries, reference_summaries

# Process the dataset
# Summarise every row; the headline references are returned alongside the summaries.
generated_summaries, reference_summaries = process_dataset(data_own_model)

# Evaluate the generated summaries with ROUGE
# NOTE(review): the reference passed here is the source 'text' column, not
# `reference_summaries` (which is left unused) — confirm this comparison is intended.
rouge_scores = evaluate_summaries(data_own_model['text'], generated_summaries)

# Print the average ROUGE F-measures to four decimals
print("Average ROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

Processing Data: 100%|████████████████████████████████████████████████████████████| 4514/4514 [00:08<00:00, 553.19it/s]


Average ROUGE Scores:
ROUGE-1: 0.6662
ROUGE-2: 0.3505
ROUGE-L: 0.6060


In [4]:
"""Custom version 2, TF-IDF + cosine similarity"""

# Text preprocessing
def preprocess_text(text):
    """Return the document as a list of cleaned, stopword-free sentences.

    Steps: lowercase; delete every character that is not a word character,
    whitespace, a period or a hyphen; split into sentences with NLTK; remove
    English stopwords from each sentence.
    """
    normalized = re.sub(r'[^\w\s\.\-]', '', text.lower())
    sentence_list = sent_tokenize(normalized)
    english_stopwords = set(stopwords.words('english'))

    def drop_stopwords(sentence):
        # Keep the remaining tokens in their original order, single-space joined.
        return " ".join(token for token in sentence.split() if token not in english_stopwords)

    return [drop_stopwords(sentence) for sentence in sentence_list]

# Score sentences based on TF-IDF
def score_sentences(sentences):
    """Fit TF-IDF on the sentences and score each one by its summed weights.

    Returns:
        Tuple (tfidf_matrix, sentence_scores): the fitted sentence-by-term
        matrix and a 1-D array holding the row sum for each sentence.
    """
    tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(sentences)
    row_totals = tfidf_matrix.sum(axis=1)
    return tfidf_matrix, row_totals.A1

# Extract top sentences based on scores and similarity
def extract_summary(sentences, sentence_scores, tfidf_matrix, n=3):
    """Pick the `n` best-scoring sentences and order them by similarity.

    The top `n` sentences by score are selected, then ordered by their mean
    cosine similarity to all sentences in the document (most similar first).
    The similarity step only reorders the selection; it does not change which
    sentences are chosen.

    Args:
        sentences: list of sentence strings.
        sentence_scores: one numeric score per sentence.
        tfidf_matrix: sentence-by-term matrix aligned with `sentences`.
        n: maximum number of sentences to keep.

    Returns:
        The chosen sentences joined by single spaces; '' when `n` is not
        positive or there are no sentences.
    """
    # `[-0:]` would select every sentence, and no similarity matrix can be
    # built from zero sentences, so return an empty summary in both cases.
    if n <= 0 or len(sentences) == 0:
        return ''

    top_indices = np.argsort(sentence_scores)[-n:][::-1]
    selected_sentences = [sentences[i] for i in top_indices]

    cosine_sim = cosine_similarity(tfidf_matrix)
    # One mean per selected sentence: its average similarity to every sentence.
    avg_similarities = np.mean(cosine_sim[top_indices], axis=1)
    # Positions within `selected_sentences`, most similar first.
    refined_indices = np.argsort(avg_similarities)[-n:][::-1]

    return ' '.join(selected_sentences[i] for i in refined_indices)

# Summarize document
def summarize_document(document, n=3):
    """Summarise `document` with at most `n` sentences (TF-IDF + cosine ordering)."""
    cleaned_sentences = preprocess_text(document)
    matrix, scores = score_sentences(cleaned_sentences)
    return extract_summary(cleaned_sentences, scores, matrix, n)

# ROUGE evaluation
def evaluate_summaries(reference_summaries, generated_summaries):
    """Compute mean ROUGE-1/2/L F-measures for position-paired texts.

    Args:
        reference_summaries: iterable of reference strings.
        generated_summaries: iterable of generated strings.

    Returns:
        Dict keyed by 'rouge1', 'rouge2', 'rougeL' with the mean F-measure,
        or 0.0 for each metric when no pairs were supplied.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    results = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    for ref, gen in zip(reference_summaries, generated_summaries):
        scores = scorer.score(ref, gen)
        for metric, collected in results.items():
            collected.append(scores[metric].fmeasure)

    avg_scores = {}
    for metric, values in results.items():
        # Avoid ZeroDivisionError when the inputs are empty.
        avg_scores[metric] = sum(values) / len(values) if values else 0.0
    return avg_scores

# Process dataset and summarize with progress bar
def process_dataset(data):
    """Return a list with a two-sentence summary for each row's 'text'.

    Rows are visited in DataFrame order while a tqdm progress bar is shown.
    """
    rows = tqdm(data.iterrows(), total=len(data), desc="Processing Data")
    return [summarize_document(record['text'], n=2) for _, record in rows]

# Load the dataset
# Re-read the raw CSV and keep only the two columns used here, dropping rows
# where either is missing. This rebinds `data_own_model` from the first cell.
data_path = r"C:\Users\Zhun Kai\Downloads\archive\news_summary.csv"
data_own_model = pd.read_csv(data_path, encoding='latin1')
data_own_model = data_own_model[['text', 'headlines']].dropna()

# Process the dataset and add summaries to a new column
data_own_model['generated_summary'] = process_dataset(data_own_model)

# Evaluate summaries using ROUGE
# NOTE(review): the reference is the source 'text' column, not 'headlines' — confirm intended.
rouge_scores = evaluate_summaries(data_own_model['text'], data_own_model['generated_summary'])

# Print ROUGE scores
print("Average ROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

# Save the updated dataset to a CSV file (no explicit encoding is passed to to_csv)
output_path = r"C:\Users\Zhun Kai\Downloads\summarized_news.csv"
data_own_model.to_csv(output_path, index=False)
print(f"Summarized data saved to {output_path}")

Processing Data: 100%|████████████████████████████████████████████████████████████| 4514/4514 [00:09<00:00, 472.60it/s]


Average ROUGE Scores:
ROUGE-1: 0.6662
ROUGE-2: 0.3490
ROUGE-L: 0.5481
Summarized data saved to C:\Users\Zhun Kai\Downloads\summarized_news.csv


In [None]:
"""Pretrained model
"""

# Load the summarization model
# NOTE(review): `pipeline` is not imported in any cell visible here; presumably it is
# `transformers.pipeline` — confirm and add the import to the imports cell, otherwise
# this line raises NameError on a fresh kernel.
pre_trained_summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=0)  # -1 for CPU, 0 for GPU

# Function to evaluate ROUGE scores
def evaluate_summaries(reference_summaries, generated_summaries):
    """Return the mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    References and generated summaries are paired by position.

    Args:
        reference_summaries: iterable of reference strings.
        generated_summaries: iterable of generated strings.

    Returns:
        Dict with keys 'rouge1', 'rouge2', 'rougeL'; each value is the mean
        F-measure over all pairs, or 0.0 when there are no pairs.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    results = {name: [] for name in metric_names}

    for ref, gen in zip(reference_summaries, generated_summaries):
        scores = scorer.score(ref, gen)
        for name in metric_names:
            results[name].append(scores[name].fmeasure)

    # Compute averages; empty inputs yield 0.0 rather than ZeroDivisionError.
    avg_scores = {
        name: (sum(values) / len(values)) if values else 0.0
        for name, values in results.items()
    }
    return avg_scores

# Process dataset and generate summaries with progress bar
def process_dataset(data):
    """Summarise each row's 'text' with the pretrained summarization pipeline.

    Args:
        data: DataFrame with 'text' and 'headlines' columns.

    Returns:
        Tuple (generated_summaries, reference_summaries): two lists in row order.
    """
    generated_summaries = []
    reference_summaries = []

    progress = tqdm(data.iterrows(), total=len(data), desc="Processing Data")
    for _, record in progress:
        source_text, headline = record['text'], record['headlines']

        # The call result is indexed as a list of dicts; the text is under 'summary_text'.
        outputs = pre_trained_summarizer(source_text, max_length=150, min_length=50, do_sample=False)
        generated_summaries.append(outputs[0]['summary_text'])
        reference_summaries.append(headline)

    return generated_summaries, reference_summaries

# Process the dataset and get summaries
# Run the pretrained model over every row; `reference_summaries` holds the headlines.
generated_summaries, reference_summaries = process_dataset(data_own_model)

# Evaluate the generated summaries with ROUGE against headlines
rouge_scores_headlines = evaluate_summaries(data_own_model['headlines'], generated_summaries)
print("ROUGE Scores (Generated Summary vs Headline):")
print(f"ROUGE-1: {rouge_scores_headlines['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores_headlines['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores_headlines['rougeL']:.4f}")

# Evaluate the generated summaries with ROUGE against original text
# (the 'text' column is the same input that was summarised)
rouge_scores_text = evaluate_summaries(data_own_model['text'], generated_summaries)
print("\nROUGE Scores (Generated Summary vs Original Text):")
print(f"ROUGE-1: {rouge_scores_text['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores_text['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores_text['rougeL']:.4f}")

In [5]:
"""
Current hyperparameter in custom model includes:
1. TF-IDF Vectorizer:
    a. max_features: The number of features to keep based on frequency.
    b. stop_words: To remove common words that do not carry much information.
    c. ngram_range: Consider adjusting the range of n-grams to include bigrams or trigrams.
    d. min_df and max_df: These control the inclusion of words that appear too rarely or too frequently across the corpus.
2. Summarization Model (Custom) Parameters:
    a. n: The number of sentences in the summary. This is a key hyperparameter to adjust based on the length of the summary you want.
3. Cosine Similarity:
    a. weight of sentence similarity during the refinement of the summary.
"""

'\nCurrent hyperparameter in custom model includes:\n1. TF-IDF Vectorizer:\n    a. max_features: The number of features to keep based on frequency.\n    b. stop_words: To remove common words that do not carry much information.\n    c. ngram_range: Consider adjusting the range of n-grams to include bigrams or trigrams.\n    d. min_df and max_df: These control the inclusion of words that appear too rarely or too frequently across the corpus.\n2. Summarization Model (Custom) Parameters:\n    a. n: The number of sentences in the summary. This is a key hyperparameter to adjust based on the length of the summary you want.\n3. Cosine Similarity:\n    a. weight of sentence similarity during the refinement of the summary.\n'

In [6]:
"""
splitting data
"""

# Split into train (80%), validation (10%), and test (10%)
# First hold out 20% of the rows, then halve that hold-out into validation and test.
# The fixed random_state makes both splits repeatable.
train_data, temp_data = train_test_split(data_own_model, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Check the sizes of each set
print("Training Set:", len(train_data))
print("Validation Set:", len(val_data))
print("Test Set:", len(test_data))

Training Set: 3611
Validation Set: 451
Test Set: 452


In [7]:
"""
fine tuning TF-IDF
"""

# Load the dataset
# Read back the CSV of texts and model-generated summaries, keeping complete rows only.
# NOTE(review): an earlier cell writes this path with `to_csv` and no explicit encoding,
# while it is read here as latin1 — confirm the two encodings match.
data_path = r"C:\Users\Zhun Kai\Downloads\summarized_news.csv"
data_own_model_fine_tuning = pd.read_csv(data_path, encoding='latin1')
data_own_model_fine_tuning = data_own_model_fine_tuning[['text', 'generated_summary']].dropna()

# Split data into training and testing sets (80/20, fixed seed).
# The "summaries" here are the 'generated_summary' column, i.e. this model's own output.
train_texts, test_texts, train_summaries, test_summaries = train_test_split(
    data_own_model_fine_tuning['text'], data_own_model_fine_tuning['generated_summary'], test_size=0.2, random_state=42
)

# Define the parameter grid for TF-IDF; every combination of these values is tried.
param_grid = {
    'max_features': [500, 1000, 2000],
    'ngram_range': [(1, 1), (1, 2)],
    'stop_words': [None, 'english']
}

# Function to fine-tune TF-IDF using cosine similarity
def fine_tune_tfidf(train_texts, train_summaries, param_grid):
    """Grid-search TF-IDF settings by mean text-to-summary cosine similarity.

    For every combination of 'max_features', 'ngram_range' and 'stop_words'
    in `param_grid`, a TfidfVectorizer is fitted on `train_texts`, both texts
    and summaries are vectorised with it, and the mean similarity between
    each text and its own summary is computed. Progress is printed.

    Args:
        train_texts: source documents.
        train_summaries: summaries paired with `train_texts` by position.
        param_grid: dict with the three keys above, each mapping to candidates.

    Returns:
        Tuple (best_vectorizer, best_params, best_score); (None, None, -1)
        if no combination was evaluated.
    """
    best_vectorizer, best_params, best_score = None, None, -1

    # Same visiting order as three nested loops over the grid keys.
    combinations = (
        (max_features, ngram_range, stop_words)
        for max_features in param_grid['max_features']
        for ngram_range in param_grid['ngram_range']
        for stop_words in param_grid['stop_words']
    )

    for max_features, ngram_range, stop_words in combinations:
        print(f"Testing parameters: max_features={max_features}, ngram_range={ngram_range}, stop_words={stop_words}")

        candidate = TfidfVectorizer(
            max_features=max_features,
            ngram_range=ngram_range,
            stop_words=stop_words,
        )

        # Vocabulary comes from the texts only; summaries are projected onto it.
        text_vectors = candidate.fit_transform(train_texts)
        summary_vectors = candidate.transform(train_summaries)

        # The diagonal of the pairwise matrix pairs each text with its own summary.
        pairwise = cosine_similarity(text_vectors, summary_vectors)
        mean_similarity = np.mean(np.diag(pairwise))
        print(f"Mean Cosine Similarity: {mean_similarity:.4f}")

        if mean_similarity > best_score:
            best_score = mean_similarity
            best_params = {'max_features': max_features, 'ngram_range': ngram_range, 'stop_words': stop_words}
            best_vectorizer = candidate

    print("Best Parameters:", best_params)
    print("Best Mean Cosine Similarity:", best_score)
    return best_vectorizer, best_params, best_score

# Fine-tune TF-IDF using the function
# Run the grid search on the training split; `best_vectorizer` is reused by the next cell.
best_vectorizer, best_params, best_score = fine_tune_tfidf(train_texts, train_summaries, param_grid)

# Display the best configuration
print("Best TF-IDF Vectorizer Configuration:", best_params)
print("Best Cosine Similarity:", best_score)

Testing parameters: max_features=500, ngram_range=(1, 1), stop_words=None
Mean Cosine Similarity: 0.7361
Testing parameters: max_features=500, ngram_range=(1, 1), stop_words=english
Mean Cosine Similarity: 0.9184
Testing parameters: max_features=500, ngram_range=(1, 2), stop_words=None
Mean Cosine Similarity: 0.6576
Testing parameters: max_features=500, ngram_range=(1, 2), stop_words=english
Mean Cosine Similarity: 0.9177
Testing parameters: max_features=1000, ngram_range=(1, 1), stop_words=None
Mean Cosine Similarity: 0.7940
Testing parameters: max_features=1000, ngram_range=(1, 1), stop_words=english
Mean Cosine Similarity: 0.9199
Testing parameters: max_features=1000, ngram_range=(1, 2), stop_words=None
Mean Cosine Similarity: 0.7103
Testing parameters: max_features=1000, ngram_range=(1, 2), stop_words=english
Mean Cosine Similarity: 0.9199
Testing parameters: max_features=2000, ngram_range=(1, 1), stop_words=None
Mean Cosine Similarity: 0.8307
Testing parameters: max_features=2000,

In [8]:
"""
checking feature against test data to confirm reliability
"""

# Transform the test data using the best TF-IDF vectorizer
# Vectorise the held-out texts and summaries with the vectorizer chosen by the grid
# search (transform only, so the vocabulary fitted on the training texts is reused).
tfidf_test_texts = best_vectorizer.transform(test_texts)
tfidf_test_summaries = best_vectorizer.transform(test_summaries)

# Compute cosine similarities on the test set
test_similarities = cosine_similarity(tfidf_test_texts, tfidf_test_summaries)

# Calculate the mean diagonal similarity for the test set
# (the diagonal pairs each text with its own summary)
mean_test_similarity = np.mean(np.diag(test_similarities))
print(f"Mean Cosine Similarity on Test Data: {mean_test_similarity:.4f}")

Mean Cosine Similarity on Test Data: 0.9217


In [9]:
"""Custom model version 3, with fine tuned TF-IDF variables"""
import nltk

# Fetch the NLTK resources this cell relies on — presumably the sentence-tokenizer
# data for sent_tokenize and the stopword lists for stopwords.words('english').
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Text preprocessing
def preprocess_text(text):
    """Split a document into cleaned sentences with English stopwords removed.

    Args:
        text: the raw document string.

    Returns:
        List of strings, one per sentence, each holding the surviving tokens
        joined by single spaces.
    """
    # Lowercase, then keep only word characters, whitespace, periods and hyphens.
    cleaned_text = re.sub(r'[^\w\s\.\-]', '', text.lower())
    pieces = sent_tokenize(cleaned_text)
    ignored = set(stopwords.words('english'))

    return list(map(
        lambda piece: " ".join(tok for tok in piece.split() if tok not in ignored),
        pieces,
    ))

def score_sentences(sentences):
    """Score each sentence by the sum of its TF-IDF weights (uni- and bigrams).

    A fresh TfidfVectorizer with max_features=2000, ngram_range=(1, 2) and
    English stop words removed is fitted on the given sentences only.

    Args:
        sentences: list of sentence strings.

    Returns:
        1-D numpy array with one score per sentence; an empty array when
        `sentences` is empty.
    """
    # Fitting a vectorizer on zero documents raises, so short-circuit here.
    if len(sentences) == 0:
        return np.zeros(0)

    # Use the tuned TF-IDF parameters
    vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(sentences)

    # Row sums of the sentence-by-term matrix, flattened to a 1-D array.
    return tfidf_matrix.sum(axis=1).A1

# Extract top sentences based on scores
def extract_summary(sentences, sentence_scores, n=3):
    """Return the `n` top-scoring sentences joined by spaces, best first.

    Args:
        sentences: list of sentence strings.
        sentence_scores: numeric scores aligned with `sentences`.
        n: maximum number of sentences in the summary.

    Returns:
        The summary string; '' when `n` is not positive or nothing is available.
    """
    # Without this guard n == 0 slices as `[-0:]`, which keeps every sentence.
    if n <= 0:
        return ''
    ranked = np.argsort(sentence_scores)  # ascending by score
    top_indices = ranked[-n:][::-1]       # last n, flipped to descending
    return ' '.join(sentences[i] for i in top_indices)

# Summarize document
def summarize_document(document, n=3):
    """Summarise `document` using the tuned TF-IDF sentence scorer.

    Returns a string made of at most `n` preprocessed sentences.
    """
    doc_sentences = preprocess_text(document)
    return extract_summary(doc_sentences, score_sentences(doc_sentences), n)

# ROUGE evaluation
def evaluate_summaries(reference_summaries, generated_summaries):
    """Average the ROUGE-1/2/L F-measures over position-paired texts.

    Args:
        reference_summaries: iterable of reference strings.
        generated_summaries: iterable of generated strings.

    Returns:
        Dict with keys 'rouge1', 'rouge2', 'rougeL' holding the mean
        F-measure, or 0.0 for every metric when no pairs are given.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    results = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    for ref, gen in zip(reference_summaries, generated_summaries):
        scores = scorer.score(ref, gen)
        results['rouge1'].append(scores['rouge1'].fmeasure)
        results['rouge2'].append(scores['rouge2'].fmeasure)
        results['rougeL'].append(scores['rougeL'].fmeasure)

    def _mean(values):
        # Empty inputs produce 0.0 instead of raising ZeroDivisionError.
        return sum(values) / len(values) if values else 0.0

    return {metric: _mean(values) for metric, values in results.items()}

# Process dataset and summarize
def process_dataset(data):
    """Add a 'generated_summary' column to `data` and return the same frame.

    The column is first created filled with None, then each row's 'text' is
    summarised and written back by index label. `data` is modified in place.
    """
    data['generated_summary'] = None

    rows = tqdm(data.iterrows(), total=len(data), desc="Generating Summaries")
    for row_label, record in rows:
        data.at[row_label, 'generated_summary'] = summarize_document(record['text'])

    return data

# Process the dataset
# Summarise in place; process_dataset returns the same frame it was given, with the
# 'generated_summary' column (re)written.
data_own_model = process_dataset(data_own_model)

# Evaluate the generated summaries with ROUGE
# NOTE(review): the reference is the source 'text' column, not 'headlines' — confirm intended.
rouge_scores = evaluate_summaries(data_own_model['text'], data_own_model['generated_summary'])

# Print the average ROUGE scores
print("Average ROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

[nltk_data] Downloading package punkt to C:\Users\Zhun
[nltk_data]     Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Zhun
[nltk_data]     Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Zhun
[nltk_data]     Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Generating Summaries: 100%|███████████████████████████████████████████████████████| 4514/4514 [00:08<00:00, 548.85it/s]


Average ROUGE Scores:
ROUGE-1: 0.7436
ROUGE-2: 0.3822
ROUGE-L: 0.6386


In [10]:
"""
applying custom model on target data, checking reliability
"""

import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import joblib
import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# File paths (relative, so they resolve against the kernel's working directory)
TFIDF_VECTOR_PATH = "tfidf_vectorizer.joblib"  # where the fitted vectorizer is saved and loaded
SUMMARIZED_DATA_PATH = "summarized_news.csv"  # CSV output with the generated summaries

# Text preprocessing
def preprocess_text(text):
    """Turn a raw document into a list of stopword-filtered sentences.

    The text is lowercased and stripped of everything except word characters,
    whitespace, periods and hyphens before NLTK splits it into sentences.
    """
    text = re.sub(r'[^\w\s\.\-]', '', text.lower())
    sentences = sent_tokenize(text)
    stop_set = set(stopwords.words('english'))

    filtered_sentences = []
    for sentence in sentences:
        survivors = []
        for token in sentence.split():
            if token in stop_set:
                continue
            survivors.append(token)
        filtered_sentences.append(" ".join(survivors))
    return filtered_sentences

# Train and save TF-IDF vectorizer
def train_and_save_vectorizer(documents):
    """Fit a TF-IDF vectorizer on `documents`, save it, and return it.

    The vectorizer uses max_features=2000, ngram_range=(1, 2) and English
    stop words. It is written to TFIDF_VECTOR_PATH with joblib and a
    confirmation line is printed.
    """
    fitted = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')
    fitted.fit(documents)
    joblib.dump(fitted, TFIDF_VECTOR_PATH)
    print(f"TF-IDF vectorizer trained and saved to {TFIDF_VECTOR_PATH}")
    return fitted

# Load the saved TF-IDF vectorizer
def load_vectorizer(path):
    """Load and return the object stored at `path` with joblib.

    Any error raised by joblib.load (e.g. a missing file) propagates to the caller.
    """
    # joblib.load unpickles the file, so only point this at files you created yourself.
    loaded = joblib.load(path)
    return loaded

# Score sentences using TF-IDF
def score_sentences(sentences, vectorizer):
    """Score sentences with an already-fitted vectorizer.

    Args:
        sentences: list of sentence strings.
        vectorizer: fitted object whose `transform` returns a sentence-by-term matrix.

    Returns:
        1-D array with the row sum (total TF-IDF weight) for each sentence.
    """
    row_sums = vectorizer.transform(sentences).sum(axis=1)
    return row_sums.A1

# Extract the top n sentences for the summary
def extract_summary(sentences, sentence_scores, n=3):
    """Build a summary from the `n` sentences with the highest scores.

    Args:
        sentences: list of sentence strings.
        sentence_scores: numeric scores aligned with `sentences`.
        n: maximum number of sentences to include.

    Returns:
        The selected sentences, highest score first, joined by single spaces;
        '' when `n` is not positive or there are no sentences.
    """
    # n == 0 would otherwise slice as `[-0:]` and return the whole document.
    if n <= 0:
        return ''
    top_indices = np.argsort(sentence_scores)[-n:][::-1]  # Get indices of top n sentences
    return ' '.join(sentences[i] for i in top_indices)

# Summarize a single document
def summarize_document(document, vectorizer, n=3):
    """Summarise one document with a pre-fitted TF-IDF vectorizer.

    Returns a string of at most `n` preprocessed sentences, highest score first.
    """
    doc_sentences = preprocess_text(document)
    scores = score_sentences(doc_sentences, vectorizer)
    return extract_summary(doc_sentences, scores, n)

# Summarize a dataset
def process_dataset(data, vectorizer):
    """Fill a 'generated_summary' column on `data` and return the same frame.

    The column starts as None for every row and is then set row by row, by
    index label, from each row's 'text'. `data` is modified in place.
    """
    data['generated_summary'] = None
    rows = tqdm(data.iterrows(), total=len(data), desc="Generating Summaries")
    for row_label, record in rows:
        summary = summarize_document(record['text'], vectorizer)
        data.at[row_label, 'generated_summary'] = summary
    return data

# Main process
# Main process: load the data, obtain a fitted vectorizer, summarise, and save.
if __name__ == "__main__":
    # Load your dataset, keeping only complete 'text'/'headlines' rows
    data_path = r"C:\Users\Zhun Kai\Downloads\archive\news_summary.csv"
    data = pd.read_csv(data_path, encoding="latin1")[['text', 'headlines']].dropna()

    # Check if the TF-IDF vectorizer is already trained
    # NOTE(review): any existing file at TFIDF_VECTOR_PATH is reused as-is, even if it
    # was fitted on different data or with different settings — confirm that is intended.
    try:
        vectorizer = load_vectorizer(TFIDF_VECTOR_PATH)
        print("TF-IDF vectorizer loaded successfully.")
    except FileNotFoundError:
        print("No existing vectorizer found. Training a new one...")
        # The vectorizer is fitted on the raw, whole 'text' values, while scoring
        # later applies it to preprocessed individual sentences.
        sample_texts = data['text'].tolist()  # Use dataset to train
        vectorizer = train_and_save_vectorizer(sample_texts)

    # Process the dataset and generate summaries
    # (process_dataset adds the column to `data` and returns that same frame)
    print("Generating summaries for the dataset...")
    summarized_data = process_dataset(data, vectorizer)

    # Save the summarized data to CSV
    summarized_data.to_csv(SUMMARIZED_DATA_PATH, index=False)
    print(f"Summarized data saved to {SUMMARIZED_DATA_PATH}")

[nltk_data] Downloading package punkt to C:\Users\Zhun
[nltk_data]     Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Zhun
[nltk_data]     Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


TF-IDF vectorizer loaded successfully.
Generating summaries for the dataset...


Generating Summaries: 100%|██████████████████████████████████████████████████████| 4514/4514 [00:04<00:00, 1125.53it/s]

Summarized data saved to summarized_news.csv





In [12]:
"""
deploying custom data
"""

from tqdm import tqdm
import pandas as pd

# Main process
# Main process: summarise the full dataset and report ROUGE against 'headlines' and 'ctext'.
# Relies on load_vectorizer, train_and_save_vectorizer, process_dataset, TFIDF_VECTOR_PATH,
# SUMMARIZED_DATA_PATH and evaluate_summaries having been defined by earlier cells.
if __name__ == "__main__":
    # Load your dataset (all columns; no rows are dropped here)
    data_path = r"C:\Users\Zhun Kai\Downloads\archive\news_summary.csv"
    news_data = pd.read_csv(data_path, encoding="latin1")

    # Check if the TF-IDF vectorizer is already trained
    try:
        vectorizer = load_vectorizer(TFIDF_VECTOR_PATH)
        print("TF-IDF vectorizer loaded successfully.")
    except FileNotFoundError:
        print("No existing vectorizer found. Training a new one...")
        sample_texts = news_data['text'].tolist()  # Use dataset to train
        vectorizer = train_and_save_vectorizer(sample_texts)

    # Process the dataset and generate summaries
    # (process_dataset writes into `news_data` and returns that same frame)
    print("Generating summaries for the dataset...")
    summarized_data = process_dataset(news_data, vectorizer)

    # Fill missing values to prevent issues during evaluation
    # NOTE(review): 'headlines' is not filled the same way — confirm it has no missing values.
    news_data['ctext'] = news_data['ctext'].fillna("").astype(str)
    summarized_data['generated_summary'] = summarized_data['generated_summary'].fillna("").astype(str)

    # Initialize progress bar for ROUGE evaluation
    print("\nEvaluating ROUGE Scores...")
    rouge_results_headline = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    rouge_results_ctext = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    # Evaluate summaries against 'headline'
    # Each call scores a single pair, so the returned "average" is that pair's score.
    for ref, gen in tqdm(zip(news_data['headlines'], summarized_data['generated_summary']), 
                         total=len(news_data), desc="Evaluating Headlines"):
        scores = evaluate_summaries([ref], [gen])
        for key in rouge_results_headline:
            rouge_results_headline[key].append(scores[key])

    # Evaluate summaries against 'ctext'
    for ref, gen in tqdm(zip(news_data['ctext'], summarized_data['generated_summary']), 
                         total=len(news_data), desc="Evaluating Ctext"):
        scores = evaluate_summaries([ref], [gen])
        for key in rouge_results_ctext:
            rouge_results_ctext[key].append(scores[key])

    # Calculate and print average ROUGE scores (mean of the per-pair scores)
    print("\nAverage ROUGE Scores (Generated Summary vs Headline):")
    print(f"ROUGE-1: {sum(rouge_results_headline['rouge1']) / len(rouge_results_headline['rouge1']):.4f}")
    print(f"ROUGE-2: {sum(rouge_results_headline['rouge2']) / len(rouge_results_headline['rouge2']):.4f}")
    print(f"ROUGE-L: {sum(rouge_results_headline['rougeL']) / len(rouge_results_headline['rougeL']):.4f}")

    print("\nAverage ROUGE Scores (Generated Summary vs Original Text - ctext):")
    print(f"ROUGE-1: {sum(rouge_results_ctext['rouge1']) / len(rouge_results_ctext['rouge1']):.4f}")
    print(f"ROUGE-2: {sum(rouge_results_ctext['rouge2']) / len(rouge_results_ctext['rouge2']):.4f}")
    print(f"ROUGE-L: {sum(rouge_results_ctext['rougeL']) / len(rouge_results_ctext['rougeL']):.4f}")

    # Save the summarized data to CSV
    summarized_data.to_csv(SUMMARIZED_DATA_PATH, index=False)
    print(f"\nSummarized data saved to {SUMMARIZED_DATA_PATH}")

TF-IDF vectorizer loaded successfully.
Generating summaries for the dataset...


Generating Summaries: 100%|██████████████████████████████████████████████████████| 4514/4514 [00:04<00:00, 1121.99it/s]



Evaluating ROUGE Scores...


Evaluating Headlines: 100%|██████████████████████████████████████████████████████| 4514/4514 [00:03<00:00, 1337.54it/s]
Evaluating Ctext: 100%|███████████████████████████████████████████████████████████| 4514/4514 [00:32<00:00, 138.75it/s]



Average ROUGE Scores (Generated Summary vs Headline):
ROUGE-1: 0.2600
ROUGE-2: 0.0840
ROUGE-L: 0.2202

Average ROUGE Scores (Generated Summary vs Original Text - ctext):
ROUGE-1: 0.1845
ROUGE-2: 0.0669
ROUGE-L: 0.1342

Summarized data saved to summarized_news.csv
