In [2]:
import os
from copy import deepcopy
import numpy as np
import pandas as pd
from difflib import SequenceMatcher
from pathlib import Path

from sklearn.model_selection import train_test_split

# For plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(context="notebook", 
              style="white")

from datasets import Dataset
import evaluate
from transformers import (BartTokenizer,
                          BartForConditionalGeneration,
                          DataCollatorForSeq2Seq,
                          EarlyStoppingCallback,
                          Seq2SeqTrainingArguments,
                          Seq2SeqTrainer, 
                          get_scheduler)
import torch
import bitsandbytes as bnb

  from .autonotebook import tqdm as notebook_tqdm


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [3]:
# https://www.kaggle.com/datasets/asaniczka/data-scientist-linkedin-job-postings
import kagglehub
# Download latest version
path = kagglehub.dataset_download("asaniczka/data-scientist-linkedin-job-postings")

print("Path to dataset files:", path)
files = os.listdir(path)
# Print the names of the files
for file in files:
    print(file)

Path to dataset files: /Users/oliverzhou/.cache/kagglehub/datasets/asaniczka/data-scientist-linkedin-job-postings/versions/103
postings.csv


In [4]:
postings = pd.read_csv(path+'/postings.csv')
postings.head()

Unnamed: 0,job_title,company,job_location,job_link,first_seen,search_city,search_country,job level,job_type,job_summary,job_skills
0,Technical Data Analyst,Jefferson Health Plans,"Philadelphia, PA",https://www.linkedin.com/jobs/view/technical-d...,2023-12-20,Phoenixville,United States,Associate,Remote,Why Choose Jefferson Health Plans?\nWe are an ...,"KNIME, QlikView, SQL, MS Access, MS Excel, Log..."
1,Data Center Engineer - Minneapolis,DeRisk Technologies,"Minneapolis, MN",https://www.linkedin.com/jobs/view/data-center...,2023-12-20,Minneapolis,United States,Associate,Onsite,Job Responsibilities:\nDeployment / In-Scope C...,"Server, Storage, Backup, Networking, Virtualiz..."
2,Data Analyst,Avani Tech Solutions Private Limited,"Minneapolis, MN",https://www.linkedin.com/jobs/view/data-analys...,2023-12-20,Minneapolis,United States,Associate,Onsite,Success Factor knowledge\nSchedule : Monday th...,"Data Management, HR Data Retention Controls, C..."
3,Data Engineer II - NBC Sports Next,NBC Sports Next,"Minneapolis, MN",https://www.linkedin.com/jobs/view/data-engine...,2023-12-20,Minneapolis,United States,Associate,Remote,Company Description\nNBC Sports Next is where ...,"Data Engineering, Data Warehousing, SQL, MySQL..."
4,Data Analyst - Operational Assessment,National Grid Renewables,"Bloomington, MN",https://www.linkedin.com/jobs/view/data-analys...,2023-12-20,Minneapolis,United States,Associate,Hybrid,National Grid Renewables is a leading North Am...,"Data Analyst, Operational Assessment, Wind Ene..."


### Processing

In [6]:
postings.drop_duplicates(inplace=True)

In [7]:
# Apply conditions to create 'job_type'
patterns = {
    'Data Scientist': r'Data\s*Scientist|Data\s*Science|Scientist',  # Match both "Data Scientist" and "Data Science"
    'Data Analyst (BI)': r'Data\s*Analyst|Data\s*Research\s*Analyst|Analyst|Data\s*Analytics|BI|Business\s*Intelligence|Analytics|Visualization|Data\s*Analysis',  # Add "Data Research Analyst"
    'Data Engineer': r'Data\s*Engineer|Database\s*Engineer|Engineer',  # Consider "Database Engineer" as well
    'Software Engineer': r'Software\s*Engineer|Developer|Programmer|Software',
    'Statistician': r'\s*Statistician',
    'Modeler': r'\s*Modeler',
    'Consultant': r'\s*Consultant',
    'Specialist': r'\s*Specialist'
}

# Initialize 'job_type' column with 'Unknown'
postings['job_type'] = 'Unknown'

# Apply patterns to classify job titles
for job_type, pattern in patterns.items():
    postings.loc[postings['job_title'].str.contains(pattern, case=False, na=False, regex=True), 'job_type'] = job_type

# Show the first few rows
postings.head()

Unnamed: 0,job_title,company,job_location,job_link,first_seen,search_city,search_country,job level,job_type,job_summary,job_skills
0,Technical Data Analyst,Jefferson Health Plans,"Philadelphia, PA",https://www.linkedin.com/jobs/view/technical-d...,2023-12-20,Phoenixville,United States,Associate,Data Analyst (BI),Why Choose Jefferson Health Plans?\nWe are an ...,"KNIME, QlikView, SQL, MS Access, MS Excel, Log..."
1,Data Center Engineer - Minneapolis,DeRisk Technologies,"Minneapolis, MN",https://www.linkedin.com/jobs/view/data-center...,2023-12-20,Minneapolis,United States,Associate,Data Engineer,Job Responsibilities:\nDeployment / In-Scope C...,"Server, Storage, Backup, Networking, Virtualiz..."
2,Data Analyst,Avani Tech Solutions Private Limited,"Minneapolis, MN",https://www.linkedin.com/jobs/view/data-analys...,2023-12-20,Minneapolis,United States,Associate,Data Analyst (BI),Success Factor knowledge\nSchedule : Monday th...,"Data Management, HR Data Retention Controls, C..."
3,Data Engineer II - NBC Sports Next,NBC Sports Next,"Minneapolis, MN",https://www.linkedin.com/jobs/view/data-engine...,2023-12-20,Minneapolis,United States,Associate,Data Engineer,Company Description\nNBC Sports Next is where ...,"Data Engineering, Data Warehousing, SQL, MySQL..."
4,Data Analyst - Operational Assessment,National Grid Renewables,"Bloomington, MN",https://www.linkedin.com/jobs/view/data-analys...,2023-12-20,Minneapolis,United States,Associate,Data Analyst (BI),National Grid Renewables is a leading North Am...,"Data Analyst, Operational Assessment, Wind Ene..."


In [8]:
postings[postings['job_type']=='Unknown']['job_title'].value_counts()
data = postings[postings['job_type']!='Unknown']
data.head()

Unnamed: 0,job_title,company,job_location,job_link,first_seen,search_city,search_country,job level,job_type,job_summary,job_skills
0,Technical Data Analyst,Jefferson Health Plans,"Philadelphia, PA",https://www.linkedin.com/jobs/view/technical-d...,2023-12-20,Phoenixville,United States,Associate,Data Analyst (BI),Why Choose Jefferson Health Plans?\nWe are an ...,"KNIME, QlikView, SQL, MS Access, MS Excel, Log..."
1,Data Center Engineer - Minneapolis,DeRisk Technologies,"Minneapolis, MN",https://www.linkedin.com/jobs/view/data-center...,2023-12-20,Minneapolis,United States,Associate,Data Engineer,Job Responsibilities:\nDeployment / In-Scope C...,"Server, Storage, Backup, Networking, Virtualiz..."
2,Data Analyst,Avani Tech Solutions Private Limited,"Minneapolis, MN",https://www.linkedin.com/jobs/view/data-analys...,2023-12-20,Minneapolis,United States,Associate,Data Analyst (BI),Success Factor knowledge\nSchedule : Monday th...,"Data Management, HR Data Retention Controls, C..."
3,Data Engineer II - NBC Sports Next,NBC Sports Next,"Minneapolis, MN",https://www.linkedin.com/jobs/view/data-engine...,2023-12-20,Minneapolis,United States,Associate,Data Engineer,Company Description\nNBC Sports Next is where ...,"Data Engineering, Data Warehousing, SQL, MySQL..."
4,Data Analyst - Operational Assessment,National Grid Renewables,"Bloomington, MN",https://www.linkedin.com/jobs/view/data-analys...,2023-12-20,Minneapolis,United States,Associate,Data Analyst (BI),National Grid Renewables is a leading North Am...,"Data Analyst, Operational Assessment, Wind Ene..."


# Text Processing

Cleaning Text: Remove unwanted characters, URLs, and unnecessary whitespace.
Lowercasing: Convert all text to lowercase to maintain consistency.
Tokenization: Split the text into words or tokens.
Stop Words Removal: Remove common words that may not add value to your analysis (e.g., "and", "the").
Stemming/Lemmatization: Reduce words to their base or root form.

In [9]:
cols = ['job_title', 'job_summary', 'job_skills', 'job_type']
data = data[cols]
data.head()

Unnamed: 0,job_title,job_summary,job_skills,job_type
0,Technical Data Analyst,Why Choose Jefferson Health Plans?\nWe are an ...,"KNIME, QlikView, SQL, MS Access, MS Excel, Log...",Data Analyst (BI)
1,Data Center Engineer - Minneapolis,Job Responsibilities:\nDeployment / In-Scope C...,"Server, Storage, Backup, Networking, Virtualiz...",Data Engineer
2,Data Analyst,Success Factor knowledge\nSchedule : Monday th...,"Data Management, HR Data Retention Controls, C...",Data Analyst (BI)
3,Data Engineer II - NBC Sports Next,Company Description\nNBC Sports Next is where ...,"Data Engineering, Data Warehousing, SQL, MySQL...",Data Engineer
4,Data Analyst - Operational Assessment,National Grid Renewables is a leading North Am...,"Data Analyst, Operational Assessment, Wind Ene...",Data Analyst (BI)


In [10]:
data['job_type'].value_counts()

job_type
Data Scientist       1691
Data Engineer        1335
Data Analyst (BI)     895
Software Engineer     282
Specialist             49
Consultant             44
Statistician           11
Modeler                 8
Name: count, dtype: int64

In [11]:
data.dropna(subset=['job_title', 'job_summary', 'job_skills'], inplace=True)
data.isna().sum()

job_title      0
job_summary    0
job_skills     0
job_type       0
dtype: int64

In [12]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import json

# Load JSON file
with open("skills.json", "r") as file:
    skills_data = json.load(file)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oliverzhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oliverzhou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/oliverzhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Text cleaning for job_summary
def clean_text_summary(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Lowercase
    text = text.lower()
    # Tokenization
    tokens = nltk.word_tokenize(text)
    # Retain keywords (skills) and remove stop words
    cleaned_tokens = [
        lemmatizer.lemmatize(word) for word in tokens 
        if word in skills_data or word not in stop_words
    ]
    return ' '.join(cleaned_tokens)

# Text cleaning for job_skills (only remove special characters and notations)
def clean_text_skills(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Lowercase
    text = text.lower()
    return text

# Assuming your DataFrame is named 'data'
# Apply cleaning to job_summary and job_skills
data['cleaned_job_summary'] = data['job_summary'].apply(clean_text_summary)
data['cleaned_job_skills'] = data['job_skills'].apply(clean_text_skills)

# Save the cleaned data
data.to_csv('cleaned_data.csv', index=False)

# Preview the cleaned data
data.head()

Unnamed: 0,job_title,job_summary,job_skills,job_type,cleaned_job_summary,cleaned_job_skills
0,Technical Data Analyst,Why Choose Jefferson Health Plans?\nWe are an ...,"KNIME, QlikView, SQL, MS Access, MS Excel, Log...",Data Analyst (BI),choose jefferson health plan awardwinning notf...,knime qlikview sql ms access ms excel logical ...
1,Data Center Engineer - Minneapolis,Job Responsibilities:\nDeployment / In-Scope C...,"Server, Storage, Backup, Networking, Virtualiz...",Data Engineer,job responsibility deployment inscope configur...,server storage backup networking virtualizatio...
2,Data Analyst,Success Factor knowledge\nSchedule : Monday th...,"Data Management, HR Data Retention Controls, C...",Data Analyst (BI),success factor knowledge schedule monday frida...,data management hr data retention controls cal...
3,Data Engineer II - NBC Sports Next,Company Description\nNBC Sports Next is where ...,"Data Engineering, Data Warehousing, SQL, MySQL...",Data Engineer,company description nbc sport next sport techn...,data engineering data warehousing sql mysql po...
4,Data Analyst - Operational Assessment,National Grid Renewables is a leading North Am...,"Data Analyst, Operational Assessment, Wind Ene...",Data Analyst (BI),national grid renewables leading north america...,data analyst operational assessment wind energ...


# BERTSUM 

In [25]:
from transformers import EncoderDecoderModel, AutoTokenizer
import torch

# Load a BERT-based encoder-decoder model for summarization
model_name = "mrm8488/bert-small2bert-small-finetuned-cnn_daily_mail-summarization"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = EncoderDecoderModel.from_pretrained(model_name)

# Function to summarize text using the encoder-decoder model
def bertsum_summarize(text, max_length=30):
    try:
        # Tokenize the input text
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        outputs = model.generate(
            inputs.input_ids,
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )
        # Decode the generated summary
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return summary
    except Exception as e:
        print(f"Error during BERTSUM summarization: {e}")
        return "Error during summarization"

# Generate summaries for test data
candidate_summaries = []
for i, text in enumerate(texts):
    if i % 100 == 0:
        print(f"Processing text {i}/{len(texts)}...")
    candidate_summaries.append(bertsum_summarize(text))

# Save candidate summaries
with open("bertsum-summaries.txt", "w") as file:
    for summary in candidate_summaries:
        file.write(summary + "\n")

# Evaluate using ROUGE
rouge_scores = calc_rouge_scores(candidate_summaries, ref_summaries)
print("ROUGE Scores with BERTSUM:", rouge_scores)

# Output a few results for inspection
for i in range(5):
    print(f"Original Text: {texts[i]}")
    print(f"Reference Summary: {ref_summaries[i]}")
    print(f"Generated Summary (BERTSUM): {candidate_summaries[i]}")
    print("-" * 50)


Config of the encoder: <class 'transformers.models.bert.modeling_bert.BertModel'> is overwritten by shared encoder config: BertConfig {
  "_name_or_path": "google/bert_uncased_L-4_H-512_A-8",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "return_dict": false,
  "transformers_version": "4.46.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Config of the decoder: <class 'transformers.models.bert.modeling_bert.BertLMHeadModel'> is overwritten by shared decoder config: BertConfig {
  "_name_or_path": "google/bert_uncased_L-4_H-512_A-8",
  "add_cross_attention": true,
  "attenti

Processing text 0/431...




Processing text 100/431...
Processing text 200/431...
Processing text 300/431...
Processing text 400/431...
ROUGE Scores with BERTSUM: {'rouge1': np.float64(6.0), 'rouge2': np.float64(2.2), 'rougeL': np.float64(5.9), 'rougeLsum': np.float64(5.9)}
Original Text: role data analyst location hartford ct raleigh nc duration fulltime job description least year experience working healthcare business data analyst health plan member enrollment benefit plan configuration provider setup contract setup billing payment ee claim processing edi transaction medicare medicaid commercial health plan cob accumulator least year experience requirement elicitation technique like jadsessions workshop interview survey etc hedis knowledgeable data analyst least year experience creating technical requirement specification based architecturedesign detailing processe least year experience agile methodology preferably agile scrum business analyst understanding u healthcare data thanks regard sheebakavipriya proces

# LDA

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration
from evaluate import load
import nltk

# Download required NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Preprocessing
stop_words = set(stopwords.words('english'))
data['tokenized_summary'] = data['cleaned_job_summary'].apply(
    lambda x: [word for word in word_tokenize(x) if word not in stop_words]
)

# Convert tokenized summaries back to strings
data['processed_summary'] = data['tokenized_summary'].apply(lambda x: ' '.join(x))

# Vectorization using CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(data['processed_summary'])

# Fit LDA Model
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(doc_term_matrix)

# Extract topics for each document
def get_topics_per_doc(lda_model, doc_term_matrix, feature_names, num_words=5):
    """
    Get the top words representing each topic in each document.
    """
    topics = []
    for topic_weights in lda_model.transform(doc_term_matrix):
        topic = topic_weights.argmax()
        top_words = [feature_names[i] for i in lda_model.components_[topic].argsort()[-num_words:]]
        topics.append(' '.join(top_words))
    return topics

feature_names = vectorizer.get_feature_names_out()
data['lda_summary'] = get_topics_per_doc(lda_model, doc_term_matrix, feature_names)

# Evaluate using ROUGE
metric = load("rouge")

def calc_rouge_scores(candidates, references):
    """
    Calculate ROUGE scores for candidate summaries against reference summaries.
    """
    result = metric.compute(predictions=candidates, references=references, use_stemmer=True)
    return {key: round(value * 100, 1) for key, value in result.items()}

# Compute ROUGE scores
lda_candidates = data['lda_summary'].tolist()
reference_summaries = data['job_summary'].tolist()

rouge_scores = calc_rouge_scores(lda_candidates, reference_summaries)
print("ROUGE Scores for LDA Summarization:", rouge_scores)

# Display the first few results
print(data[['job_title', 'job_summary', 'lda_summary']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oliverzhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oliverzhou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


ROUGE Scores for LDA Summarization: {'rouge1': np.float64(1.7), 'rouge2': np.float64(0.0), 'rougeL': np.float64(1.4), 'rougeLsum': np.float64(1.7)}
                               job_title  \
0                 Technical Data Analyst   
1     Data Center Engineer - Minneapolis   
2                           Data Analyst   
3     Data Engineer II - NBC Sports Next   
4  Data Analyst - Operational Assessment   

                                         job_summary  \
0  Why Choose Jefferson Health Plans?\nWe are an ...   
1  Job Responsibilities:\nDeployment / In-Scope C...   
2  Success Factor knowledge\nSchedule : Monday th...   
3  Company Description\nNBC Sports Next is where ...   
4  National Grid Renewables is a leading North Am...   

                              lda_summary  
0     analysis science work business team  
1         technology team year skill work  
2         technology team year skill work  
3     analysis science work business team  
4  business opportunity produc

This approach combines extractive summarization using TF-IDF cosine similarity and abstractive summarization using a pre-trained T5 transformer model:

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import T5Tokenizer, T5ForConditionalGeneration
from nltk.tokenize import sent_tokenize
from evaluate import load
import pandas as pd
import nltk

# Download required NLTK data
nltk.download('punkt')

# Load data
data = pd.read_csv('cleaned_data.csv')
data['cleaned_job_summary'] = data['cleaned_job_summary'].fillna("")

# Step 1: Extractive Summarization using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(data['cleaned_job_summary'])

def extract_key_sentences(text, num_sentences=3):
    """
    Extract top sentences based on TF-IDF cosine similarity scores.
    """
    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return ' '.join(sentences)  # Return all sentences if fewer than the threshold
    
    # Calculate sentence vectors
    sentence_vectors = vectorizer.transform(sentences)
    similarity_scores = cosine_similarity(sentence_vectors, tfidf_matrix)
    
    # Rank sentences by their average similarity score
    ranked_indices = similarity_scores.mean(axis=1).argsort()[::-1][:num_sentences]
    return ' '.join([sentences[i] for i in ranked_indices])

# Extract key sentences
data['key_sentences'] = data['cleaned_job_summary'].apply(extract_key_sentences)

# Step 2: Abstractive Summarization using T5
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def generate_summary(text, max_length=50):
    """
    Generate abstractive summaries using T5.
    """
    try:
        inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
        outputs = model.generate(inputs.input_ids, max_length=max_length, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return summary
    except Exception as e:
        print(f"Error during summarization: {e}")
        return "Error during summarization"

# Apply abstractive summarization on key sentences
data['generated_summary'] = data['key_sentences'].apply(generate_summary)

# Evaluate using ROUGE
metric = load("rouge")

def calc_rouge_scores(candidates, references):
    """
    Calculate ROUGE scores for candidate summaries against reference summaries.
    """
    result = metric.compute(predictions=candidates, references=references, use_stemmer=True)
    return {key: round(value * 100, 1) for key, value in result.items()}

# Compute ROUGE scores
candidates = data['generated_summary'].tolist()
references = data['job_summary'].tolist()
rouge_scores = calc_rouge_scores(candidates, references)
print("ROUGE Scores for Improved Summarization:", rouge_scores)

# Display results
print(data[['job_title', 'job_summary', 'generated_summary']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oliverzhou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


ROUGE Scores for Improved Summarization: {'rouge1': np.float64(15.9), 'rouge2': np.float64(9.0), 'rougeL': np.float64(15.8), 'rougeLsum': np.float64(15.8)}
                               job_title  \
0                 Technical Data Analyst   
1     Data Center Engineer - Minneapolis   
2                           Data Analyst   
3     Data Engineer II - NBC Sports Next   
4  Data Analyst - Operational Assessment   

                                         job_summary  \
0  Why Choose Jefferson Health Plans?\nWe are an ...   
1  Job Responsibilities:\nDeployment / In-Scope C...   
2  Success Factor knowledge\nSchedule : Monday th...   
3  Company Description\nNBC Sports Next is where ...   
4  National Grid Renewables is a leading North Am...   

                                   generated_summary  
0  jefferson health plan awardwinning notforprofi...  
1  job responsibility deployment inscope configur...  
2  success factor knowledge schedule monday frida...  
3  nbc sport next spor

Strengths of the Approach:

The hybrid extractive-abstractive method outperforms simpler LDA-based summarization (e.g., previous ROUGE-1 ~1.7).
Combining relevance filtering (TF-IDF) and abstractive refinement (T5) provides a more aligned and coherent summary.

T5 abstractive summarization improves fluency, making the summaries more human-like compared to pure extractive methods.

ROUGE-L scores suggest the generated summaries capture the structure and phrasing of the reference summaries to a reasonable degree.

-------------

Limitations:

A ROUGE-2 score of 9.0 suggests the method struggles to consistently capture semantic pairings and context.
The extractive step might omit key relational phrases that the abstractive model doesn't reconstruct.

If the reference summaries are verbose or not concise, this can lower the scores.
ROUGE might not fully capture semantic equivalence or rephrased content.

------------
The hybrid extractive-abstractive summarization approach shows significant improvement with ROUGE-1 and ROUGE-L nearing 16%. While there's room for improvement, the current results demonstrate a good balance between relevance and readability. Further refinement of extractive techniques and model tuning can yield even better results.