https://huggingface.co/thearod5/bert4re

https://ieeexplore.ieee.org/document/9920081

In [None]:
import pandas as pd
import os
import re
from keybert import KeyBERT
import matplotlib.pyplot as plt



In [None]:
# Keyword extractor, constructed with KeyBERT's default settings
# (no embedding model is passed explicitly).
model = KeyBERT()

# Directory that holds the per-year CSV files.
data_folder = 'data/'

In [None]:
def clean_text(text):
    """Normalize *text* for keyword extraction.

    Every non-word character (anything other than letters, digits and
    underscore) is replaced by a space, runs of whitespace are collapsed to a
    single space, surrounding whitespace is removed, and the result is
    lower-cased.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized, lower-cased string.
    """
    # Punctuation and symbols become spaces so adjacent words stay separated.
    without_symbols = re.sub(r'\W', ' ', text)
    # The substitution above can leave runs of spaces; squeeze them.
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.strip().lower()


In [None]:
# Top-10 keyword counts, keyed first by n-gram size (1-5) and then by year,
# where "year" is the CSV file name up to its first dot (e.g. '2014-2015').
all_keywords_by_year = {1: {}, 2: {}, 3: {}, 4: {}, 5: {}}

data_folder = 'data'

# os.listdir() returns entries in arbitrary order; sorting the names makes the
# processing order (and therefore the order of years in the result dicts and
# in any later plot) deterministic.
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith(".csv"):
        continue

    year = filename.split('.')[0]

    # Load the CSV file
    file_path = os.path.join(data_folder, filename)
    df = pd.read_csv(file_path)

    # Ensure all values in the 'Content' column are strings (NaN -> '')
    df['Content'] = df['Content'].fillna('').astype(str)

    # Clean the 'Content' column
    df['cleaned_content'] = df['Content'].apply(clean_text)

    # Extract keywords for 1-5 word phrases
    for n in range(1, 6):
        # One list of (keyphrase, score) tuples per document.
        keywords_per_doc = df['cleaned_content'].apply(lambda x: model.extract_keywords(
            x, keyphrase_ngram_range=(n, n), stop_words='english', use_maxsum=True, nr_candidates=20, top_n=5
        ))

        # Flatten to one row per extracted keyword and keep only the phrase.
        # Documents that yielded no keywords explode to NaN and are dropped.
        keywords_flat = keywords_per_doc.explode()
        keywords_flat = keywords_flat.apply(
            lambda x: x[0] if isinstance(x, (list, tuple)) and x else None
        ).dropna()

        # Count the occurrences of the keywords and take the top 10
        keyword_counts = keywords_flat.value_counts().head(10)

        # Store the result in the dictionary
        all_keywords_by_year[n][year] = keyword_counts

# Show the result for one year for each n-gram size
sample_year = '2014-2015'
for n in range(1, 6):
    print(f"\nTop keywords for {n}-word phrases in {sample_year}:")
    # The sample year is only present if a matching CSV file was found, so
    # report its absence instead of failing with a KeyError.
    if sample_year in all_keywords_by_year[n]:
        print(all_keywords_by_year[n][sample_year])
    else:
        print(f"(no data found for {sample_year} in '{data_folder}')")

In [None]:
# Plot, for each n-gram size, a stacked bar chart with one bar per year whose
# segments are the occurrence counts of that year's top-ranked keywords.
top_k = 10
rank_labels = [f'Top {i+1}' for i in range(top_k)]

for n in range(1, 6):
    # Re-key each year's counts by rank ('Top 1' = most frequent) instead of
    # by keyword text. Building the frame directly from the keyword-indexed
    # Series would give one column per distinct keyword across all years, so
    # renaming the columns to exactly ten 'Top i' labels would fail whenever
    # that number is not exactly ten.
    counts_by_rank = {}
    for year, counts in all_keywords_by_year[n].items():
        top_counts = counts.iloc[:top_k]
        counts_by_rank[year] = pd.Series(
            top_counts.to_numpy(), index=rank_labels[:len(top_counts)]
        )

    keyword_df = (
        pd.DataFrame(counts_by_rank)
        .T
        .reindex(columns=rank_labels)  # fixed rank order; pads missing ranks
        .fillna(0)                     # years with fewer than top_k keywords
        .astype(float)
        .sort_index()                  # years in ascending order on the x axis
    )

    if keyword_df.empty:
        # Nothing was collected for this n-gram size (e.g. no CSV files).
        print(f"No keyword data to plot for {n}-word phrases.")
        continue

    # Draw on an explicitly created, explicitly sized axes: DataFrame.plot
    # without `ax` opens its own figure, which would leave a separately
    # created plt.figure(...) blank and ignore its figsize.
    fig, ax = plt.subplots(figsize=(12, 6))
    keyword_df.plot(kind='bar', stacked=True, ax=ax, rot=45)
    ax.set_title(f'Top Keywords for {n}-Word Phrases Over the Years')
    ax.set_xlabel('Year')
    ax.set_ylabel('Occurrences')
    ax.legend(loc='upper right', title='Keywords')
    fig.tight_layout()
    plt.show()