In [1]:
import sys
import argparse
from datetime import datetime

# for key-topic extraction on the sentiment-analysis output
import os
import glob
import pandas as pd
from torch.nn.functional import softmax
import torch
from huggingface_hub import login

from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
from transformers.pipelines import AggregationStrategy
import numpy as np



# Run configuration: the date partition this job reads from and writes to.
execution_date = '2025-05-16'

print('execution_date')
print(execution_date)

# Authenticate against the Hugging Face Hub with a token taken from the
# HF_TOKEN environment variable instead of a literal in the source.
# SECURITY: the access token that used to be hard-coded on this line has been
# exposed in the file history and must be revoked.
# When HF_TOKEN is unset the explicit login is skipped, so the script does not
# block on an interactive token prompt.
hf_token = os.environ.get('HF_TOKEN')
if hf_token:
    login(token=hf_token)

read_path  = f'../data/5_sentiment/01_en/{execution_date}'
write_dir  = f'../data/6_key_topics/01_en/{execution_date}'
write_path = f'{write_dir}/00000.csv'

# exist_ok avoids the check-then-create race of isdir() followed by makedirs().
os.makedirs(write_dir, exist_ok=True)

# glob returns files in arbitrary order; sort so the concatenated row order
# (and therefore the output file) is reproducible between runs.
csv_files = sorted(glob.glob(read_path + "/*.csv"))
if not csv_files:
    # pd.concat on an empty sequence fails with an unhelpful
    # "No objects to concatenate"; report the real problem instead.
    raise FileNotFoundError(f'No CSV files found in {read_path}')

df_list = (pd.read_csv(file) for file in csv_files)
df = pd.concat(df_list, ignore_index=True)

# Build the text handed to the keyphrase model. Both columns are NaN-filled:
# string + NaN yields NaN, so a missing title would otherwise discard the
# content of that row as well.
df['news_text'] = df['news_content'].fillna('') + ' ' + df['news_title'].fillna('')

print(df.head())

# Define keyphrase extraction pipeline
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that yields the distinct keyphrases of a text.

    The model and tokenizer are both loaded from the same identifier, and the
    raw token predictions are aggregated with ``AggregationStrategy.SIMPLE``
    before the phrase strings are de-duplicated.
    """

    def __init__(self, model, *args, **kwargs):
        """Load model and tokenizer for ``model`` and initialise the pipeline.

        Args:
            model: Identifier passed to ``from_pretrained`` for both the
                token-classification model and its tokenizer.
            *args: Extra positional arguments forwarded to the parent class.
            **kwargs: Extra keyword arguments forwarded to the parent class.
        """
        token_classifier = AutoModelForTokenClassification.from_pretrained(model)
        tokenizer = AutoTokenizer.from_pretrained(model)
        super().__init__(
            *args,
            model=token_classifier,
            tokenizer=tokenizer,
            **kwargs
        )

    def postprocess(self, all_outputs):
        """Turn raw model outputs into an array of unique keyphrase strings.

        Args:
            all_outputs: Model outputs as handed over by the pipeline.

        Returns:
            The result of ``np.unique`` over the whitespace-stripped ``"word"``
            entry of every aggregated entity.
        """
        entities = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        phrases = []
        for entity in entities:
            phrases.append(entity.get("word").strip())
        return np.unique(phrases)
    
# Build the keyphrase extractor once at module level so that every call made
# further down reuses the same loaded model and tokenizer.
model_name = "ml6team/keyphrase-extraction-kbir-inspec"
extractor = KeyphraseExtractionPipeline(model_name)

def predict_key_topics(text):
    """Return the unique keyphrases extracted from ``text``.

    Args:
        text: The article text to analyse. Values that are not strings
            (e.g. NaN or None coming from missing data) and blank strings
            are treated as "no text".

    Returns:
        Whatever the module-level ``extractor`` pipeline returns for ``text``,
        or an empty NumPy string array when there is no text to analyse.
    """
    # Missing or blank input carries no keyphrases; return early instead of
    # handing a non-string value to the pipeline.
    if not isinstance(text, str) or not text.strip():
        return np.array([], dtype=str)
    return extractor(text)

# Run keyphrase extraction row by row over the combined text column and keep
# the result next to the existing columns.
key_topics = df['news_text'].apply(predict_key_topics)
df['news_key_topics'] = key_topics
print(df.head())

# Persist the enriched frame, overwriting any earlier output for this date.
df.to_csv(write_path, mode='w', index=False)

  from .autonotebook import tqdm as notebook_tqdm


execution_date
2025-05-16
  news_datetime                                         news_title  \
0    2025-05-06  Management Discussion and Analysis Quarter 1 E...   
1    2025-05-06  Management Discussion and Analysis Quarter 1 E...   
2    2025-05-06  Management Discussion and Analysis Quarter 1 E...   
3    2025-05-06  Financial Performance Quarter 1 (F45) (Reviewe...   
4    2025-05-06  Financial Statement Quarter 1/2025 (Reviewed) ...   

                                        news_content news agency  \
0  Management Discussion and Analysis  Company na...       seten   
1  Management Discussion and Analysis  Company na...       seten   
2  Management Discussion and Analysis  Company na...       seten   
3  Summary of operating result form (F45)        ...       seten   
4                                                NaN       seten   

  execution_date sentiment_1 sentiment_2 sentiment_3 sentiment_final  \
0     2025-05-16     neutral     neutral     neutral         neutral   


Device set to use cpu


  news_datetime                                         news_title  \
0    2025-05-06  Management Discussion and Analysis Quarter 1 E...   
1    2025-05-06  Management Discussion and Analysis Quarter 1 E...   
2    2025-05-06  Management Discussion and Analysis Quarter 1 E...   
3    2025-05-06  Financial Performance Quarter 1 (F45) (Reviewe...   
4    2025-05-06  Financial Statement Quarter 1/2025 (Reviewed) ...   

                                        news_content news agency  \
0  Management Discussion and Analysis  Company na...       seten   
1  Management Discussion and Analysis  Company na...       seten   
2  Management Discussion and Analysis  Company na...       seten   
3  Summary of operating result form (F45)        ...       seten   
4                                                NaN       seten   

  execution_date sentiment_1 sentiment_2 sentiment_3 sentiment_final  \
0     2025-05-16     neutral     neutral     neutral         neutral   
1     2025-05-16     neutr