In [1]:
from BERTopicModel import BERTopicModel
import re
import pandas as pd
from sentence_transformers import SentenceTransformer

from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.similarity_metrics import PairwiseJaccardSimilarity

import gensim
import gensim.corpora as corpora

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrut\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrut\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Define a function to clean text
def clean_text(text: str) -> str:
    """
    Normalise free text before tokenisation.

    The text is lower-cased, every run of characters other than ASCII letters,
    digits and spaces is replaced by a single space, repeated spaces are
    collapsed to one, and leading/trailing spaces are stripped.

    Parameters:
    - text (str): The text to clean. Non-string values (for example ``None``
      or the float ``NaN`` that pandas uses for missing cells) are accepted
      and treated as empty text.

    Returns:
    - str: The cleaned text, or "" when `text` is not a string.
    """
    # Explicit type guard instead of a bare `except:`: missing values still
    # map to "", but unrelated errors and KeyboardInterrupt are no longer
    # silently swallowed.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Anything that is not an ASCII letter, digit or space becomes a space ...
    text = re.sub(r'[^a-zA-Z0-9 ]+', ' ', text)
    # ... and the resulting runs of spaces are squeezed into one.
    text = re.sub(r' +', ' ', text)
    return text.strip()

In [3]:
# Load the source table; later cells read its 'abstract', 'title', 'type' and 'language' columns
df = pd.read_csv(filepath_or_buffer="../data/cleaned_data.csv")

In [4]:
# Normalise every abstract with clean_text (the 'abstract' column of `df` is overwritten)
df['abstract'] = df['abstract'].apply(clean_text)

# Keep English-language articles whose abstract and title are both longer than 10 characters
keep_rows = (
    (df['abstract'].str.len() > 10)
    & (df['type'] == 'article')
    & (df['language'] == 'en')
    & (df['title'].str.len() > 10)
)
subset = df[keep_rows]

# Size of the random subsample, expressed as a percentage of the filtered rows
subsample_percentage = 5
subsample_size = int(len(subset) * (subsample_percentage / 100))

# Fixed seed so the same rows are drawn on every run
subsample = subset.sample(n=subsample_size, random_state=42)

# Plain Python lists of the sampled titles and abstracts, in the same row order
title_docs, abstract_docs = (
    subsample[column].to_list() for column in ("title", "abstract")
)

In [6]:
# Whitespace-tokenise each document; these token lists are passed as `texts` to the Coherence metric
title_dataset = list(map(str.split, title_docs))
abstract_dataset = list(map(str.split, abstract_docs))

In [7]:
# Model 1: BERTopicModel constructed with no arguments and trained on the title documents
bert_base_model = BERTopicModel()
(results1, freq1, topic_dict1, topics1, model1) = bert_base_model.train_model(title_docs)

# Score Model 1 with the OCTIS coherence and topic-diversity metrics.
# NOTE: the variable is named `npmi1`, but the coherence measure requested here is 'c_v'.
coherence_metric = Coherence(texts=title_dataset, topk=10, measure='c_v')
diversity_metric = TopicDiversity()
npmi1 = coherence_metric.score(results1)
td1 = diversity_metric.score(results1)

In [8]:
# Report both Model 1 scores; print() separates each label from its value with one space
for metric_label, metric_value in (
    ("Coherence Score for Model 1: ", npmi1),
    ("Topic Diversity Score of Model 1: ", td1),
):
    print(metric_label, metric_value)

Coherence Score for Model 1:  0.4712433650708998
Topic Diversity Score of Model 1:  0.8692307692307693


In [None]:
# Model 2: BERTopicModel constructed with nr_topics="auto" and trained on the cleaned abstracts
bert_base_model = BERTopicModel(nr_topics="auto")
(results2, freq2, topic_dict2, topics2, model2) = bert_base_model.train_model(abstract_docs)

# Score Model 2 with the OCTIS coherence and topic-diversity metrics.
# NOTE: the variable is named `npmi2`, but the coherence measure requested here is 'c_v'.
coherence_metric = Coherence(texts=abstract_dataset, topk=10, measure='c_v')
diversity_metric = TopicDiversity()
npmi2 = coherence_metric.score(results2)
td2 = diversity_metric.score(results2)

In [None]:
# Few-shot chat prompt passed as `messages=` to BERTopicModel in the "Train Model 3" cell.
# Layout, in order:
#   1. system message  - instructions for producing a short sub-domain label
#   2. user message    - worked example containing five abstracts
#   3. assistant reply - the label expected for that example ('Sentiment Analysis')
#   4. user message    - template holding the literal placeholder [DOCUMENTS]
# NOTE(review): presumably [DOCUMENTS] is replaced with each topic's documents inside
# BERTopicModel; that substitution is not visible in this notebook - confirm.
# NOTE(review): the prompt text says "triple backticks", but the delimiters actually used
# in these strings are triple single quotes ('''); confirm whether the mismatch is intended.
messages = [
    # 1. System instructions: task description, 4-word limit and the list of banned phrases
    {"role": "system", "content": "You are a topic representation creator model for studies in the domain of NLP. Your task is to determine the sub-domain of the research work based on its abstract. Each sub-domain name should not exceed more than 4 words. Your representations should be specific and focus on the most described object. Always provide a representation. Please do not use the following words delimited with triple backticks: '''natural language processing, computer science, machine learning, artificial intelligence'''"},
    # 2. Worked example: five abstracts, each introduced by "- "
    {'role': 'user', 'content': """I have a topic that contains the following documents which are delimited with triple backticks:
'''-  business world large companies that can achieve continuity in innovation gain a significant competitive advantage the sensitivity of these companies to follow and monitor news sources in e commerce social media and forums provides important information to businesses in decision making process large amount of data shared in these resources sentiment analysis can be made from people s comments about services and products users emotions can be extracted and important feedback can be obtained all of this is of course possible with accurate sentiment analysis this study new data sets were created for turkish english and arabic and for first time comparative sentiment analysis was performed from texts in three different languages addition a very comprehensive study was presented to researchers by comparing performances of both pre trained language models for turkish arabic and english as well as deep learning and machine learning models our paper will guide researchers working on sentiment analysis about which methods will be more successful in texts written in different languages which contain different types and spelling mistakes which factors will affect success and how much these factors will affect performance,
- analysis also called opinion mining is field of study that analyzes people s opinions sentiments attitudes and emotions are important sentiment analysis since songs and mood are mutually dependent on each other on selected song it becomes easy find mood of listener future it can be used for recommendation the song lyric is a rich source of datasets containing words that are helpful analysis and classification of sentiments generated from it now a days observe a lot of inter sentential and intra sentential code mixing songs which has a varying impact on audience to study this impact created a telugu songs dataset which contained both telugu english code mixed and pure telugu songs in this paper classify songs based on its arousal as exciting or non exciting we develop a language identification tool and introduce code mixing features obtained from it as additional features system with these additional features attains 4 5 accuracy greater than traditional approaches on our dataset, 
- this paper we propose a sentiment analysis model for the assessment of teacher performance in the classroom by tweets written by a pilot group of college students naive bayes nb is the technique to be applied to classify tweets based on the polar express emotion positive negative and neutral to carry out this process a dataset fits adding distinctive terms of context as possible features to support the classification process, 
- analysis refers to classify emotion of a text whether positive or negative the studies conducted on sentiment analysis are generally based on english and other languages while there are limited studies on turkish in this study after constructing a dataset using a well known hotel reservation site booking com compare performances of different machine learning approaches we also apply dictionary based method sentitfidf which differs from traditional methods due to their logarithmic differential term frequency and term presence distribution usage the results are evaluated using area under of a receiver operating characteristic roc curve auc the results show that using document term matrix as input gives better classification results than tfidf matrix we also observe that best results are obtained using random forest classifier with an auc value of 89 on both positive and negative comments, 
- the current era of computing the use of social networking sites like twitter and facebook is growing significantly over time people from different cultures and backgrounds share vast volumes of textual comments that show their viewpoints on several aspects of life and make them available to all for commenting monitoring real social media activities has now become a prime concern for politicians in understanding their social image this paper are going to analyse the tweets of various social media platforms regarding two prominent political leaders and classify them as positive negative or neutral using machine learning and deep learning methods we have proposed a deep learning approach for a better solution our proposed model has provided state of the art results using deep learning models'''
It must be in the following format: <topic label>"""},
    # 3. Expected answer for the worked example
    {'role': 'assistant', 'content': 'Sentiment Analysis'},
    # 4. Template for the real request; [DOCUMENTS] is a literal placeholder in the string
    {"role": "user", "content": """I have a topic that contains the following documents which are delimited with triple backticks:
'''[DOCUMENTS]'''
REMEMBER to only use 1-4 words and to NOT use the following words delimited with triple backticks: '''natural language processing, computer science, machine learning, artificial intelligence'''
It must be in the following format: <topic label>"""},
]

In [None]:
# Train Model 3
# No explicit `device`: SentenceTransformer then picks a GPU when one is available and
# falls back to CPU otherwise. The previous hard-coded device="cuda" made this cell fail
# on machines without CUDA.
sentence_model = SentenceTransformer('all-mpnet-base-v2')

# Pre-compute the title embeddings once and hand them to the topic-model wrapper,
# together with the few-shot prompt in `messages`.
embeddings = sentence_model.encode(title_docs, show_progress_bar=True)
bert_base_model = BERTopicModel(embeddings=embeddings, reduce_outliers=True, messages=messages)
results3, freq3, topic_dict3, topics3, model3 = bert_base_model.train_model(title_docs)

# Calculate Coherence and Topic Diversity Scores for Model 3
# NOTE: the variable is named `npmi3`, but the coherence measure requested here is 'c_v'.
npmi3 = Coherence(texts=title_dataset, topk=10, measure='c_v').score(results3)
td3 = TopicDiversity().score(results3)

In [None]:
# Per-topic lookups built from topic_dict3: the first element of every entry
# (presumably the word of a (word, score) pair - confirm against BERTopicModel.train_model)
topic_dict_words = {
    topic_id: [entry[0] for entry in topic_dict3[topic_id]]
    for topic_id in topic_dict3
}
# The leading word of each topic is simply the first item of its word list
topic_dict_first_word = {
    topic_id: words[0] for topic_id, words in topic_dict_words.items()
}

# Attach the Model 3 topic assignment to every sampled row, then look up its words
subsample["Topic"] = topics3
assigned_topics = subsample["Topic"]
subsample['topic_words'] = assigned_topics.map(topic_dict_words)
subsample['topic_first_word'] = assigned_topics.map(topic_dict_first_word)