In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import string
import re
import nltk
from nltk.corpus import stopwords  # Import the stopwords module from NLTK

# Make sure the NLTK stopword corpus is available locally before it is read.
nltk.download('stopwords')

# Load data from Excel; the pipeline below reads the 'abstract' column and
# exports the 'id' column, so both must be present in this sheet.
df = pd.read_excel('LDA_Abstracts.xls')

# Start from NLTK's English stopword list.
stop_words = set(stopwords.words('english'))

# Corpus-specific tokens to drop in addition to the NLTK list
# (presumably markup residue, publisher boilerplate and units -- confirm
# against the raw abstracts before editing).
custom_stopwords = ["50", "95", "reserved", "It", "05", "study", "conducted", "xhtml", "w3", "sup", "sub", "xmlns", "1999", "elsevier", "ltd",  "org", "www", "significant", "significantly", "higher", "http", "mg", "kg"]

# preprocess_text lowercases every token before the stopword lookup, so a
# custom entry containing capitals (e.g. "It") could never match. Fold the
# custom entries to lowercase while merging them into the default set.
stop_words.update(word.lower() for word in custom_stopwords)

# Define a function to preprocess text
def preprocess_text(text):
    """Return *text* lowercased, tokenized and stripped of stopwords.

    Tokens are the runs of word characters matched by ``\\b\\w+\\b`` in the
    lowercased input. A token is discarded when it is in the module-level
    ``stop_words`` set or when it occurs as a substring of
    ``string.punctuation`` (since tokens consist only of word characters,
    in practice this only drops a lone ``_``). The surviving tokens are
    joined with single spaces.

    Anything that is not a ``str`` (for example a missing cell) yields an
    empty string.
    """
    # Guard clause: non-string cells contribute no content.
    if not isinstance(text, str):
        return ''

    kept = []
    for token in re.findall(r'\b\w+\b', text.lower()):
        if token in stop_words or token in string.punctuation:
            continue
        kept.append(token)
    return ' '.join(kept)

# Number of LDA topics. This single value drives the model size, the
# per-topic columns and the exported column list, so change it only here.
N_TOPICS = 30

# How many of the highest-weighted phrases are kept to describe each topic
# (independent of N_TOPICS).
TOP_PHRASES_PER_TOPIC = 10


def _topic_score(tokens, vocabulary):
    """Return the fraction of *tokens* found in *vocabulary* (0 if no tokens)."""
    if not tokens:
        return 0
    return sum(1 for token in tokens if token in vocabulary) / len(tokens)


# Apply preprocess_text function
df['Content'] = df['abstract'].apply(preprocess_text)

# Vectorize content as 2- to 3-word phrases; max_df / min_df prune phrases by
# document frequency (see the CountVectorizer documentation for exact rules).
vect = CountVectorizer(max_df=0.95, min_df=10, ngram_range=(2, 3))
dtm = vect.fit_transform(df['Content'])

# Fit LDA with a fixed seed so runs are repeatable. fit_transform returns one
# row per document and one column per topic.
lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=42, doc_topic_prior=0.0000001, topic_word_prior=0.0000001)
lda_topics = lda.fit_transform(dtm)

# Get feature names. get_feature_names() was removed in scikit-learn 1.2;
# prefer its replacement and fall back only on releases that predate it.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# Get topic words (phrases): the highest-weighted phrases of each topic,
# listed in ascending weight order (argsort is ascending) and joined into one
# space-separated string.
topic_phrases = {}
for i, topic in enumerate(lda.components_):
    topic_indices = topic.argsort()[-TOP_PHRASES_PER_TOPIC:]
    topic_words = [feature_names[index] for index in topic_indices]
    topic_phrases[i] = ' '.join(topic_words)

# Add the per-document topic weights as columns Topic_0 .. Topic_{N-1}
for i in range(N_TOPICS):
    df[f'Topic_{i}'] = lda_topics[:, i]

# Calculate propensity scores: for each topic, the share of a document's
# tokens that appear among the individual words of that topic's phrases.
# Tokenize each document once and use a set per topic so the membership test
# does not re-split the phrase string for every word of every row.
content_tokens = [content.split() for content in df['Content']]
for i in range(N_TOPICS):
    vocabulary = set(topic_phrases[i].split())
    df[f'Topic_{i}_Score'] = [_topic_score(tokens, vocabulary) for tokens in content_tokens]

# Export DataFrame with phrases and corresponding topic numbers
phrases_df = pd.DataFrame({'Topic Phrases': list(topic_phrases.values()), 'Topic Number': list(topic_phrases.keys())})
phrases_df.to_csv('lda_topic_phrases.csv', index=False, encoding='utf-8')

# Specify the columns to save in the 'lda_output.csv', including the "id" column
columns_to_save = ['id', 'abstract'] + [f'Topic_{i}' for i in range(N_TOPICS)] + [f'Topic_{i}_Score' for i in range(N_TOPICS)]
df.to_csv('lda_output.csv', index=False, encoding='utf-8', columns=columns_to_save)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\trevor_woolley\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
