In [31]:
# Imports
import pandas as pd
import nltk
import chardet
import re
import string
from gensim import corpora, models
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [26]:
# Load the NIHdata from CSV files
NIH_csv_files = ['NIHdata/NIH_CCF.csv', 'NIHdata/NIH_CICI.csv', 'NIHdata/NIH_CSSI.csv', 'NIHdata/NIH_MRI.csv', 'NIHdata/NIH_OAC.csv']
data_frames = []
for file in NIH_csv_files:
    df = pd.read_csv(file)
    # Keep only the relevant columns (abstracts and titles)
    df = df[['Title', 'Abstract']]
    # Merge the data frames
    data_frames.append(df)
print('Data frames loaded', len(data_frames))

Data frames loaded 5


In [15]:
# Lets check what encoding we have for our NSFdata files
NSF_csv_files = ['NSFdata/NSF_CCF.csv', 'NSFdata/NSF_CICI.csv', 'NSFdata/NSF_CSSI.csv', 'NSFdata/NSF_DIBBS.csv', 'NSFdata/NSF_MRI.csv', 'NSFdata/NSF_OAC.csv', 'NSFdata/NSF_SI2.csv']
for file in NSF_csv_files:
    with open(file, 'rb') as f:
        result = chardet.detect(f.read())
        print(file, result)

NSFdata/NSF_CCF.csv {'encoding': 'ISO-8859-1', 'confidence': 0.7299916171744574, 'language': ''}
NSFdata/NSF_CICI.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
NSFdata/NSF_CSSI.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
NSFdata/NSF_DIBBS.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
NSFdata/NSF_MRI.csv {'encoding': 'ISO-8859-1', 'confidence': 0.7299962504897843, 'language': ''}
NSFdata/NSF_OAC.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
NSFdata/NSF_SI2.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [27]:
# Load the NSFdata from CSV files
NSF_csv_files = ['NSFdata/NSF_CCF.csv', 'NSFdata/NSF_CICI.csv', 'NSFdata/NSF_CSSI.csv', 'NSFdata/NSF_DIBBS.csv', 'NSFdata/NSF_MRI.csv', 'NSFdata/NSF_OAC.csv', 'NSFdata/NSF_SI2.csv']
for file in NSF_csv_files:
    df = pd.read_csv(file, encoding='ISO-8859-1')
    # Keep only the relevant columns (abstracts and titles)
    df = df[['Title', 'Abstract']]
    # Merge the data frames
    data_frames.append(df)
print('Data frames loaded', len(data_frames))

Data frames loaded 12


In [19]:
# Lets check what encoding we have for our DOEdata files
DOE_xls_files = ['DOEdata/DOE_CCF.xls', 'DOEdata/DOE_MRI.xls', 'DOEdata/DOE_OAC.xls']
for file in DOE_xls_files:
    with open(file, 'rb') as f:
        result = chardet.detect(f.read())
        print(file, result)

DOEdata/DOE_CCF.xls {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
DOEdata/DOE_MRI.xls {'encoding': 'Windows-1254', 'confidence': 0.3892575204703193, 'language': 'Turkish'}
DOEdata/DOE_OAC.xls {'encoding': 'Windows-1254', 'confidence': 0.4989260344799692, 'language': 'Turkish'}


In [28]:
# Load the DOEdata from xls files
DOE_xls_files = ['DOEdata/DOE_CCF.xls', 'DOEdata/DOE_MRI.xls', 'DOEdata/DOE_OAC.xls']
for file in DOE_xls_files:
    df = pd.read_excel(file)
    # Keep only the relevant columns (abstracts and titles)
    df = df[['Title', 'Abstract']]
    # Merge the data frames
    data_frames.append(df)
print('Data frames loaded', len(data_frames))

Data frames loaded 15


In [29]:
# Concatenate the data frames
df = pd.concat(data_frames, ignore_index=True)

# Save the concatenated dataframe as a CSV file
df.to_csv('Merged_data_DOE3_NIH5_NSF7_15DF.csv', index=False)

In [None]:
# Remove HTML tags
df['Title'] = df['Title'].apply(lambda x: re.sub('<.*?>', '', x))
df['Abstract'] = df['Abstract'].apply(lambda x: re.sub('<.*?>', '', x))

# Remove other non-alphanumeric characters
df['Title'] = df['Title'].apply(lambda x: re.sub('[^0-9a-zA-Z]+', ' ', x))
df['Abstract'] = df['Abstract'].apply(lambda x: re.sub('[^0-9a-zA-Z]+', ' ', x))

# Remove numbers and punctuation
translator = str.maketrans('', '', string.punctuation + string.digits)
df['Title'] = df['Title'].apply(lambda x: x.translate(translator))
df['Abstract'] = df['Abstract'].apply(lambda x: x.translate(translator))

# Remove URLs and email addresses
df['Title'] = df['Title'].apply(lambda x: re.sub(r'\S+@\S+', '', x))
df['Title'] = df['Title'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Abstract'] = df['Abstract'].apply(lambda x: re.sub(r'\S+@\S+', '', x))
df['Abstract'] = df['Abstract'].apply(lambda x: re.sub(r'http\S+', '', x))

# Preprocess the text
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english') + ['a', 'an', 'the'])
porter_stemmer = PorterStemmer()

# Tokenize the text into individual words or phrases
df['Title'] = df['Title'].apply(lambda x: tokenizer.tokenize(x.lower()))
# Remove stop words
df['Title'] = df['Title'].apply(lambda x: [word for word in x if word not in stop_words])
# Stem the words
df['Title'] = df['Title'].apply(lambda x: [porter_stemmer.stem(word) for word in x])
# Tokenize the text into individual words or phrases
df['Abstract'] = df['Abstract'].apply(lambda x: tokenizer.tokenize(x.lower()))
# Remove stop words
df['Abstract'] = df['Abstract'].apply(lambda x: [word for word in x if word not in stop_words])
# Stem the words
df['Abstract'] = df['Abstract'].apply(lambda x: [porter_stemmer.stem(word) for word in x])

# Create a dictionary of the terms and their frequencies
texts = df['Title'] + df['Abstract']
dictionary = corpora.Dictionary(texts)

# Create a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model
lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10)

# Print the topics and their top keywords
for topic in lda_model.print_topics():
    print(topic)

# This code saves the topics and their top keywords to a file named topics.txt in the current working directory. The print function redirects the output to the file using the file argument.
# Save the topics and their top keywords to a file
with open('topics.txt', 'w') as f:
    for topic in lda_model.print_topics():
        print(topic, file=f)

In [30]:
from gensim.models.coherencemodel import CoherenceModel

# Evaluate the coherence of the topics for different values of num_topics
for num_topics in range(5, 20):
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    print(f"Number of topics: {num_topics}, coherence score: {coherence_score}")

""""
This code trains an LDA model for different values of num of topics, evaluates the coherence of the resulting topics using the CoherenceModel class with the 'c_v' coherence measure, and prints the coherence score for each value of num_topics. 
You can use the coherence score to select the best value of num_topics that produces the most coherent and interpretable topics.

For the passes parameter, you can start with a small number (e.g., 5) and gradually increase it to see if the topics become more stable and consistent across different runs. 
The passes parameter controls the number of times the model goes through the entire dataset, so increasing it can improve the stability and quality of the topics, but also increases the computational cost. 
You can use the coherence score as a guide to determine the optimal number of passes.
"""

SyntaxError: unterminated triple-quoted string literal (detected at line 10) (3341524094.py, line 10)