In [None]:
# Imports
import pandas as pd
from gensim import corpora, models
from googlesearch import search
import re
import nltk
import chardet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import requests

In [None]:
# Fetch the NLTK resources this notebook relies on (stopword list and WordNet for the lemmatizer)
for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)

In [None]:
# Clean the abstracts for the LDA model: strip HTML tags, special characters and numbers, drop stopwords, and lemmatize the remaining words
def preprocess_abstract(abstract):
    """Turn a raw abstract into a list of lemmatized tokens for the LDA model.

    Args:
        abstract: Raw abstract text, possibly containing HTML markup.

    Returns:
        list[str]: Lower-cased, lemmatized tokens with English stopwords
        removed. An empty list is returned when ``abstract`` is not a string
        (for example a missing value).
    """
    if not isinstance(abstract, str):  # also covers None
        return []
    text = re.sub('<[^<]+?>', '', abstract)  # Remove HTML tags
    text = text.lower()  # Convert to lowercase
    # \W on its own keeps digits (they count as word characters), so \d is
    # matched explicitly to really remove numbers as well.
    text = re.sub(r'[\W\d]+', ' ', text)
    # Look the stopword list up once per call (not once per token) and use a
    # set so each membership test is O(1).
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in text.split() if token not in stop_words]

In [None]:
# Pick the keywords for one document (used to fill the "Keyword" column): the top terms of its most probable LDA topic
def get_lda_keywords(lda_model, bow, num_keywords=8):
    """Return the top terms of the dominant LDA topic for one document.

    Args:
        lda_model: Trained LDA model. It is indexed with ``bow`` to get
            ``(topic_id, probability)`` pairs and must provide
            ``get_topic_terms`` and an ``id2word`` mapping.
        bow: Bag-of-words representation of the document.
        num_keywords: Number of terms to take from the dominant topic.

    Returns:
        list[str]: Up to ``num_keywords`` terms of the most probable topic,
        or an empty list when the model assigns no topic to the document.
    """
    topic_dist = lda_model[bow]
    if not topic_dist:
        # max() would raise ValueError on an empty distribution
        return []
    dominant_topic, _ = max(topic_dist, key=lambda pair: pair[1])
    topic_terms = lda_model.get_topic_terms(dominant_topic, topn=num_keywords)
    # Resolve ids through the model's own vocabulary instead of a notebook
    # global, so the function works with whichever model is passed in.
    id2word = lda_model.id2word
    return [id2word[term_id] for term_id, _ in topic_terms]

In [None]:
# Clean the abstracts for the final output. This differs from the LDA preprocessing: the text stays readable, only HTML tags and extra whitespace are removed.
def clean_abstract(abstract):
    """Strip HTML tags and collapse whitespace in an abstract.

    Args:
        abstract: Raw abstract text, possibly containing HTML markup.

    Returns:
        str: The cleaned abstract, or an empty string when ``abstract`` is
        not a string (for example a missing value), so the result is always
        a string.
    """
    if not isinstance(abstract, str):  # also covers None
        return ''
    without_tags = re.sub('<[^<]+?>', '', abstract)  # Remove HTML tags
    return re.sub(r'\s+', ' ', without_tags).strip()  # Collapse extra whitespace

In [None]:
# # Load the DOEdata from xls files
# DOE_xls_files = ['DOEdata/DOE_CCF.xls', 'DOEdata/DOE_MRI.xls', 'DOEdata/DOE_OAC.xls']
# for file in DOE_xls_files:
#     df = pd.read_excel(file)
#     # Keep only the relevant columns (abstracts and titles)
#     df = df[['Title', 'Abstract']]
#     # Merge the data frames
#     data_frames.append(df)
# print('Data frames loaded', len(data_frames))

In [None]:
# # Load the NIHdata from CSV files
# NIH_csv_files = ['NIHdata/NIH_CCF.csv', 'NIHdata/NIH_CICI.csv', 'NIHdata/NIH_CSSI.csv', 'NIHdata/NIH_MRI.csv', 'NIHdata/NIH_OAC.csv']
# data_frames = []
# for file in NIH_csv_files:
#     df = pd.read_csv(file)
#     # Keep only the relevant columns (abstracts and titles)
#     df = df[['Title', 'Abstract']]
#     # Merge the data frames
#     data_frames.append(df)
# print('Data frames loaded', len(data_frames))

In [None]:
# Process every NSF export: fit an LDA model per file, attach keywords and a cleaned description to each project,
# save one processed CSV per input file, and finally write all projects into a single combined CSV.
NSF_csv_files = ['NSFdata/NSF_CCF.csv', 'NSFdata/NSF_CICI.csv', 'NSFdata/NSF_CSSI.csv', 'NSFdata/NSF_DIBBS.csv', 'NSFdata/NSF_MRI.csv', 'NSFdata/NSF_OAC.csv', 'NSFdata/NSF_SI2.csv']

# LDA settings
NUM_TOPICS = 8
LDA_PASSES = 3
LDA_RANDOM_STATE = 42  # fixed seed so the keyword assignment is reproducible between runs

SOURCE_COLUMNS = ["AwardNumber", "Title", "NSFOrganization", "PrincipalInvestigator", "PIEmailAddress", "Abstract"]
COLUMN_RENAMES = {"Title": "Project_name",
                  "NSFOrganization": "Funding_agency",
                  "AwardNumber": "Award_number",
                  "PrincipalInvestigator": "PI_name",
                  "PIEmailAddress": "PI_contact",
                  "Clean_Abstract": "Description"}
OUTPUT_COLUMNS = ["Project_name", "Funding_agency", "Award_number", "PI_name", "PI_contact", "Keyword", "Description"]

# Collect the per-file frames and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
processed_frames = []

for file_path in NSF_csv_files:
    data = pd.read_csv(file_path, encoding='ISO-8859-1')
    # Work on an explicit copy so the column assignments below do not target a slice of `data`
    projects = data[SOURCE_COLUMNS].copy()

    # Tokenize once; the same bag-of-words corpus trains the model and is reused for the keyword lookup
    abstracts = projects["Abstract"].apply(preprocess_abstract)
    dictionary = corpora.Dictionary(abstracts)
    corpus = [dictionary.doc2bow(text) for text in abstracts]
    lda_model = models.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary,
                                passes=LDA_PASSES, random_state=LDA_RANDOM_STATE)

    projects["Keyword"] = [get_lda_keywords(lda_model, bow) for bow in corpus]
    #projects["News"] = projects["Title"].apply(search_news)
    projects["Clean_Abstract"] = projects["Abstract"].apply(clean_abstract)

    # Selecting OUTPUT_COLUMNS also drops the raw "Abstract" column
    output = projects.rename(columns=COLUMN_RENAMES)[OUTPUT_COLUMNS]

    # NOTE(review): the input stems already start with "NSF_", so this yields names such as
    # "NSF_NSF_CCF_processed.csv" - kept unchanged in case something downstream relies on them.
    output_file = f"NSF_{file_path.split('/')[-1].split('.')[0]}_processed.csv"
    output.to_csv(output_file, index=False)
    print(f"Saved processed data to: {output_file}")
    processed_frames.append(output)

all_projects = pd.concat(processed_frames, ignore_index=True)
all_projects.to_csv("NSF_all_final.csv", index=False)