In [1]:
import pandas as pd
from gensim import corpora, models
from googlesearch import search
import re
import nltk
import chardet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import requests

In [4]:
# Lets check what encoding we have for our NSFdata files
NSF_csv_files = ['NSFdata/NSF_CCF.csv', 'NSFdata/NSF_CICI.csv', 'NSFdata/NSF_CSSI.csv', 'NSFdata/NSF_DIBBS.csv', 'NSFdata/NSF_MRI.csv', 'NSFdata/NSF_OAC.csv', 'NSFdata/NSF_SI2.csv']
for file in NSF_csv_files:
    with open(file, 'rb') as f:
        result = chardet.detect(f.read())
        print(file, result)

NSFdata/NSF_CCF.csv {'encoding': 'ISO-8859-1', 'confidence': 0.7299916171744574, 'language': ''}
NSFdata/NSF_CICI.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
NSFdata/NSF_CSSI.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
NSFdata/NSF_DIBBS.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
NSFdata/NSF_MRI.csv {'encoding': 'ISO-8859-1', 'confidence': 0.7299962504897843, 'language': ''}
NSFdata/NSF_OAC.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
NSFdata/NSF_SI2.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
# 1. Read the CSV file and load it into a DataFrame
#data = pd.read_csv('NSFdata/NSF_DIBBS.csv', encoding='ISO-8859-1')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eduar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eduar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
projects = data[["AwardNumber", "Title", "NSFOrganization", "PrincipalInvestigator", "PIEmailAddress", "Abstract"]]
print(projects["Abstract"])

0      This project would automate the creation and d...
1      The growing number of cyber attacks on the Int...
2      Uranium-series geochronology plays a critical ...
3      Cybersecurity has become a significant issue t...
4      CIF21 DIBBs: Conceptualization of the Social a...
                             ...                        
125    ABSTRACT<br/><br/>OPP-9813312   OPP-9813442   ...
126    ABSTRACT<br/><br/>OPP-9813312   OPP-9813442   ...
127    ABSTRACT<br/><br/>OPP-9907197    OPP-9907469  ...
128    Current general circulation models (GCMs) have...
129    ABSTRACT<br/><br/>OPP-9907197    OPP-9907469  ...
Name: Abstract, Length: 130, dtype: object


In [3]:
def preprocess_abstract(abstract):
    if abstract is None or not isinstance(abstract, str):
        return []
    abstract = re.sub('<[^<]+?>', '', abstract)  # Remove HTML tags
    abstract = abstract.lower()  # Convert to lowercase
    abstract = re.sub(r'\W+', ' ', abstract)  # Remove special characters and numbers
    words = abstract.split()  # Tokenize
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatize words
    return words

In [17]:
# 3. Process the Abstract column
abstracts = projects["Abstract"].apply(preprocess_abstract)
print(abstracts)

0      [project, would, automate, creation, data, ana...
1      [growing, number, cyber, attack, internet, cri...
2      [uranium, series, geochronology, play, critica...
3      [cybersecurity, become, significant, issue, pr...
4      [cif21, dibbs, conceptualization, social, inno...
                             ...                        
125    [abstractopp, 9813312, opp, 9813442, opp, 9813...
126    [abstractopp, 9813312, opp, 9813442, opp, 9813...
127    [abstractopp, 9907197, opp, 9907469, opp, 9907...
128    [current, general, circulation, model, gcms, d...
129    [abstractopp, 9907197, opp, 9907469, opp, 9907...
Name: Abstract, Length: 130, dtype: object


In [20]:
# 4. Create a dictionary representation of the documents.
dictionary = corpora.Dictionary(abstracts)
print(dictionary)
# 5. Corpus is a list of bags of words. Each bag-of-words is a list of tuples (term_id, term_frequency).
corpus = [dictionary.doc2bow(text) for text in abstracts]

# 6. Define the LDA model
lda_model = models.LdaModel(corpus, num_topics=8, id2word=dictionary, passes=15)

Dictionary<4137 unique tokens: ['000', '1440753', '24', '50', '500']...>


In [4]:
# Create a new colunm LDA_ABSTRACTS to be classified into keywords based on the LDA model
def get_lda_keywords(lda_model, bow, num_keywords=8):
    topic_dist = lda_model[bow]
    dominant_topic = max(topic_dist, key=lambda x: x[1])[0]
    topic_terms = lda_model.get_topic_terms(dominant_topic, topn=num_keywords)
    return [dictionary[id] for id, prob in topic_terms]

#projects["LDA_abstract_keywords"] = projects["Abstract"].apply(lambda x: get_lda_keywords(lda_model, dictionary.doc2bow(preprocess_abstract(x))))
#projects["LDA_abstract_keywords"] = projects["Abstract"].apply(lambda x: lda_model[dictionary.doc2bow(preprocess_abstract(x))])

In [42]:
print(projects["LDA_abstract_keywords"])

0      [data, project, community, researcher, researc...
1      [data, project, information, science, scientif...
2      [data, research, project, science, new, commun...
3      [data, science, project, research, tool, commu...
4      [data, science, project, research, tool, commu...
                             ...                        
125    [record, atmospheric, ice, project, university...
126    [record, atmospheric, ice, project, university...
127    [snow, ice, chemistry, atmospheric, atmosphere...
128    [data, material, project, system, science, arc...
129    [snow, ice, chemistry, atmospheric, atmosphere...
Name: LDA_abstract_keywords, Length: 130, dtype: object


In [13]:
BING_API_KEY = "4071d4e339c04e5b98c7ce1366a3366e"
# 7. Defining Search for news articles and other online sources
def search_news(title):
    headers = {"Ocp-Apim-Subscription-Key": BING_API_KEY}
    params = {
        "q": f'"{title}"',
        "count": 3,
        "offset": 0,
        "mkt": "en-US",
        "safesearch": "Moderate",
    }
    response = requests.get("https://api.bing.microsoft.com/v7.0/search", headers=headers, params=params)
    response.raise_for_status()
    search_results = response.json()
    if "webPages" not in search_results:
        return ''
    news_links = [result["url"] for result in search_results["webPages"]["value"] if "news" in result["url"] or "article" in result["url"]]
    return ', '.join(news_links)

In [5]:
def search_news(title):
    try:
        search_results = search(title, num_results=3, sleep_interval=15, advanced=True)
        news_links = [f'{result.title} ({result.url}) - {result.description}' for result in search_results]
        return ', '.join(news_links)
    except Exception as e:
        print(f"Error during search: {e}")
        return ''

In [48]:
projects["News"] = projects["Title"].apply(search_news)
print(projects["News"])

0       
1       
2       
3       
4       
      ..
125     
126     
127     
128     
129     
Name: News, Length: 130, dtype: object


In [6]:
# 8. Create a new DataFrame with the desired columns
def clean_abstract(abstract):
    if abstract is None or not isinstance(abstract, str):
        return []
    abstract = re.sub('<[^<]+?>', '', abstract)  # Remove HTML tags
    abstract = re.sub(r'\s+', ' ', abstract)  # Remove extra whitespaces
    return abstract.strip()

# projects["Clean_Abstract"] = projects["Abstract"].apply(clean_abstract)

# output = projects.rename(columns={"Title": "Project_title",
#                                   "NSFOrganization": "Funding_agency",
#                                   "AwardNumber": "Award_number",
#                                   "PrincipalInvestigator": "PI_Name",
#                                   "PIEmailAddress": "PI_contact",
#                                   "Clean_Abstract": "Abstracts"})

# output.drop(columns=["Abstract"], inplace=True)
# # Lets reorder the columns to have the following order - Project_title, Funding_agency, Award_number, PI_Name, PI_contact, LDA_abstract_keywords, Abstracts, News
# output = output[["Project_title", "Funding_agency", "Award_number", "PI_Name", "PI_contact", "LDA_abstract_keywords", "Abstracts", "News"]]

In [59]:
output.to_csv("NSF_DIBBS_final3_output.csv", index=False)

In [7]:
# Basically now we have to merge them all into one file using DOE NSF and NIH datasets.
NSF_csv_files = ['NSFdata/NSF_CCF.csv', 'NSFdata/NSF_CICI.csv', 'NSFdata/NSF_CSSI.csv', 'NSFdata/NSF_DIBBS.csv', 'NSFdata/NSF_MRI.csv', 'NSFdata/NSF_OAC.csv', 'NSFdata/NSF_SI2.csv']

all_projects = pd.DataFrame()

for file_path in NSF_csv_files:
    data = pd.read_csv(file_path, encoding='ISO-8859-1')
    projects = data[["AwardNumber", "Title", "NSFOrganization", "PrincipalInvestigator", "PIEmailAddress", "Abstract"]]
    abstracts = projects["Abstract"].apply(preprocess_abstract)
    dictionary = corpora.Dictionary(abstracts)
    corpus = [dictionary.doc2bow(text) for text in abstracts]
    lda_model = models.LdaModel(corpus, num_topics=8, id2word=dictionary, passes=3)
    projects["Keyword"] = projects["Abstract"].apply(lambda x: get_lda_keywords(lda_model, dictionary.doc2bow(preprocess_abstract(x))))
    projects["News"] = projects["Title"].apply(search_news)
    projects["Clean_Abstract"] = projects["Abstract"].apply(clean_abstract)
    
    output = projects.rename(columns={"Title": "Project_name",
                                      "NSFOrganization": "Funding_agency",
                                      "AwardNumber": "Award_number",
                                      "PrincipalInvestigator": "PI_name",
                                      "PIEmailAddress": "PI_contact",
                                      "Clean_Abstract": "Description"})
    
    output.drop(columns=["Abstract"], inplace=True)
    output = output[["Project_name", "Funding_agency", "Award_number", "PI_name", "PI_contact", "Keyword", "Description", "News"]]
    output_file = f"NSF_{file_path.split('/')[-1].split('.')[0]}_processed.csv"
    output.to_csv(output_file, index=False)
    print(f"Saved processed data to: {output_file}")
    all_projects = all_projects.append(output, ignore_index=True)

all_projects.to_csv("NSF_all_final.csv", index=False)


In [18]:
print(projects)

      AwardNumber                                              Title  \
0         9820538      Undergraduate Engineering in Medical Research   
1         1802188  Digitization TCN:  Collaborative Research: Cap...   
2         1802199  Digitization TCN:  Collaborative Research:  Ca...   
3         1802200  Digitization TCN:  Collaborative Research: Cap...   
4         1802178  Digitization TCN:  Collaborative Research:  Ca...   
...           ...                                                ...   
2995      1814888  SHF: Small: Communication-Efficient Distribute...   
2996      1816209  CIF: Small: Occlusion-Based Computational Imag...   
2997      1810758  NSF-BSF: AF: Small: An Algorithmic Theory of B...   
2998      1822342       CISE/SHF: Summer School on Formal Techniques   
2999      1751400  CAREER:Enabling Scalable, Modular, and Efficie...   

     NSFOrganization PrincipalInvestigator            PIEmailAddress  \
0                EEC         William Smith           smithw1@cc