## 1. Dependency Management and Setup

In [None]:
!pip install openai

In [None]:
import getpass
import json
import os

import openai
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openai import OpenAI

In [None]:
# Do not paste the key into the notebook: a saved or shared notebook would leak it.
# Reuse OPENAI_API_KEY when it is already set in the environment; otherwise ask
# for it interactively (getpass does not echo the typed value).
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('Enter your OpenAI API key: ')

In [None]:
# Lift both pandas display limits so long text cells are not cut off:
#   display.max_colwidth = None -> no truncation of a column's content
#   display.width        = None -> no fixed total display width
for option_name in ('display.max_colwidth', 'display.width'):
    pd.set_option(option_name, None)

## 2. (Optional) File Handling and Web Scraping
This part parses a saved HTML page into a CSV file. Skip this step if you already have "session_5.csv" or "session_6.csv" in the folder. If not, go to the NeurIPS poster session page (e.g. [Poster session 5](https://nips.cc/virtual/2023/session/74073)), then right-click and choose "Save as" to download the HTML page.

In [None]:
def open_file(path, encoding='utf-8'):
  """Return the whole content of a text file as a string.

  Args:
    path: Location of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding (which is what
      `open` falls back to when no encoding is given).

  Returns:
    The decoded file content.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the content is not valid in `encoding`.
  """
  with open(path, 'r', encoding=encoding) as file:
      return file.read()

def get_titles(soup, posters):
  """Collect poster titles from the <h5> tags of a parsed page.

  The first <h5> found is skipped (presumably a page heading rather than a
  poster -- TODO confirm against the saved HTML). Every following <h5> is
  stored under its position in the tag list, so keys start at 1.

  Args:
    soup: Parsed document offering `find_all`.
    posters: Dict that receives the entries; it is modified in place.

  Returns:
    The same `posters` dict, with `posters[k] = {"title": <stripped text>}`
    for each kept heading (any previous entry at that key is replaced).
  """
  headings = soup.find_all('h5')
  for key, heading in enumerate(headings[1:], start=1):
      posters[key] = {"title": heading.get_text(strip=True)}
  return posters

def get_abstract_and_position(soup, posters):
  """Attach abstract and poster position to already-collected posters.

  The n-th element with class "abstract" and the n-th <div> whose title is
  "Poster Position" are both assigned to `posters[n]` (keys start at 1, the
  same numbering `get_titles` uses).

  Args:
    soup: Parsed document offering `find_all`.
    posters: Dict keyed 1..n whose values are dicts; modified in place.

  Returns:
    The same `posters` dict with "abstract" and "position" filled in.

  Raises:
    KeyError: If there is an abstract for which `posters` has no entry.
    IndexError: If there are fewer position tags than abstract tags.
  """
  abstracts = soup.find_all(class_='abstract')
  positions = soup.find_all('div', title='Poster Position')

  for key, abstract_tag in enumerate(abstracts, start=1):
      entry = posters[key]
      entry["abstract"] = abstract_tag.get_text(strip=True)
      position_text = positions[key - 1].get_text(strip=True)
      # The first character is dropped (presumably a leading marker such as '#' -- TODO confirm).
      entry["position"] = position_text[1:]
  return posters

def get_posters(path):
  """Build a DataFrame of posters from a saved poster-session HTML file.

  Reads the file, parses it with BeautifulSoup's 'lxml' parser, gathers the
  titles, then adds abstracts and positions.

  Args:
    path: Location of the saved HTML page.

  Returns:
    A DataFrame with one row per poster (indexed by the poster key) holding
    the fields collected by `get_titles` and `get_abstract_and_position`.
  """
  soup = BeautifulSoup(open_file(path), 'lxml')
  titled = get_titles(soup=soup, posters={})
  completed = get_abstract_and_position(soup=soup, posters=titled)
  return pd.DataFrame.from_dict(completed, orient='index')

In [None]:
# Parse the saved session page, renumber the rows from 0 and store the result as CSV.
path = "/content/session_6.html"
df = get_posters(path)
df = df.reset_index(drop=True)
df.to_csv("session_6.csv", index=False)

## 3. Get summary, score and keywords from API

In [None]:
# Load the poster table produced in section 2 (or supplied alongside the notebook).
csv_path = "/content/session_6.csv"
df = pd.read_csv(csv_path)
# One dict per row, keyed by the row label: {row_label: {column: value, ...}, ...}
posters = df.to_dict(orient='index')

### Modify the following prompt according to your interests
This will be used to evaluate whether a paper is relevant to your area of interest. The response also contains a relevancy score from 1 to 10: 1 means minimal relevance, 10 means very relevant.

In [None]:
# Replace the placeholder with a comma-separated list of your own interests.
# This must be defined before the API cells run: it is concatenated into the
# system prompt that scores every poster, and a missing name makes each call fail.
interests_prompt = "keyword 1, keyword 2, keyword 3..."

In [None]:
# Keep the format prompt, no change needed.
# This is a plain string that is never passed through str.format(), so the JSON
# examples use single braces; doubled braces would be sent to the model verbatim.
format_prompt = """Follow the instructions as precisely as possible. Every time you receive input in this json format: {"title": "Awesome paper title", "abstract": "Awesome paper abstract text"}I want you to provide summary, keywords and score following json format in plain text. Never provide additional context. {"summary": "This is one sentence summary about this paper explained in simple words.","keywords": ["keyword1", "keyword2", "keyword3", ..., "keywordn"],"score": 10}"""

# OpenAI() is constructed without arguments; the notebook puts the key into the
# OPENAI_API_KEY environment variable beforehand for the client to use.
client = OpenAI()

def api_call(client, title, description, model="gpt-3.5-turbo"):
  """Ask the chat model for a summary, keywords and relevance score of one poster.

  The system message describes the task and embeds the module-level
  `interests_prompt`. The user message carries the poster as a JSON object
  with "title" and "abstract" keys -- the input shape that `format_prompt`
  announces -- followed by `format_prompt` itself.

  Args:
    client: OpenAI client exposing `chat.completions.create`.
    title: Poster title.
    description: Poster abstract.
    model: Chat model name; defaults to the model the notebook was written for.

  Returns:
    The message content of the first returned choice (expected, but not
    guaranteed, to be a JSON string; validate before use).

  Raises:
    NameError: If `interests_prompt` or `format_prompt` has not been defined.
    Any exception raised by the client call is propagated unchanged.
  """
  system_prompt = "As a machine learning expert, your task is to distill key information from NeuRIPS 2023 poster sessions for students. For each poster: Craft a one-sentence summary that encapsulates the primary research focus using simple, easy-to-understand words. If it's related to any specific domain knowledge, explain in simple terms but keep concise. Identify five pertinent keywords associated with the research. Assign a score between 1 to 10, reflecting how well the research aligns with the student's interests in " + interests_prompt + ". A score of 10 denotes high relevance and exceptional quality, whereas a score of 1 indicates minimal relevance which has none or little overlapping with student's insterests. Adopt a critical and discerning approach when assigning scores; avoid indiscriminately high ratings. Only rate 10 when a paper covers every interest. The evaluation should guide students towards posters that are most relevant to their specified interests, aiding in their academic pursuits."
  # Serialising with json.dumps keeps title and abstract clearly separated and
  # escapes quotes/newlines inside them.
  poster_json = json.dumps({"title": title, "abstract": description}, ensure_ascii=False)
  completion = client.chat.completions.create(
    model=model,
    messages=[
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": poster_json + " " + format_prompt}
    ]
  )
  return completion.choices[0].message.content

In [None]:
def validate_and_return_json(s):
    """Check that `s` is a JSON object in the expected annotation format.

    The object must contain:
      * "summary":  a string,
      * "keywords": a list of strings,
      * "score":    an integer (JSON true/false is rejected).

    Args:
        s: Raw model response. Anything that cannot be parsed as JSON,
            including None, is treated as invalid.

    Returns:
        The parsed object re-serialised with `json.dumps` when it is valid,
        otherwise False.
    """
    try:
        data = json.loads(s)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError is raised when `s`
        # is not str/bytes (e.g. None when the response carried no content).
        return False

    # Valid JSON may still be a list, number or bare string; only an object
    # can hold the required keys.
    if not isinstance(data, dict):
        return False

    if not all(key in data for key in ("summary", "keywords", "score")):
        return False

    if not isinstance(data["summary"], str):
        return False

    keywords = data["keywords"]
    if not (isinstance(keywords, list) and all(isinstance(k, str) for k in keywords)):
        return False

    score = data["score"]
    # bool is a subclass of int, so exclude it explicitly.
    if isinstance(score, bool) or not isinstance(score, int):
        return False

    return json.dumps(data)

In [None]:
def validated_api_call(client, title, abstract, max_attempts=3):
    """Call the API for one poster, retrying until the answer is well-formed.

    Each attempt calls `api_call` and checks the answer with
    `validate_and_return_json`. Errors and malformed answers are reported on
    stdout and the call is retried, so one bad poster does not stop a run.

    Args:
        client: OpenAI client forwarded to `api_call`.
        title: Poster title.
        abstract: Poster abstract.
        max_attempts: Maximum number of attempts before giving up.

    Returns:
        A `(summary, keywords, score)` tuple from the first valid answer, or
        None when no attempt produced one.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            response = api_call(client, title, abstract)
            validation_result = validate_and_return_json(response)
        except Exception as e:
            # Deliberately broad: this is a best-effort batch job, so report and retry.
            print(f"An error occurred: {e}")
            continue

        if validation_result:
            validated_data = json.loads(validation_result)
            return validated_data["summary"], validated_data["keywords"], validated_data["score"]

        # Make silent retries visible so a persistently bad prompt can be diagnosed.
        print(f"Attempt {attempt}/{max_attempts}: response did not match the expected JSON format.")
    return None

In [None]:
# Annotate every poster with the model's summary, keywords and score.
annotated_posters = {}
total_posters = len(posters)
# `posters` is keyed by the row labels 0..n-1 of the CSV loaded above.
for i in range(total_posters):
    poster = posters[i]
    response = validated_api_call(client, poster['title'], poster['abstract'])
    # No valid answer after all retries -> keep the poster with empty annotations.
    summary, keywords, score = response if response else (None, [], None)
    annotated_posters[i] = {
        "title": poster['title'],
        "position": poster['position'],
        "abstract": poster['abstract'],
        "summary": summary,
        "keywords": keywords,
        "score": score,
    }
    # One short progress line per poster instead of dumping the full record
    # (abstract included) into the cell output.
    print(f"[{i + 1}/{total_posters}] score={score} | {poster['title']}")

In [None]:
# One row per annotated poster, ordered by poster key.
annotated_frame = pd.DataFrame.from_dict(annotated_posters, orient='index')
df_annotated = annotated_frame.sort_index()

In [None]:
# Persist the annotations so the API calls do not have to be repeated.
# The DataFrame index is written too (as the first, unnamed column) because index=False is not passed.
df_annotated.to_csv("annotated_posters.csv")

## 4. Review relevant papers

In [None]:
def get_ranked_papers(df, threshold):
  """Return the posters whose score reaches `threshold`, best first.

  The abstract column is left out of the result. Rows without a score are
  dropped, the remaining scores are cast to int, and the rows are sorted by
  score in descending order. `df` itself is not modified.

  Args:
    df: DataFrame with at least the columns 'title', 'position', 'summary',
      'keywords' and 'score'.
    threshold: Minimum score (inclusive) a poster needs to be kept.

  Returns:
    A new DataFrame with the five columns above, filtered and sorted.
  """
  kept_columns = ['title', 'position', 'summary', 'keywords', 'score']
  scored = df[kept_columns].dropna(subset=['score'])
  scored = scored.assign(score=scored['score'].astype('int'))
  relevant = scored.loc[scored['score'] >= threshold]
  return relevant.sort_values(by=['score'], ascending=False)

In [None]:
# Modify the threshold to filter the papers:
# only posters with score >= threshold are kept, highest score first.
threshold = 9
df_ranked = get_ranked_papers(df_annotated, threshold)
# Show the first rows of the ranking (head() returns 5 rows by default).
df_ranked.head()

Unnamed: 0,title,position,summary,keywords,score
0,Sample-efficient Multi-objective Molecular Optimization with GFlowNets,100,This research focuses on developing a method for optimizing molecular structures with desired properties by considering multiple objectives and efficiently exploring the search space.,"[molecular optimization, multi-objective optimization, hypernetworks, Bayesian optimization, sample efficiency]",7
3,Uncovering Neural Scaling Laws in Molecular Representation Learning,104,"This paper explores the relationship between data quantity and quality on molecular representation learning (MRL) in drug and materials discovery, identifying scaling laws and potential avenues for improving learning efficiency.","[molecular representation learning, data quantity and quality, scaling laws, learning efficiency, drug and materials discovery]",7
4,ProteinShake: Building datasets and benchmarks for deep learning on protein structures,105,"ProteinShake is a software package that simplifies dataset creation and evaluation for deep learning on protein structures, providing benchmarks and datasets to improve performance in biology-related tasks.","[ProteinShake, deep learning, dataset creation, protein structures, benchmark]",7
