In [1]:

import json
import os
import pickle
import re

import requests
from dotenv import load_dotenv
from google import genai
from pypdf import PdfReader

from classes import ChemrxivItem

# Load the pickled `items` collection used by the calls further down.
# The context manager closes the file handle as soon as unpickling finishes
# (previously the handle stayed open for the life of the kernel).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# point this at a .pkl file produced by a trusted source.
with open("top_10_items_of_the_week.pkl", "rb") as open_pickle_file:
    items = pickle.load(open_pickle_file)

def _pdf_filename(title):
    """Return the local cache filename for a paper title, with path separators replaced by '_'."""
    safe_title = title
    # os.altsep is None on POSIX, so drop it from the set before replacing.
    for separator in {"/", os.sep, os.altsep} - {None}:
        safe_title = safe_title.replace(separator, "_")
    return f"{safe_title}.pdf"


def get_pdf(item: ChemrxivItem):
    """Return the extracted text of an item's PDF, downloading it on first use.

    The PDF is cached in the current working directory under a filename derived
    from ``item.title``; an existing file is reused instead of downloading again.

    Args:
        item: Object exposing ``title`` and ``asset.original.url``.

    Returns:
        The concatenated text of every page, or the string
        ``"Error: PDF file not found."`` if the cached file cannot be opened.

    Raises:
        requests.HTTPError: If the download responds with an error status.
            Nothing is written to disk in that case, so a failed download is
            never cached as if it were the PDF.
        requests.Timeout: If the server does not respond within 60 seconds.
    """
    pdf_path = _pdf_filename(item.title)
    if not os.path.exists(pdf_path):
        response = requests.get(item.asset.original.url, timeout=60)
        # Fail before writing so an HTML error page is not stored as a ".pdf".
        response.raise_for_status()
        with open(pdf_path, "wb") as f:
            f.write(response.content)
    try:
        with open(pdf_path, "rb") as f:
            pdf_reader = PdfReader(f)
            # `or ""` guards against a page yielding no text; join avoids
            # repeated string concatenation on long papers.
            paper = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except FileNotFoundError:
        # Kept for backward compatibility: callers receive this string rather
        # than an exception if the file vanished between download and read.
        return "Error: PDF file not found."
    return paper

    

def summerize_pdf(item: ChemrxivItem):
    """Ask the LLM for a five-sentence, undergraduate-level summary of the item's paper.

    Args:
        item: The item whose PDF text is obtained through ``get_pdf``.

    Returns:
        Whatever ``send_to_llm`` returns for the summary prompt.
    """
    paper = get_pdf(item)
    # Blank lines separate the instructions from the paper text; previously the
    # paper's last word ran straight into "in the format ...".
    prompt = (
        "Explain the following chemistry paper to an undergraduate student, "
        "write with 5 sentences:\n\n"
        + paper
        + '\n\nin the format of a {response: ""}'
    )
    return send_to_llm(prompt)

# Instruction text sent ahead of the paper when asking for quiz questions.
_QUESTIONS_PROMPT = """
Generate 10 questions based on the paper and the following course content, the questions should be multiple choice questions with 4 answers, the answers should be in the format of a list of strings, the correct answer should be the first answer in the list, it also must be json format, 
do not mention the course content in the response, do not respond with anything else than the json, it must be serializable by json.loads()
response:


{
        "question1": [answer1, answer2, answer3, answer4],
        "question2": [answer1, answer2, answer3, answer4],
        "question3": [answer1, answer2, answer3, answer4],
        "question4": [answer1, answer2, answer3, answer4],
        "question5": [answer1, answer2, answer3, answer4],
        "question6": [answer1, answer2, answer3, answer4],
        "question7": [answer1, answer2, answer3, answer4],
        "question8": [answer1, answer2, answer3, answer4],
        "question9": [answer1, answer2, answer3, answer4],
        "question10": [answer1, answer2, answer3, answer4]
}


### Course Content List for CHEM101: General Chemistry I

1. **Matter and Measurements**  
2. **The Atom**  
3. **Bonding**  
4. **Chemical Formulas and Equations**  
5. **States of Matter**  
6. **Thermochemistry and Thermodynamics**  
7. **Acid-Base and Oxidation-Reduction Reactions**  
8. **Nuclear Chemistry**
"""

# Greedy match from the first "{" to the last "}" so that any text or code
# fence wrapped around the JSON object in the reply is ignored.
_JSON_OBJECT_PATTERN = re.compile(r'\{.*\}', re.DOTALL)


def generate_questions(item: ChemrxivItem):
    """Ask the LLM for multiple-choice questions about the item's paper.

    Args:
        item: The item whose PDF text is obtained through ``get_pdf``.

    Returns:
        The object decoded from the JSON found in the LLM reply, or ``None``
        when there is no reply, no JSON object in it, or the JSON is invalid.
        A short message is printed in each ``None`` case.
    """
    paper = get_pdf(item)
    responce = send_to_llm(_QUESTIONS_PROMPT + paper)

    # send_to_llm returns None when no API key is configured; searching None
    # with the regex would raise TypeError.
    if responce is None:
        print("No response received from the LLM.")
        return None

    match = _JSON_OBJECT_PATTERN.search(responce)
    if match is None:
        print("No JSON found in the response.")
        return None

    try:
        # Return the parsed questions so the caller can use or print them.
        return json.loads(match.group(0))
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None



def send_to_llm(context):
    """Send ``context`` to the Gemini model and return the reply text.

    The API key is read from the ``GEMINI_API_KEY`` environment variable after
    loading ``.env``.

    Args:
        context: The prompt passed as ``contents`` to ``generate_content``.

    Returns:
        The ``text`` attribute of the model response, or ``None`` (after
        printing an error message) when no API key is available.
    """
    load_dotenv(dotenv_path=".env")
    gemini_key = os.getenv("GEMINI_API_KEY")

    # Guard clause: without a key there is nothing to call.
    if not gemini_key:
        print("Error: GEMINI_API_KEY not found in .env file.")
        return None

    llm_client = genai.Client(api_key=gemini_key)
    reply = llm_client.models.generate_content(
        model='gemini-2.0-flash-exp',
        contents=context,
    )
    return reply.text



# Summarize the first item, then generate quiz questions for it.
print(summerize_pdf(items[0]))

# Print the result only when the function actually returns something, so a
# bare `None` never appears in the cell output.
questions = generate_questions(items[0])
if questions is not None:
    print(questions)

{response: "This paper explores using large language models (LLMs) to predict if a new inorganic crystal structure can be made, which is called synthesizability. The researchers trained LLMs using text descriptions of crystal structures and found that they could predict synthesizability as well as, or even better than, previous specialized machine learning methods. They also showed that LLMs could explain *why* a structure is predicted to be synthesizable or not, providing insights into the chemical principles influencing synthesis. Importantly, this approach can help scientists identify ways to modify hypothetical materials, making them more likely to be synthesized. Ultimately, this research demonstrates a novel way to use LLMs to advance materials design by combining predictive accuracy with human-understandable explanations."}



JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [8]:
# Bind the first entry of `items` to a short name for use in later cells.
item = items[0]



{'question1': ['Fine-tuned LLMs can achieve synthesizability predictive performance comparable to bespoke graph neural network models.', 'Fine-tuned LLMs perform worse than bespoke graph neural network models.', 'Fine-tuned LLMs cannot be used for synthesizability predictions.', 'Fine-tuned LLMs require more complex data than graph neural network models.'], 'question2': ['Text embeddings derived from LLMs can enhance synthesizability predictions when used with a PU classifier.', 'Text embeddings are not useful for synthesizability predictions.', 'Text embeddings are less effective than graph-based representations.', 'Text embeddings slow down the model training process.'], 'question3': ['Thermodynamic energy-based predictions often fail to account for materials that are energetically stable but unsynthesized.', 'Thermodynamic energy-based predictions are always accurate.', 'Thermodynamic energy-based predictions are only applicable to metastable materials.', 'Thermodynamic energy-based

In [6]:
# NOTE(review): in the code shown, `responce` is only assigned as a local
# variable inside generate_questions, so this cell depends on a module-level
# `responce` left in the kernel by some other run — confirm where it comes from,
# otherwise this raises NameError after Restart & Run All.
print(responce)


```json
{
    "question1": [
        "Large language models (LLMs) trained on text descriptions of crystal structures can achieve comparable synthesizability prediction performance to bespoke graph neural networks.",
        "LLMs perform significantly worse than bespoke convolutional graph neural network methods for synthesizability prediction.",
         "LLMs can not be trained on human readable text descriptions.",
        "LLMs cannot be used to predict crystal structures."
    ],
    "question2": [
        "Positive-unlabeled learning, using text embeddings of the structure, can further improve synthesizability prediction accuracy.",
         "Positive-unlabeled learning decreases synthesizability prediction accuracy.",
        "Positive-unlabeled learning is not useful for this problem.",
        "Text embeddings of structure do not impact synthesizability prediction accuracy."
    ],
    "question3": [
        "LLMs can generate human-readable explanations for factors governing