In [1]:
import pandas as pd
import numpy as np
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoModel, AutoTokenizer  
from huggingface_hub import login
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
import json
from unimol_tools import UniMolRepr
# Bare expression: it only displays the selected device as the cell output.
# The result is not bound to a name, so later cells rebuild it themselves.
torch.device('cuda' if torch.cuda.is_available() else 'cpu')

2025-01-30 16:47:23 | unimol_tools\weights\weighthub.py | 17 | INFO | Uni-Mol Tools | Weights will be downloaded to default directory: c:\Users\tianren\Anaconda3\lib\site-packages\unimol_tools\weights


device(type='cuda')

In [None]:
import os
from getpass import getpass

# Never keep an access token in the notebook source: it ends up in version
# control and in shared copies. Read it from the environment and fall back to
# a blocking prompt so the cell still works interactively.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)
model_id = "meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# AutoModel is used because the embedding step reads `last_hidden_state`
# from the model output. device_map="auto" delegates weight placement to the
# loader (the recorded output shows part of the weights offloaded to the CPU).
model = AutoModel.from_pretrained(model_id, device_map="auto")



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [None]:
# Load the raw polymer descriptions. The encoding is pinned to UTF-8 because
# open() otherwise uses the platform's locale encoding, which can fail or
# garble non-ASCII characters in the JSON (notably on Windows).
with open("polymer_descriptions.json", 'r', encoding="utf-8") as file:
    polymer_language = json.load(file)

In [4]:
def get_Llama_embedding(tokenizer,model, text):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            that returns a mapping of tensors.
        model: Callable invoked as ``model(**inputs)`` whose result exposes
            ``last_hidden_state``.
        text: The text to embed.

    Returns:
        torch.Tensor: ``last_hidden_state`` averaged over dimension 1 and then
        squeezed. The tensor stays on the device the model produced it on.
    """
    # Send the inputs to the device the model reports, not to "CUDA if it
    # exists": a CPU-resident model on a CUDA machine would otherwise fail with
    # a device mismatch. A "meta" device (offloaded weights) is not a real
    # target, so in that case, or when the model exposes no device, keep the
    # original CUDA-if-available choice.
    device = getattr(model, "device", None)
    if device is None or torch.device(device).type == "meta":
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    inputs = tokenizer(text, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Pooling for sentence embeddings
    return embeddings.squeeze()
    

In [5]:
def transform_polymer_json(input_data):
    """Re-index the polymer records in ``input_data`` by their ``id``.

    Args:
        input_data: Mapping with a ``"polymers"`` entry holding an iterable of
            records; each record must provide ``"id"``, ``"name"``,
            ``"description"`` and ``"metadata"``.

    Returns:
        dict: ``{polymer_id: {"name": ..., "description": ..., "metadata": ...}}``.
        The ``id`` is not repeated inside the value because it is now the key;
        any other fields on a record are dropped.

    Raises:
        KeyError: If ``"polymers"`` or one of the required record fields is
            missing.
    """
    kept_fields = ("name", "description", "metadata")
    return {
        record["id"]: {field: record[field] for field in kept_fields}
        for record in input_data["polymers"]
    }

# Re-key the loaded descriptions by polymer id; `polymer_language` must already
# have been loaded by the earlier JSON-loading cell.
transformed_polymer_language = transform_polymer_json(polymer_language)

In [6]:
from tqdm import tqdm
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Maps polymer id -> embedding of that polymer's description.
polymer_language_embedding_text = {}
# Length of the zero placeholder used for empty descriptions.
# NOTE(review): expected to equal the model's hidden size; confirm against
# model.config.hidden_size.
GPT_DIM = 4096

# Get total count for progress bar
total_items = len(transformed_polymer_language)
processed = 0

# The context manager closes the progress bar even if an embedding call raises.
with tqdm(total=total_items,
          desc="Processing polymers",
          bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]') as pbar:
    # Sorted by polymer id so the processing order is deterministic.
    for key, text in sorted(transformed_polymer_language.items()):
        if key not in polymer_language_embedding_text:
            processed += 1
            pbar.set_description(f"Processing polymer {key} ({processed}/{total_items})")

            description = text['description']
            # The emptiness check must look at the description text. Each
            # entry is a dict, so comparing the entry itself to '' was always
            # False and the zero placeholder was never used.
            if not description:
                # NOTE(review): this placeholder is a NumPy array, while
                # get_Llama_embedding returns a torch tensor; downstream
                # consumers must handle both types.
                polymer_language_embedding_text[key] = np.zeros(GPT_DIM)
            else:
                polymer_language_embedding_text[key] = get_Llama_embedding(tokenizer, model, description)

            pbar.update(1)

Processing polymer P522021 (4379/4379): 100%|██████████| 4379/4379 [4:22:15<00:00,  3.59s/it]  


In [8]:
import pickle

# Move any tensors to the CPU before pickling. A pickled CUDA tensor can only
# be unpickled on a machine with CUDA available, which would make the saved
# file unusable on CPU-only hosts. Non-tensor values are written unchanged.
portable_embeddings = {
    key: value.detach().cpu() if isinstance(value, torch.Tensor) else value
    for key, value in polymer_language_embedding_text.items()
}

with open("Llama_polymer_embeddings_v3_smile.pickle", "wb") as fp:
    pickle.dump(portable_embeddings, fp)