In [1]:
import os
# Restrict this process to physical GPUs 4 and 5. This is set before torch is
# imported further down in this cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5"

import json
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
from tqdm import tqdm
from itertools import islice
from torch.utils.data import Dataset, DataLoader

# Show the kernel's PID so the running job can be identified from a shell.
print("Process ID:", os.getpid())

Process ID: 1302


In [62]:
# # Model checkpoint and paths
# CHECKPOINT = "unsloth/Llama-3.3-70B-Instruct-bnb-4bit"
# CACHE_DIR = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/.cache/huggingface/hub/"
# INPUT_FILE = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/commercial_buildings_7980.jsonl"
# OUTPUT_FILE = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/generated_tweets__.jsonl"

# Model checkpoint and paths.
# All file locations live under one storage root; only the suffixes differ.
_STORAGE_ROOT = "/mntssd/mnt3/shanshanbai/my_storage_from_qian"

CHECKPOINT = "unsloth/Llama-3.3-70B-Instruct-bnb-4bit"  # Hugging Face Hub model id
CACHE_DIR = _STORAGE_ROOT + "/.cache/huggingface/hub/"  # passed as cache_dir when loading
INPUT_FILE = _STORAGE_ROOT + "/results/residential_buildings_6558.jsonl"  # one JSON object per line
OUTPUT_FILE = _STORAGE_ROOT + "/results/generated_tweets_residential_4000.jsonl"  # opened in append mode

In [None]:
# Load the tokenizer from the same cache directory as the model weights so
# both artifacts of CHECKPOINT are stored in one place.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, cache_dir=CACHE_DIR)

# NOTE: CHECKPOINT is a pre-quantized bitsandbytes 4-bit export (see the
# "-bnb-4bit" suffix), so the quantization settings are read from the
# checkpoint itself. The former BitsAndBytesConfig(load_in_8bit=True) was
# dropped: it conflicted with the stored 4-bit settings and, as far as the
# library's merge behaviour goes, did not take effect — TODO confirm against
# the installed transformers version.
model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT,
    device_map="balanced",        # spread layers evenly over the visible GPUs
    torch_dtype=torch.float16,    # dtype for the non-quantized modules
    # offload_folder="./offload",  # Optional: folder for CPU offloading if GPUs run out of memory
    cache_dir=CACHE_DIR,
)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [63]:
# Create the text-generation pipeline around the already-loaded model.
#
# `model` is an instantiated object that was placed on the GPUs when it was
# loaded, so loader-only options (`model_kwargs`, `device_map`) are not passed
# here: they only apply when the pipeline loads a model from a name, and the
# previous bfloat16 request contradicted the float16 dtype used at load time.
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,          # Make sampling explicit; the options below only apply when sampling
    repetition_penalty=1.3,  # Discourage repetition
    temperature=1.4,         # Increase randomness
    top_k=100,               # Consider only the 100 most probable tokens
    top_p=0.9,               # Use nucleus sampling
    max_new_tokens=250,      # Upper bound on generated tokens per prompt
    return_full_text=False,  # Return only the generated continuation, not the prompt
)


Device set to use cuda:0


In [64]:
# context_prompt = (
#     "You are tasked with generating tweets as if they were posted by real Twitter users in a specific commercial building.\n"
#     "Diversity within tweets for a single commercial building: Each tweet should reflect a unique experience or perspective.\n"
#     "Diversity across commercial buildings: Tweets for different commercial buildings should not reuse the same templates or expressions from previous buildings.\n"
#     "Minimized Template Overlap: Use a larger pool of unique expressions and ensure non-repetition across tweets and buildings.\n"
#     "User Simulation: Imagine switching personas for every new tweet to simulate multiple Twitter users. Treat each tweet as a unique thought or experience from a different user. You can also use @, # and emojis sometimes.\n"
#     "Imaginative Scenarios: Generate tweets that highlight varied aspects such as architecture, service, food, events, or location.\n"
#     "Use all building tags and all building names of a building as inspiration. Incorporate them naturally without overemphasis or repetition.\n"
#     "Tweet should not start with 'Just', 'Scored','Love','Loving','Shopped','Shopping','Spend','Shout','Exploring'\n"
#     "You must generate only one tweet in each language specified under 'tweet language distribution', written directly in that language.\n\n"
    
#     "Example:\n\n"
#     '{"building id":227579, "building tags": "university", "building names": ["Institute of Psychiatry, Psychology & Neuroscience"], "tweet language distribution": ["English", "Chinese", "Chinese"]}\n\n'
#     '{227579:["Truly inspiring to see the groundbreaking work they\'re doing in mental health research @IPPN # ResearchLife .", "了解到了许多关于心理健康的研究 👏👏", "有钱真好，羡慕这里的学生。@Mengshan"]}'
# )

# System prompt shared by every request. It is built from adjacent string
# literals, so each instruction must end with its own "\n"; otherwise it is
# fused with the following sentence.
# The example tweets at the bottom are few-shot content and are kept verbatim.
context_prompt = (
    "Generate tweets as if they were posted by real Twitter users in a residential building.\n"
    "Diversity within tweets for a single residential building: Ensure that each tweet reflects a unique perspective or experience.\n"
    "Consider varying the tone (e.g., humorous, cynical, formal, casual), the length (short and concise, longer and detailed), and the use of mention, emojis or hashtags.\n"
    "Imaginative Scenarios: Highlight varied aspects of the building, such as its services, location, or activities, etc. Be creative and explore more angles.\n"
    "Personas: Imagine switching personas for each tweet, simulating thoughts from different types of users, such as teenagers, friends, relatives or families.\n"
    "Incorporate building-specific details like building tags implicitly without overemphasis, and do not mention the building name. \n"
    "Tweet should not start with 'Just','Love','Loving','Exciting','Excited','Woke','Still','Feeling','My','Moved','Die Aussicht','Je' in any language.\n"
    "You should only generate one tweet in each language specified under 'tweet language distribution' list provided, and written directly in that language.\n\n"

    "Example:\n\n"
    # The example input is valid JSON, matching what json.dumps produces for the real metadata.
    '{"building id":227579, "building city": "London", "building tags": "apartments", "building names": "Moo", "tweet language distribution": ["English", "German", "Chinese"]}\n\n'
    '{227579: ["Finally moved in my little aprtment in London! 🏡✨ #NewBeginnings", "@Viola Erstaunlich ruhig, trotz der zentralen Lage ", "偶尔冥想，好像时间都慢下来了 🧘"]}'
)

In [66]:
def format_data_prompt(metadata):
    """Build the user message for one building.

    Args:
        metadata: JSON-serializable object describing a building.

    Returns:
        The metadata serialized as JSON (non-ASCII characters kept as-is),
        followed by a short hint describing the expected return structure.
    """
    return_hint = ' returns {building id: list of generated tweets}'
    serialized = json.dumps(metadata, ensure_ascii=False)
    return serialized + return_hint

def process_metadata(metadata):
    """Turn one building's metadata into a two-message chat prompt.

    Args:
        metadata: JSON-serializable object describing a building.

    Returns:
        A list with a system message carrying ``context_prompt`` and a user
        message carrying the output of ``format_data_prompt(metadata)``.
    """
    user_message = {"role": "user", "content": format_data_prompt(metadata)}
    system_message = {"role": "system", "content": context_prompt}
    return [system_message, user_message]

In [67]:
def write_result(outfile, index, result):
    """Append one result record to ``outfile`` as a JSON line.

    The record has the form ``{"index": index, "output": result}`` and is
    flushed right after it is written. Any exception raised while
    serializing or writing is reported with ``print`` and not re-raised.

    Args:
        outfile: Writable text stream providing ``write`` and ``flush``.
        index: Identifier stored under the ``"index"`` key.
        result: Value stored under the ``"output"`` key.
    """
    record = {"index": index, "output": result}
    try:
        json_line = json.dumps(record, ensure_ascii=False)
        outfile.write(json_line + "\n")
        outfile.flush()  # push the line out immediately
    except Exception as e:
        print(f"Error writing result for index {index}: {e}")

In [61]:
# Resume point: 1-based row number of the first input line to process.
# Earlier rows are skipped, and results are appended to OUTPUT_FILE.
START_ROW = 4280

with open(INPUT_FILE, "r", encoding="utf-8") as infile, open(OUTPUT_FILE, "a", encoding="utf-8") as outfile:
    # Drop the first START_ROW - 1 lines, then number the rest from START_ROW
    # so every output record carries its row number in the input file.
    remaining_rows = islice(infile, START_ROW - 1, None)
    progress = tqdm(remaining_rows, desc="Processing buildings")
    for index, line in enumerate(progress, start=START_ROW):
        # Blank lines (e.g. a trailing newline) are not records; skip them
        # instead of logging a JSON decode failure. They still count as rows.
        if not line.strip():
            continue

        try:
            metadata = json.loads(line)
            prompt = process_metadata(metadata)

            # Call the text generation pipeline
            result = text_pipeline(prompt)
            generated_text = result[0]["generated_text"]
        except Exception as e:
            # Record the failure with an explicit "failed" marker so it cannot
            # be mistaken for generated text, then continue with the next row.
            write_result(outfile, index, f"failed: {type(e).__name__}: {e}")
        else:
            # Write the generated result with the index
            write_result(outfile, index, generated_text)

Processing buildings: 645it [22:36:04, 64.47s/it] 