In [4]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from openai import OpenAI
import requests
import json
from tqdm import tqdm
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
# NOTE(review): none of the visible cells move the model or any tensor onto `device` -- confirm it is used further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# %pip install datasets torch transformers peft openai requests tqdm numpy sentence-transformers

In [7]:
# Tokenizer comes from the base Qwen checkpoint (left-padded); weights come from the local DPO run.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B", padding_side='left')
model = AutoModelForCausalLM.from_pretrained("./dpo_model")

# IMPORTANT: SET PAD TOKEN
special_tokens = {
    'pad_token': '<|pad|>',
    'bos_token': '<|im_start|>',
    'eos_token': '<|im_end|>',
}
tokenizer.add_special_tokens(special_tokens)

# Mirror the tokenizer's pad/bos/eos ids onto the model config so both agree.
for token_kind in ('pad', 'bos', 'eos'):
    id_attr = f"{token_kind}_token_id"
    setattr(model.config, id_attr, getattr(tokenizer, id_attr))

# Last expression of the cell: switches to eval mode and displays the module tree.
model.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896, padding_idx=151665)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_e

In [8]:
# Load the train and test preference splits from one pinned dataset revision.
# Only the test split is shuffled at load time.
ULTRAFEEDBACK_REPO = "HuggingFaceH4/ultrafeedback_binarized"
ULTRAFEEDBACK_REVISION = "292c16329d921287c4166934cac1a6ad1e13a6c5"

train_ultrafeedback = load_dataset(
    ULTRAFEEDBACK_REPO,
    revision=ULTRAFEEDBACK_REVISION,
    split="train_prefs",
)

test_ultrafeedback = load_dataset(
    ULTRAFEEDBACK_REPO,
    revision=ULTRAFEEDBACK_REVISION,
    split="test_prefs",
)
test_ultrafeedback = test_ultrafeedback.shuffle(seed=42)

In [9]:
def _keep_prompt_and_reply(example):
    """Return the prompt plus the text of the second `chosen` message.

    `chosen` becomes None when the conversation has fewer than two messages.
    Presumably the second message is the assistant reply -- verify against the dataset card.
    """
    messages = example["chosen"]
    reply = messages[1]["content"] if len(messages) > 1 else None
    return {"prompt": example["prompt"], "chosen": reply}


def _draw_sample(dataset, size):
    """Shuffle with seed 42, keep the first `size` rows, and flatten `chosen` to plain text."""
    shuffled = dataset.shuffle(seed=42)
    subset = shuffled.select(range(size))
    return subset.select_columns(["prompt", "chosen"]).map(_keep_prompt_and_reply)


train_sample = _draw_sample(train_ultrafeedback, 10000)

test_sample = _draw_sample(test_ultrafeedback, 500)

Map: 100%|██████████| 10000/10000 [00:00<00:00, 23007.82 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 24105.75 examples/s]


In [10]:
# Spot-check one converted training example (a dict with 'prompt' and 'chosen').
train_sample[1]

{'prompt': 'Summorize the movie Beer from 1985',
 'chosen': '"Beer," also known as "The Selling of America" or "Beer: The Movie," is a 1985 comedy film directed by Patrick Kelly. The story revolves around the advertising world and the marketing of a beer product.\n\nThe plot follows an advertising executive named B.D. Tucker (played by Loretta Swit) who is given the challenge of increasing sales for a failing beer brand called Norbecker. In an attempt to create a groundbreaking advertising campaign, she discovers a group of average, beer-loving guys in a bar, consisting of Merle (William Russ), Frankie (Saul Stein), and Elliot (David Alan Grier). Tucker hires them as the new faces of Norbecker beer, believing their authenticity will appeal to the brand\'s target demographic.\n\nThe campaign becomes an instant hit as viewers connect with the trio\'s genuine love for beer and their blue-collar humor. However, the sudden fame goes to the men\'s heads, and they become difficult to manage. 

In [11]:
# Token count of every training prompt; this drives the curriculum ordering.
train_lengths = []
for prompt in train_sample['prompt']:
    train_lengths.append(len(tokenizer(prompt)['input_ids']))

# Order prompts from shortest to longest and cut that ordering into three buckets.
sorted_indices = np.argsort(train_lengths)
n = len(sorted_indices)
split1, split2 = n // 3, 2 * n // 3

# np.split at [split1, split2] yields [:split1], [split1:split2], [split2:]
indices_split1, indices_split2, indices_split3 = np.split(sorted_indices, [split1, split2])

# Final datasets: short / medium / long prompts
train_short_raw, train_med_raw, train_long_raw = (
    train_sample.select(bucket.tolist())
    for bucket in (indices_split1, indices_split2, indices_split3)
)


In [12]:
# Median prompt length (in tokens) of each curriculum bucket, shortest bucket first.
for raw_split in (train_short_raw, train_med_raw, train_long_raw):
    token_counts = [len(tokenizer(prompt)['input_ids']) for prompt in raw_split['prompt']]
    print(np.median(token_counts))

15.0
74.0
272.0


In [14]:
# Spot-check a prompt from the medium-length curriculum bucket.
# (`chosen` was already flattened to a plain string above, so it is no longer indexed as a message list.)
train_med_raw[1]["prompt"]

'write a roadmap for learning three.js and react-three-fiber and related technology to go from a novice to a professional to get a job doing 3d development'

In [None]:
# qwen_input_prompt = (
#     "Please categorize the following instructions into a few types, and for each type, "
#     "provide a short criteria describing what makes a good response.\n\n"
#     "Instructions:\n")
    
# train_sample_100 = train_sample.shuffle(seed=42).select(range(400))

# for i, prompt in enumerate(train_sample_100, 1):
#     qwen_input_prompt += f"{i}. {prompt['prompt']}\n"

# qwen_input_prompt += "\nFor each category, please write:\n" \
#                       "- Category name\n" \
#                       "- Short criteria (1-3 sentences) describing what makes a good response."

# print(qwen_input_prompt)

Please categorize the following instructions into a few types, and for each type, provide a short criteria describing what makes a good response.

Instructions:
1. Write a 500-word news article with a journalistic style and tone exploring the potential benefits and challenges of using rooftops for solar energy production, including interviews with experts in the field and examples of successful rooftop solar projects. The article should also touch on the implications of this technology for energy independence, greenhouse gas emissions reduction, and cost savings for consumers and businesses. Please ensure that all sources are properly cited and the article is free of bias.
2. Given a sentence, an entity and its sentiment towards the entity, verify if it is the correct sentiment towards the entity. Answer should be yes or no. Note that URLs in the text have been replaced with [Link].

[EX Q]: Verify if the sentiment of the following document towards the entity Anthony Rizzo is Negative 

In [15]:
# Maps each prompt category to a short description of what a good response looks like.
# NOTE: the variable keeps its original spelling ("critera") because other cells look it up by this name.
# The keys must match the category names used for category assignment.
critera_dict = {
    "Content Creation & Writing" :
        "A good response is clear, coherent, and well-structured, addressing the key points outlined in the prompt. It should maintain the requested style (e.g., journalistic, instructional, engaging), meet length and formatting expectations, and ensure accurate, relevant content.",

    "Code & Technical Implementation" :
        "The response should provide clean, executable code that directly addresses the task, with clear comments explaining each step. It should handle any specified edge cases and, where relevant, demonstrate understanding of underlying concepts.",

    # Previously this entry was a verbatim copy of the classification criterion below.
    "Translation & Language Conversion" :
        "The response should accurately translate or convert the input into the requested target language or form, preserving the original meaning, tone, and nuance, and reading naturally and grammatically in the target language.",

    "Sentiment & Classification Tasks" :
        "The response should accurately and succinctly label or classify the input based on context and examples provided, using the correct label or answer format requested.",

    "Summarization & Paraphrasing" :
        "Responses should condense the input into concise, coherent text that captures the main ideas while preserving essential details and ensuring logical flow.",

    "Inference & Reasoning" :
        "Good responses use logical reasoning to justify conclusions, clearly showing how the inference aligns with the provided premise or context.",

    "Short-Form & Factual Q&A" :
        "The answer should be short, direct, and factual, addressing the core of the question and providing useful, precise information.",

    "Creative Ideation & Strategy" :
        "Good responses are original, creative, and relevant to the context. They should align with the goals (e.g., improving engagement, enhancing accessibility) and include actionable, well-reasoned ideas."
}

In [16]:
categories_dict = {
    'Content Creation & Writing' : 
        ['Write a 500-word news article with a journalistic style and tone exploring the potential benefits and challenges of using rooftops for solar energy production, including interviews with experts in the field and examples of successful rooftop solar projects. The article should also touch on the implications of this technology for energy independence, greenhouse gas emissions reduction, and cost savings for consumers and businesses. Please ensure that all sources are properly cited and the article is free of bias.',
        'Write an informative essay about the effects of climate change on the planet.',
        'Using a combination of written text and tables, provide a detailed analysis of the major differences between inherited traits and learned behaviors. In your analysis, include specific examples of each and describe how they impact individual development and evolution. Additionally, discuss the role of epigenetics in shaping inherited traits and the impact of cultural transmission on learned behaviors. Finally, compare and contrast the mechanisms by which these two types of characteristics are passed down through generations and discuss the implications for future research in the fields of genetics and behavioral science.',
        "Rewrite the expression to specify the geographical location with a clear meaning. \n Distinct from regions outside of the North American continent."],
        
    'Code & Technical Implementation' :
        ["Please write a [Python code] that implements a basic generative adversarial network (GAN). In your code, please define the generator and discriminator models as [TensorFlow] functions. Then, use these functions to train the GAN on a dataset of your choice. Finally, generate some sample images using the trained model and save them as [JPEG] files. Can you explain how the GAN works and how the training process improves the generator's ability to create realistic images?",
        'I have the following script:\n\n#Check if the specific text exists in the file\nif grep -q "z-10 fixed bottom-5 inset-x-0 mx-auto max-w-fit rounded-lg px-3 py-2 bg-white border border-gray-100 shadow-md flex justify-between space-x-2 items-center" /var/www/$filename.html; then\n\n#Remove the line with the specific text if it exists\nsudo sed -i \'/z-10 fixed bottom-5 inset-x-0 mx-auto max-w-fit rounded-lg px-3 py-2 bg-white border border-gray-100 shadow-md flex justify-between space-x-2 items-center/d\' /var/www/$filename.html\nfi\n\n--I want to execute this on every file inside /var/www on CentOS, how do I do that?',
        'How can I modify the given C# code to create a chatbot that integrates with external APIs and has more dialog options? For example, can I use Latex to create a more complex chatbot that responds to user input with mathematical equations or graphs? Can you provide a code snippet for how to integrate the chatbot with an external API, such as a weather service or news feed?',
        'how do you add a dynamic number of display elements in pysimplegui depending on the number of items in a list?\nfor example:\n\nfitness\\_classes[(cardio, 3pm, 5pm),(spin. 9am, 11am)]\n\nsg.Text(Cardio)\nsg.Text(Spin)\n\nand if there are 3 tuples in the fitness\\_classes list:\n\nfitness\\_classes[(cardio, 3pm, 5pm),(spin. 9am, 11am),(pilates,4pm,6pm)]\n\nsg.Text(Cardio)\nsg.Text(Spin)\nsg.Text(Pilates)'],
    
    'Translation & Language Conversion' : 
        ["TASK DEFINITION: This task is about translating a given English language sentence to French.\nPROBLEM: And you see that over here.\n\nSOLUTION: Et on voit ça ici.\n\nPROBLEM: If you figure out what up and down really are, it's a complex physical set of skills to be able to throw a ball down and up and everything, but then you add in sideways.\n\nSOLUTION: Si vous comprenez ce que sont vraiment le haut et le bas, il faut un ensemble complexe de compétences physiques pour pouvoir lancer une balle vers le bas ou le haut, mais ensuite vous ajoutez le déplacement latéral.\n\nPROBLEM: So that might well chime with your view that America is a sort of warmongering military machine, out to overpower the world with its huge industrial-military complex.\n\nSOLUTION:",   
        'Definition: You are given a sentence in Italian. Your job is to translate the Italian sentence into English.\nInput: Ma avevo saputo di un membro del Congresso che aveva un punto di vista decisamente diverso.\nOutput:',
        "Definition: Given a sentence in French, provide an equivalent paraphrased version from the original that retains the same meaning.\nInput: La recherche d'une clé binaire dans un arbre de recherche spécifique peut être programmée de manière récursive ou itérative.\nOutput:",
        'Teacher: You are given a sentence in Hebrew. Your job is to translate the Hebrew sentence into Japanese.\nTeacher: Now, understand the problem? If you are still confused, see the following example:\nמה מייחד את המוח האנושי?\nSolution: 人の脳の何がそんなに特別なのでしょうか ？\nReason: The Hebrew sentence is correctly translated into Japanese. They both refer to the human brain.\n\nNow, solve this instance: כאשר נתקלים בעיכובים או סיבוכים קשים במיוחד, מתגלה התנהגות מעניינת במיוחד: קישוט.\nStudent:'],
        
    "Sentiment & Classification Tasks" :
        ['Classify the emotion conveyed in the following sentence as happy, sad, or neutral.\nI had a lovely day.',
        '"mention mysterious , sensual , emotionally intense , and replete "\nHow would the sentiment of this sentence be perceived --\n\nPossible answers: (i) negative; (ii) positive;\nAnswer:',
        "Teacher: In this task, you will be given a sentence about a person. You should determine how the sentence affects how the person is perceived by most people in society. Your choices are: \nPositive: The social perception of [PERSON] in the sentence is considered predominantly positive.\nNegative: The social perception of [PERSON] in the sentence is considered predominantly negative.\nNo impact: There is no clear impact of social perception of [PERSON] associated with the sentence.\nTeacher: Now, understand the problem? If you are still confused, see the following example:\n[PERSON] behaved like a professional in the job interview.\nSolution: Positive\nReason: Behaving professionaly in a job interview has a positive impact on people's perception of a person.\n\nNow, solve this instance: [PERSON] started working as a waitress at a restaurant, so she didn't have an excuse to dress for a job interview.\nStudent:",
        'Detailed Instructions: In this task, you must classify if a given review is positive/negative, indicating your answer as P or N.\nProblem:synopsis : a small town thug convinces a dimwitted high - school student to spy on his girlfriend , who is babysitting three unruly kids for their alcoholic parents on a friday night .\nwhile this is going on , the thug , the father , the boyfriend , and a kid reaching the onset of puberty all have fantasies about the babysitter .\ncomments : to say this movie is bad would be quite an understatement .\ni \'m not sure what writer and director ferland was hoping to accomplish with the babysitter .\nsuspense ?\nfamily drama ?\nhumor ?\nerotica ?\nelements of each of these genres exist in the film , yet\nit fails to successfully achieve any of them .\ninstead , the babysitter is a dull , lifeless movie which manages to bore and ultimately irritate its viewers .\ni suppose this film could have been watchable if it were n\'t for the fact that the characters are universally unappealing\n.\nalicia silverstone is completely wasted playing jennifer , the babysitter .\nher character has absolutely no depth at all , and her sole purpose in the movie is to be the object of the other characters \' fantasies .\neveryone else in the film seems to be in a competition to see who can stoop to the lowest level by the time the film ends .\nthe parents are alcoholics who become increasingly obnoxious as the movie proceeds .\nthe father ( played by j .\nt .\nwalsh ) fantasizes about the babysitter ; the mother fantasizes about her husband \'s best friend .\nnone of these fantasy sequences , trust me , are things that need to be seen , but we see them anyway , complete with cheesy , make - out saxophone music .\nthe thug , in the meantime , proves that he \'s evil through his annoying habit of smashing half - empty beer bottles all of a sudden and for no apparent reason .\nthe most absurd character , however , is the babysitter \'s boyfriend 
who seems catatonically brain - dead .\nthe thug , in a manipulative , iago - like manner ( though he does n\'t really need to try hard ) , manages to talk the boyfriend into binge - drinking , smoking grass , running away from cops , and playing peeping tom on his own girlfriend in a matter of minutes .\nincredible !\n( of course , the boyfriend \'s original plan for the evening was , try not to laugh , to sit in an empty diner and read catcher in the rye by j . d . salinger . ) if the goal of the babysitter was to be suspenseful , then it fails .\nthere are surprisingly few tense moments in this film , and nearly all of them come at the final minute .\nby that time , however , the audience is so tired of the inane characters that no one truthfully cares what happens to any of them .\nno suspense occurs in the dream sequences either because every single one of them is obviously a dream sequence from its inception .\nif the goal of the babysitter was to be humorous , then it also fails .\ni found nothing even remotely funny about the boozing parents who seemed , at times , to be played for laughs .\nif the goal of the babysitter was to be dramatic , then , once again , it fails .\nthe characters are one - dimensional and uninteresting .\nfinally , if the goal of the babysitter was to be titillating ( the type of film destined to be played ad infinitum on hbo at 2 in the morning ) , then\nit fails as well .\nthe dream sequences are n\'t erotic ; they are too brief and , outside of one very short scene , contain no nudity .\ni ca n\'t completely trash this movie .\nthe first 10 minutes or so vaguely resemble an interesting film , and the conclusion sports a halfway decent fistfight .\nthe other 79 minutes , though , are a drag .\nsilverstone \'s character , at the end of the movie , turns to her boyfriend and asks " what were you thinking ?\n"\ni asked myself the same question , having spent 99 cents renting this turkey .\nSolution:'],
        
    "Summarization & Paraphrasing" :
        ['A.\tTo provide medical and educational aid to the poor people, especially women, children and aged. B.\tTo universalize literacy through formal and informal education. C.\tTo promote employment and income generation activates by the different means of small scale industries handloom, handicrafts and petty trading D. To create awareness among the people on organizational development on legal issues by promoting grass root level based groups of people in the operational area. E.\tTo ensure development of infrastructure such as road, electricity, water, communication etc. within the operational area. F.\tTo encourage the youth to handle with the changing norms and values of society and to cope with these changes and there by create a nation of healthy youth in all means. Can you summarize the goals of the organization described in the text material?' ,
        '1. Specimens may be borrowed from the Milwaukee Public Museum Geology Collections for purposes of research. All loans must be authorized and documented by a formal loan agreement or contract.\n2. Requests for loans must be made in writing to the Geology Collections Manager by a qualified staff member of a recognized educational facility. Loans are not issued to students but may be assigned to their major professor if that person can vouch for the professionalism of the student and ensure compliance with the loan agreement. Loan requests should describe the nature and scope of work contemplated, the worker’s name, and all pertinent information on requested specimens. Loans are made to institutions and not to individuals. The borrowing facility assumes responsibility for the safekeeping and return of the borrowed materials.\n4. Specimens on loan should be kept under conditions and handled carefully to avoid damage.\n5. Each specimen should be annotated or affirmed using annotation slips and permanent ink or type and included with specimen. Annotations should not be made on original labels and should include date and name of recorder.\n6. Unless otherwise arranged, all specimens in one loan must be returned together carefully packaged to avoid damage in transit. The borrower is expected to pay return shipping costs, to adequately insure the material, and to attach all necessary travel certificates, documents, or permits.\n7. Loans from the Milwaukee Public Museum Geology Collections should be acknowledged in published works. The Geology Collections Manager should receive a reprint of each publication and digital copies of any photography of loan material.\nCan you summarize the requirements for borrowing specimens from the Milwaukee Public Museum Geology Collections?',
        'Summarize the text input in two sentences to the best of your ability\nJohn had been looking for a job for a few months now. He had applied to a few places but none of them seemed to be suitable for him. He had a few interviews but was turned down for each of them.',
        "We have everything! National cuisines from all over the world, as well as unique combinations of different cuisines. You will be able to enjoy the best Persian delicacies alongside supreme Russian food, juicy Balkan treats and the best dishes of African cuisine. This rich, authentic tapestry of tastes that cannot be found anywhere else in the world makes Israel the perfect location for the perfect cooking vacation.\nIsrael is indeed a small country. You can cross it in one week of easy traveling, have a taste of the delicious boutique wines of the north and breathe the fresh mountain air of Jerusalem on the same day. Instead of spending the whole week in one location you will be able to travel through many magical places while enjoying the amazing variety of exotic cuisines the land of Israel has to offer.\nThe weather in Israel is lovely most of the year. When most of Europe and the US are snowed in, Israel is warm and sunny. This is why Israel is a preferred tourist destination year-round and even more so for Cooking Vacations.\nAs is well known, Israel is one of the leading agricultural powers in the world. This is also manifested in the amazing variety of fresh fruits and vegetables for the reasonable prices of agricultural produce. Local restaurants offer a wonderful selection of vegetable dishes, salads and casseroles that you have probably never tasted before.\nFor those of you who are partial for Kosher food, you'll be happy to find dozens of excellent quality Kosher restaurants, offering the best Kosher cuisine that will allow you to enjoy the Cooking Vacations to their fullest.\nFor such a small country, Israel has an astoundingly varied topography. 
The sea brings in through the ports a selection of fresh fish and seafood and the mountains and meadows are great for raising goats and lambs to produce the wonderful selection of local cheeses, as well as vineyards for producing the excellent local wine.\nBut perhaps the best reason for Cooking Vacations in Israel is the people. Israelis take great pride in their food and are very enthusiastic to share the secrets of their cuisine. Israeli people love to tell stories about their family recipes and special family delicacies. They are an easy-going, informal, hospitable people and their warm and open attitude towards gourmet food lovers makes Israel a real paradise for culinary travelers. This is the place to ask, to learn, to experiment, to snoop around, to stick your fingers in the pot and to have a taste of one of the most fascinating cuisines in the world.\nAll in all, cooking vacations in Israel present a unique and wonderful combination of authentic local foods, boutique wines, exotic landscape, wonderful weather and the most warm and easy going people who would be happy to extend their hospitality and to teach you the secrets of the local cuisine. What else do you need for perfect Cooking Vacations?\nCan you summarize the main reasons why Israel is a great location for cooking vacations?"],
    
    "Inference & Reasoning" : 
        ['Q: Premise: "Two military workers help to clean up rubble."\nBased on this premise, can we conclude that the hypothesis "The army privates are particpating in base clean-up day." is true?\nOptions:\n- yes\n- it is not possible to tell\n- no\nA: Military workers helping to clean up rubble are not necessarily particpating in base clean-up day and are not necessarily army privates.\nThe answer is it is not possible to tell.\n\nQUESTION: Given the sentence "People in the military are outside at night." is it true that "Some military personnel are smoking outside at night."?\n\nLet\'s solve it slowly: Just because personnel are outside does not mean that they are smoking.\nThe answer is it is not possible to tell.\n\n[QUESTION] If "A child in a striped dress and a woman in a black dress." does that mean that "A mother and child model dresses."?\nOptions:\n- yes\n- it is not possible to tell\n- no\nA mother and child wear dresses for reasons other than to model them.\nThe answer is it is not possible to tell.\n\nQ: Premise: "An older man in a white short-sleeve shirt admiring a bush."\nHypothesis: "A man admires a bush."\nDo we know that the hypothesis entailed by the premise?\nA: If the man is admiring a bush then he admire the bush.\nThe answer is yes.\n\nQUESTION: Can we conclude from "A man sits on a sidewalk while leaning up against a building." that "The man is eating a sandwich while sitting on the sidewalk."?\nOptions:\n- yes\n- no\n- it is not possible to tell\n\nLet\'s solve it slowly: A man sitting on a sidewalk while leaning up against a building doesn\'t indicate he is eating a sandwich.\nThe answer is it is not possible to tell.\n\nQUESTION: Premise: "Three new york jets quarterbacks are practicing dropping back for a pass."\nHypothesis: "Three soccer players run sprints down the field."\nDo we know that the hypothesis entailed by the premise?\n\nLet\'s solve it slowly:',
        'In this task, you will be presented with a premise and a hypothesis sentence. Determine whether the hypothesis sentence entails (implies), contradicts (opposes), or is neutral with respect to the given premise sentence. Please answer with "Contradiction", "Neutral", or "Entailment".\n\n[EX Q]: Premise: Davenport missed Wednesday\'s practice due to an undisclosed injury but was seen working on the sidelines Thursday, Josh Katzenstein of The New Orleans Times-Picayune reports. It\'s unclear exactly what is bothering Davenport, but he was at least able to do some individual work on the side Thursday. The rookie defensive end has reportedly been going through some growing pains in his first training camp, and Davenport will need to get healthy and back on the field as soon as possible in order to prove his worth as the 14th overall pick. <sep> Hypothesis: Josh Katzenstein is a reporter\n[EX A]: Entailment\n\n[EX Q]: Premise: What is certain, however, is that around 120,000 years ago, at least, the modern Homo sapiens first appeared and lived contemporaneously with the Neanderthal for at least 90,000 years, until the Neanderthal disappeared 30,000 years ago. <sep> Hypothesis: Homo sapiens lived at the same time as neanderthals for more than 30,000 years \n[EX A]: Entailment\n\n[EX Q]: Premise: "Child labor" is, generally speaking, work for children that harms them or exploits them in some way (physically, mentally, morally, or by blocking access to education). <sep> Hypothesis: Child labor is preferable to going to school\n[EX A]:',    
        "Let's play a game. You are Gary Vaynerchuk and you are working on an extremely important new social media account. You are explaining to me, your social media manager assigned to this account, how you want me to set up the social media monthly report document. Please tell me what you want to see in the report, and how I should format it. The focus for the account is rapid growth on Instagram and TikTok so focus on the metrics that will best show ROI with those goals in mind.",
        "Keri has 7 books that she plans on printing in her new library. 3 of her books are fiction, and the other 4 are nonfiction. In her library, the fiction books take up 25% of the shelf space, and the nonfiction books take up 80% of the shelf space. How much shelf space will she have for the fiction books?"],
    
    "Short-Form & Factual Q&A" : 
        ['why do texans say "remember the alamo"',
        "what do you know about high-end and luxury industries management ?",
        "Why is Mars a tourism country?",
        "How many aliens are there on the earth?"],
          
    "Creative Ideation & Strategy" : 
        ['What innovative ideas can you come up with to enhance the digital menu experience for a vegan sushi restaurant that combines molecular gastronomy with traditional Japanese cuisine? Please include a new constraint that ensures the menu is also accessible to visually-impaired users. Describe the interface and user interactions, and detail how you would use animation and interactive elements to highlight the fusion of flavors and techniques.',
        'What strategies and techniques should be employed to effectively create, implement and manage successful Bing advertising campaigns? What are the best practices and pitfalls to avoid? How can one optimize and track the performance of their Bing campaigns? What tools and resources are available for enhancing the visibility, targeting and conversion rates of Bing ads?',
        'What would you say to motivate a friend who is about to face a difficult challenge?',
        'I own a creative studio and I would like to hire a full-time developer to work on projects to improve our pipeline. What are some effective ways to find someone great?'],
}

In [21]:
sentence_bert = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')  # you can choose another SBERT model

# Category centroids depend only on the encoder and the seed examples, so they are
# memoized here instead of being re-encoded for every single prompt.
_category_centroid_cache = {}


def _category_mean_embeddings(sentence_bert, categories_dict):
    """Return ``{category: mean embedding of its examples}``, computed once per (encoder, examples) pair."""
    # Key on the examples' content (not the dict's identity) so editing the dict invalidates the cache.
    cache_key = (
        id(sentence_bert),
        tuple((category, tuple(examples)) for category, examples in categories_dict.items()),
    )
    cached = _category_centroid_cache.get(cache_key)
    # The encoder is stored next to its centroids, which keeps it alive so its id() cannot be
    # recycled by a different model; the identity check is a second guard.
    if cached is not None and cached[0] is sentence_bert:
        return cached[1]

    centroids = {
        category: torch.mean(sentence_bert.encode(examples, convert_to_tensor=True), dim=0)
        for category, examples in categories_dict.items()
    }
    _category_centroid_cache[cache_key] = (sentence_bert, centroids)
    return centroids


def assign_category(prompt, sentence_bert=sentence_bert, categories_dict=categories_dict):
    """Pick the category whose example prompts are most similar to `prompt`.

    Each category is represented by the mean of its examples' sentence embeddings;
    the prompt is assigned to the category with the highest cosine similarity.

    Args:
        prompt: Text to categorize (passed straight to `sentence_bert.encode`).
        sentence_bert: Encoder exposing `encode(..., convert_to_tensor=True)`.
        categories_dict: Mapping of category name -> list of example prompts.

    Returns:
        The name of the best-matching category. On ties the category that comes
        first in `categories_dict` wins.

    Raises:
        ValueError: If `categories_dict` is empty.
    """
    # mean-pooled embeddings for each category (cached across calls)
    category_mean_embeds = _category_mean_embeddings(sentence_bert, categories_dict)

    prompt_embed = sentence_bert.encode(prompt, convert_to_tensor=True)

    # cosine similarity between the current prompt and each category's mean embedding
    similarities = {
        category: util.cos_sim(prompt_embed, mean_embedding).item()
        for category, mean_embedding in category_mean_embeds.items()
    }
    return max(similarities, key=similarities.get)


In [17]:
def create_citing_dataset(training_set_raw):
    """Turn raw prompt/response rows into criteria-annotated training records.

    Args:
        training_set_raw: Iterable of rows with "prompt" and "chosen" entries.

    Returns:
        A list of dicts, one per row, with keys:
            "x": the prompt,
            "y": the chosen response,
            "c": the quality criterion of the category assigned to the prompt.
    """
    def build_record(row):
        prompt_text = row["prompt"]
        chosen_text = row["chosen"]
        criterion = critera_dict[assign_category(prompt_text)]
        return {"x" : prompt_text, "y" : chosen_text, "c" : criterion}

    return [build_record(row) for row in training_set_raw]

In [None]:
def _build_and_save(raw_split, output_path):
    """Convert one raw split with `create_citing_dataset` and write it to `output_path` as JSON.

    Returns the converted records so they stay available to later cells.
    """
    records = create_citing_dataset(raw_split)
    with open(output_path, "w") as f:
        json.dump(records, f, indent=4)
    return records


# Same conversion and on-disk format for every split; only the source and file name differ.
train_short = _build_and_save(train_short_raw, "train_short.json")
train_med = _build_and_save(train_med_raw, "train_med.json")
train_long = _build_and_save(train_long_raw, "train_long.json")
test = _build_and_save(test_sample, "test.json")

In [None]:
# Median prompt length (in tokens) of the converted long-prompt split.
long_prompt_token_counts = [len(tokenizer(record['x'])['input_ids']) for record in train_long]
print(np.median(long_prompt_token_counts))

283.5
