In [1]:
from datasets import load_dataset

In [2]:
import re
from transformers import AutoTokenizer
import torch
# Tokenizer of the 'deepseek-ai/deepseek-math-7b-rl' checkpoint; the cells in this
# section use it to turn solution text into token ids.
tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/deepseek-math-7b-rl')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### math-shepherd

In [3]:
# tokenizer.encode(' ки')[-1] # 14682
def remove_and_track_indices(lst, target=14682):
    """Strip every ``target`` token from ``lst`` and report where each one sat.

    Args:
        lst: sequence of token ids. It is not modified.
        target: token id to remove. The default, 14682, is the id noted in this
            notebook for the last token of ``tokenizer.encode(' ки')``.

    Returns:
        A pair ``(kept, positions)``. ``kept`` is a new list holding the
        non-target tokens in their original order. ``positions`` has one entry
        per removed target that was not the very first element of ``lst``: the
        index, within ``kept``, of the last token kept before that target.
        The entry is -1 when only targets preceded it.
    """
    kept = []
    positions = []
    for pos, token in enumerate(lst):
        if token != target:
            kept.append(token)
        elif pos > 0:
            # The token kept most recently is the one that ended up directly
            # in front of this (removed) target.
            positions.append(len(kept) - 1)
    return kept, positions

def process_texts(example):
    """Turn one record into token ids, step positions and step labels.

    Reads ``example['input']`` (text in which the two-character marker "ки"
    appears) and ``example['label']``. For every marker found in the input, the
    character at the corresponding position of the label is inspected:
    '+' yields 1, '-' yields 0, and any other character yields no entry.

    The input text is then cleaned (``Step <n>: `` prefixes and ``<<...>>``
    spans removed), tokenized, and passed through ``remove_and_track_indices``.

    Adds to ``example`` and returns it:
        input_id: token ids with the marker token removed.
        index:    positions reported by ``remove_and_track_indices``.
        targets:  the 0/1 labels collected from ``example['label']``.

    NOTE(review): ``index`` and ``targets`` are computed independently (tokens
    vs. characters); nothing here checks that they have the same length.
    """
    raw_text, label_text = example['input'], example['label']

    step_labels = []
    for n_seen, hit in enumerate(re.finditer("ки", raw_text)):
        # Every earlier marker takes two characters in the input but only one
        # in the label, so the label position trails by the number seen so far.
        flag = label_text[hit.start() - n_seen]
        if flag == '+':
            step_labels.append(1)
        elif flag == '-':
            step_labels.append(0)

    without_step_prefix = re.sub(r"Step \d+: ", "", raw_text)  # "Step i: "
    cleaned = re.sub(r"<<[^>]*>>", "", without_step_prefix)  # e.g. <<80*2=1600>>
    token_ids, step_positions = remove_and_track_indices(tokenizer.encode(cleaned))

    example['input_id'] = token_ids
    example['index'] = step_positions
    example['targets'] = step_labels
    return example

In [4]:
# Fetch the dataset identified by the repo id "peiyi9979/Math-Shepherd".
dataset = load_dataset("peiyi9979/Math-Shepherd")
# Run process_texts over every example, spread across 4 worker processes.
processed_dataset = dataset.map(process_texts, num_proc=4)

Map (num_proc=4):   0%|          | 0/444655 [00:00<?, ? examples/s]

In [5]:
# Drop the 'input', 'label' and 'task' columns from the processed dataset.
# NOTE(review): the name is rebound to the result, so re-running this cell on its
# own will presumably fail because the columns are already gone -- confirm.
processed_dataset = processed_dataset.remove_columns(['input', 'label', 'task'])

In [8]:
# Persist the processed dataset under ../Data/Math-Shepherd (relative to the
# notebook's working directory).
processed_dataset.save_to_disk('../Data/Math-Shepherd')

Saving the dataset (0/2 shards):   0%|          | 0/444655 [00:00<?, ? examples/s]

In [2]:
# NOTE(review): this repeats the load_dataset call made earlier in this section and
# the result is not used by any later cell shown here -- looks like a leftover from
# an out-of-order run; confirm before relying on it.
dataset = load_dataset("peiyi9979/Math-Shepherd")

#### generation

In [1]:
import os
import pandas as pd
from transformers import AutoTokenizer
# Tokenizer of the 'deepseek-ai/deepseek-math-7b-rl' checkpoint; used in this
# section to encode problems and generated outputs.
tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/deepseek-math-7b-rl')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Walk ../llmOutputs, and for every CSV found build three parallel lists:
#   texts       -- token ids of problem followed by generated output
#   targets     -- 1.0 when the parsed prediction equals the reference answer, else 0.0
#   starts_ends -- (start, end) token positions inside each entry of `texts`
directory = "../llmOutputs"
texts = []
targets = []
starts_ends = []
for root, _, files in os.walk(directory):
    for file in files:
        # Only CSV files are read; everything else is ignored.
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(root, file)
        df = pd.read_csv(file_path)
        print(file_path)

        # Special tokens are added to the problem only, so the concatenation
        # below does not get a second set in the middle of the sequence.
        problems = tokenizer.batch_encode_plus(df.problem.tolist(),add_special_tokens=True,return_attention_mask=False)['input_ids']
        solutions = tokenizer.batch_encode_plus(df.output.tolist(),add_special_tokens=False,return_attention_mask=False)['input_ids']
        texts.extend([p+s for p,s in zip(problems,solutions)])

        # yhat can be an error message (or missing) instead of a number. Anything
        # int() rejects becomes -inf. Only the exceptions int() raises for bad
        # values are caught, so KeyboardInterrupt / SystemExit still propagate
        # (the previous bare `except:` swallowed them).
        yhats = []
        for yhat in df.yhat.tolist():
            try:
                yhat = int(yhat)
            except (TypeError, ValueError, OverflowError):
                yhat = float("-inf")
            yhats.append(yhat)
        targets.extend([(y==yhat)*1.0 for y,yhat in zip(df.y.tolist(),yhats)])

        # Start 10 tokens after the problem ends (some context is needed first);
        # end on the last token of the sequence.
        # NOTE(review): for outputs shorter than 11 tokens start > end -- confirm
        # the consumer of starts_ends handles that.
        starts_ends.extend([(len(p)+10,len(p)+len(s)-1) for p,s in zip(problems,solutions)])

../llmOutputs/model1/generations.csv
../llmOutputs/model8/generations.csv
../llmOutputs/model5/generations.csv
../llmOutputs/model2/generations.csv
../llmOutputs/model4/generations.csv
../llmOutputs/model9/generations.csv
../llmOutputs/model7/generations.csv
../llmOutputs/model6/generations.csv


In [3]:
import pickle

# Write each of the three parallel lists built above to its own pickle file.
for pkl_path, payload in (
    ('../Data/PRM_data/gen_texts.pkl', texts),
    ('../Data/PRM_data/gen_targets.pkl', targets),
    ('../Data/PRM_data/gen_starts_ends.pkl', starts_ends),
):
    with open(pkl_path, 'wb') as file:
        pickle.dump(payload, file)

#### Solution

In [17]:
import json
from transformers import AutoTokenizer
# Tokenizer of the 'deepseek-ai/deepseek-math-7b-rl' checkpoint; used in this
# section to encode questions and reference solutions.
tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/deepseek-math-7b-rl')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# JSON files to read in this section, given relative to the notebook's working
# directory. The loading cell reads a 'question' and an iterable 'solution'
# from every record of each file.
file_paths = ['../Data/OlympiadBench_Dataset/data/TP_MM_maths_en_COMP.json',
                '../Data/OlympiadBench_Dataset/data/OE_MM_maths_en_COMP.json',
                '../Data/OlympiadBench_Dataset/data/OE_TO_maths_en_COMP.json',
                '../Data/OlympiadBench_Dataset/data/TP_TO_maths_en_COMP.json',
                '../Data/AMC/aime_normal.json', 
                '../Data/AMC/amc12_normal.json', 
                '../Data/MATH/outputs_normal.json']

In [34]:
# Build two parallel lists from the JSON files in `file_paths`:
#   texts       -- token ids of question followed by one reference solution
#   starts_ends -- (start, end) token positions inside each entry of `texts`
texts = []
starts_ends = []

for file_path in file_paths:
    # JSON text is UTF-8; pin the encoding so the read does not depend on the
    # platform's locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    problems = []
    solutions = []
    for d in data:
        # A problem can have more than one solution: emit one
        # (problem, solution) pair per solution.
        problem = d['question']
        for sol in d['solution']:
            problems.append(problem)
            solutions.append(sol)

    # Special tokens go on the problem only, so the concatenation below does
    # not get a second set in the middle of the sequence.
    problems = tokenizer.batch_encode_plus(problems,add_special_tokens=True,return_attention_mask=False)['input_ids']
    solutions = tokenizer.batch_encode_plus(solutions,add_special_tokens=False,return_attention_mask=False)['input_ids']
    texts.extend([p+s for p,s in zip(problems,solutions)])

    # Start 10 tokens after the problem ends (some context is needed first);
    # end on the last token of the sequence.
    # NOTE(review): for solutions shorter than 11 tokens start > end -- confirm
    # the consumer of starts_ends handles that.
    starts_ends.extend([(len(p)+10,len(p)+len(s)-1) for p,s in zip(problems,solutions)])

Token indices sequence length is longer than the specified maximum sequence length for this model (4491 > 4096). Running this sequence through the model will result in indexing errors


In [38]:
import pickle

# Write the two parallel lists built above to their own pickle files.
for pkl_path, payload in (
    ('../Data/PRM_data/sol_texts.pkl', texts),
    ('../Data/PRM_data/sol_starts_ends.pkl', starts_ends),
):
    with open(pkl_path, 'wb') as file:
        pickle.dump(payload, file)

#### MMOS

In [14]:
from datasets import load_dataset,load_from_disk
# Fetch the dataset identified by the repo id "cyzhh/MMOS".
ds = load_dataset("cyzhh/MMOS")

In [2]:
from transformers import AutoTokenizer
# Tokenizer of the 'deepseek-ai/deepseek-math-7b-rl' checkpoint; used in this
# section to encode prompts and completions.
tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/deepseek-math-7b-rl')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def process_texts(example):
    """Tokenize one prompt/completion record and mark its span.

    The role tags "<|user|> " and "<|assistant|> " are stripped from
    ``example['prompt']`` before encoding. The prompt is encoded with special
    tokens, the completion without, and the two id lists are concatenated.

    Adds to ``example`` and returns it:
        input_id:    prompt ids followed by completion ids.
        starts_ends: ``(len(prompt ids) + 10, index of the last token)``.
    """
    prompt_text = example['prompt']
    for role_tag in ("<|user|> ", "<|assistant|> "):
        prompt_text = prompt_text.replace(role_tag, "")

    prompt_ids = tokenizer.encode(prompt_text, add_special_tokens=True)
    completion_ids = tokenizer.encode(example['completion'], add_special_tokens=False)

    n_prompt = len(prompt_ids)
    example['input_id'] = prompt_ids + completion_ids
    # Start 10 tokens into the completion; end on the final token.
    example['starts_ends'] = (n_prompt + 10, n_prompt + len(completion_ids) - 1)
    return example

In [4]:
# Run process_texts over every example of ds, spread across 4 worker processes.
processed_dataset = ds.map(process_texts, num_proc=4)

In [7]:
# Drop the 'idx', 'prompt' and 'completion' columns from the processed dataset.
# NOTE(review): the name is rebound to the result, so re-running this cell on its
# own will presumably fail because the columns are already gone -- confirm.
processed_dataset = processed_dataset.remove_columns(['idx','prompt', 'completion'])

In [11]:
# Persist the processed dataset under ../Data/MMOS (relative to the notebook's
# working directory).
processed_dataset.save_to_disk('../Data/MMOS')

Saving the dataset (0/1 shards):   0%|          | 0/134610 [00:00<?, ? examples/s]