In [1]:
# credits:
# https://www.kaggle.com/code/olyatsimboy/aimo-openmath-mistral-baseline
# https://www.kaggle.com/code/aatiffraz/prompt-prediction-w-mixtral-mistral7b-gemma-llama
# https://www.kaggle.com/code/thedrcat/aimo-mixtral-baseline

# Zero-shot MMOS-DeepSeekMath-7B with self-consistency and generated code reasoning evaluation

Self-consistency replaces standard greedy decoding in reasoning pipelines: instead of generating a single answer, we sample several diverse answers and then aggregate them, e.g., by taking the most common answer ([SC-CoT paper](https://arxiv.org/pdf/2203.11171.pdf)).

In this kernel, we use the RL-tuned MMOS-DeepSeekMath-7B backbone (the code below loads the `deepseek-ai/deepseek-math-7b-rl` checkpoint). In my experiments, this model produces more consistent code reasoning, and executing the generated code block lets us reduce arithmetic hallucinations.

In [2]:
import torch
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    AutoConfig,
    set_seed
)

# Seed the random number generators through transformers' `set_seed` helper
# before anything is loaded or sampled.
set_seed(42)

# Checkpoint identifier used for the config, the tokenizer and the model below.
MODEL_PATH = "deepseek-ai/deepseek-math-7b-rl"

# 4-bit NF4 quantization settings (double quantization, bfloat16 compute).
# NOTE: this object is currently unused -- the `quantization_config=` argument
# of the `from_pretrained` call below is commented out.
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

config = AutoConfig.from_pretrained(MODEL_PATH)
# NOTE(review): gradient checkpointing is normally a training-time memory
# optimisation, and this notebook only runs generation -- confirm it is needed.
config.gradient_checkpointing = True


tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the causal LM with automatic device placement and automatic dtype
# selection (the next cell's output reports the resulting dtype as bfloat16).
# NOTE(review): `trust_remote_code=True` permits running code shipped with the
# checkpoint -- confirm this checkpoint actually requires it.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
#     quantization_config=quantization_config,
    config=config
)

config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors.index.json:   0%|          | 0.00/23.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/5.23G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

In [3]:
model.dtype

torch.bfloat16

In [4]:
import pandas as pd
from tqdm import tqdm
# Start from the assumption that this is a run on the real test set.
# The flag is read later to choose the number of sampled generations per
# problem and to decide whether to score predictions against train.csv.
PRIVATE = True

# Path is relative to the notebook's working directory.
df = pd.read_csv('../Data/ai-mathematical-olympiad-prize/test.csv')
df.head()

Unnamed: 0,id,problem
0,000aaa,What is $1-1$?
1,111bbb,What is $0\times10$?
2,222ccc,Solve $4+x=4$ for $x$.


In [5]:
# A test file with fewer than 5 rows is treated as not being the real test set
# (presumably a placeholder -- confirm): switch to train.csv, which also carries
# an `answer` column, and mark the run as non-private.
if len(df) < 5:
    df = pd.read_csv('../Data/ai-mathematical-olympiad-prize/train.csv')
    PRIVATE = False
df.head()

Unnamed: 0,id,problem,answer
0,208387,"The points $\left(x, y\right)$ satisfying $((\...",320
1,2cda49,For how many positive integers $m$ does the eq...,199
2,68704f,There exists a unique increasing geometric seq...,211
3,7543ec,A function $f: \mathbb N \to \mathbb N$ satisf...,199
4,7b58de,Let the `sparkle' operation on positive intege...,702


In [6]:
import gc
device = 'cuda'

In [7]:
def naive_parse(answer):
    """Return the last contiguous run of ASCII digits found in ``answer``.

    Anything after the final digit is ignored, and the run stops at the first
    non-digit character encountered when walking backwards from that digit.
    An empty string is returned when ``answer`` contains no digit at all.
    """
    digits = '0123456789'
    chars = list(answer)

    # Skip the non-digit tail to find where the last run of digits ends.
    stop = len(chars)
    while stop > 0 and chars[stop - 1] not in digits:
        stop -= 1

    # Walk further left for as long as we stay inside that run.
    begin = stop
    while begin > 0 and chars[begin - 1] in digits:
        begin -= 1

    return ''.join(chars[begin:stop])

In [8]:
import transformers

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# The resulting object is stored under the name `pipeline` and is what the
# generation loop later calls.
# NOTE(review): `model` is an instantiated object here, so `torch_dtype` and
# `device_map` may have no effect -- confirm against the transformers docs.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype='auto',
    device_map="auto",
)

In [9]:
print(f"Transformers Version: {transformers.__version__}")

Transformers Version: 4.39.3


In [None]:
# Scratch version of the code-execution step that `process_output` (defined in
# a later cell) wraps in a function.
# NOTE: this cell has no execution count and cannot run on a fresh kernel as
# written -- `output`, `sys` and `subprocess` are not defined or imported in
# any cell above it.
#
# What this code requires from the LLM output:
# 1. The Python script is delimited by ``` fences: the second chunk ([1]) of the
#    split is taken as the code and its first 7 characters are discarded
#    (presumably the "python\n" language tag -- confirm).
# 2. The script prints to stdout the answer and only the answer, or an
#    expression that `eval` can turn into the answer.


code = output.split('```')[1][7:]

with open('code.py', 'w') as fout:
    fout.write(code)

# Unix `timeout` command limits the execution time of the script to 7 seconds.
batcmd = 'timeout 7 ' + sys.executable + ' code.py'
try:
    # Run the command built above through the shell (shell=True).
    # subprocess.check_output returns the script's stdout as bytes, decoded
    # here as UTF-8; code.py must print its result for it to be captured.
    shell_output = subprocess.check_output(batcmd, shell=True).decode('utf8')
    print(shell_output)
    # NOTE(review): `eval` is applied to text produced by model-generated code.
    # The value is rounded and reduced modulo 1000.
    code_output = round(float(eval(shell_output))) % 1000
except:
    # Any failure (non-zero exit status, timeout, unparsable output) is mapped
    # to the sentinel -1. NOTE(review): a bare `except:` also catches
    # KeyboardInterrupt.
    code_output = -1

In [10]:
import torch

# Turn off PyTorch's memory-efficient scaled-dot-product-attention backend.
# NOTE(review): the reason is not recorded in this notebook -- presumably a
# workaround for an attention-kernel issue on this setup; confirm.
torch.backends.cuda.enable_mem_efficient_sdp(False)

In [11]:
import re
import sys
import subprocess


def process_output(output):
    """Extract two candidate answers from a single model generation.

    Two independent strategies are applied to ``output``:

    1. *Code answer* -- the first ``` fenced block is written to ``code.py``,
       executed with the current interpreter under a 7 second ``timeout``, and
       whatever it prints is evaluated as the answer.
    2. *Text answer* -- the content of the last ``\\boxed{...}`` in the text is
       evaluated; when there is no ``\\boxed{...}`` the last run of digits in
       the text (``naive_parse``) is used instead.

    Parameters
    ----------
    output : str
        Raw text generated by the model.

    Returns
    -------
    tuple
        ``(result_output, code_output)``: the text answer followed by the code
        answer. Each is an int in ``[0, 999]`` (value rounded, then taken
        modulo 1000), or ``-1`` when that strategy failed.

    Notes
    -----
    Ordinary exceptions are caught, reported on stdout and mapped to ``-1``.
    ``KeyboardInterrupt`` / ``SystemExit`` are deliberately *not* swallowed so
    a long-running generation loop can still be interrupted.

    ``code.py`` is (over)written in the current working directory.
    """
    # ---- 1. Answer produced by executing the generated program -------------
    try:
        # The chunk after the first ``` fence holds the code block.
        block = output.split('```')[1]

        # Drop the language tag on the opening fence line (e.g. "python").
        # A fixed-width slice would eat real code when the tag is absent or
        # different, so only strip the first line when it is empty or a bare
        # identifier-like tag.
        first_line, newline, rest = block.partition('\n')
        tag = first_line.strip()
        code = rest if newline and (not tag or tag.isalnum()) else block

        with open('code.py', 'w') as fout:
            fout.write(code)

        # Argument list instead of a shell string: no shell is involved, so an
        # interpreter path containing spaces or shell metacharacters is safe.
        batcmd = ['timeout', '7', sys.executable, 'code.py']
        try:
            shell_output = subprocess.check_output(batcmd).decode('utf8')
            print(shell_output)
            # SECURITY NOTE: `eval` is applied to text printed by
            # model-generated code, i.e. untrusted input. The script itself has
            # already been executed above, so this must only ever run inside a
            # sandboxed / disposable environment.
            code_output = round(float(eval(shell_output))) % 1000
        except Exception:
            # Non-zero exit status, timeout, empty or unparsable stdout.
            code_output = -1

        print('CODE RESULTS', code_output)

    except Exception as e:
        # Typically: no ``` fence in the output (IndexError) or file I/O error.
        print(e)
        print('ERROR PARSING')
        code_output = -1

    # ---- 2. Answer stated in the text ---------------------------------------
    try:
        # Non-greedy match so that several \boxed{...} on the same line are
        # returned separately instead of being merged into one bogus capture.
        boxed = re.findall(r'\\boxed\{(.*?)\}', output)

        print('BOXED', boxed)
        if boxed:
            # The last boxed expression is taken as the final answer.
            result_output = boxed[-1]
        else:
            result_output = naive_parse(output)

        print('BOXED', result_output)
        if not len(result_output):
            result_output = -1
        else:
            # SECURITY NOTE: `eval` on model-generated text (see above).
            result_output = round(float(eval(result_output))) % 1000

    except Exception as e:
        print(e)
        print('ERROR PARSING')
        result_output = -1

    return result_output, code_output

In [12]:
import re
from collections import defaultdict


# Suffix appended to every problem statement: asks for an answer modulo 1000,
# for reasoning that mixes natural language with programs, and for the final
# answer to be placed inside \boxed{}.
tool_instruction = " The answer should be given as a non-negative modulo 1000."
tool_instruction += '\nPlease integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}.'


# Number of sampled generations per problem (the self-consistency sample size):
# more samples on a private run, fewer when iterating on the training file.
n_repetitions = 5 if PRIVATE else 2

# One inner list per problem, in `df` row order:
#   total_results -> answers parsed from the generated text (\boxed{} / digits)
#   total_answers -> answers obtained by executing the generated code
total_results = []
total_answers = []

for i in tqdm(range(len(df))):
    # Label-based lookup: relies on `df` having the default 0..n-1 index.
    id_ = df['id'].loc[i]  # (currently unused in this loop)
    problem = df['problem'].loc[i]

    messages = [
        {
            "role": "user",
            "content": problem + tool_instruction
        }
    ]

    # Render the chat as a plain string; tokenisation happens in the pipeline.
    # NOTE(review): `add_generation_prompt=True` is not passed -- confirm
    # whether this tokenizer's chat template needs it to open the assistant
    # turn.
    query_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False
    )

    results = []
    answers = []

    for _ in tqdm(range(n_repetitions)):
        try:
            raw_output = pipeline(
                query_prompt,
                max_new_tokens=2048,
                do_sample=True,
                temperature=0.7,
                return_full_text=False
            )
            raw_output = raw_output[0]['generated_text']

            result_output, code_output = process_output(raw_output)

        except Exception as e:
            # A failed sample (e.g. CUDA out-of-memory) is recorded as -1 for
            # both strategies so the remaining samples can still be used.
            print(e)
            result_output, code_output = -1, -1

        finally:
            # Release memory after *every* attempt, including failed ones --
            # a failed generation is exactly when cached GPU memory most needs
            # to be returned. Collect Python garbage first so that tensors
            # that just became unreachable are freed before the cache is
            # emptied.
            gc.collect()
            torch.cuda.empty_cache()

        results.append(result_output)
        answers.append(code_output)

    total_results.append(results)
    total_answers.append(answers)

  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


64

CODE RESULTS 64
BOXED []
BOXED 64


Traceback (most recent call last):
  File "/home/zhenlan/Desktop/Projects/IMO/Code/code.py", line 31, in <module>
    result = polygon_area()
  File "/home/zhenlan/Desktop/Projects/IMO/Code/code.py", line 8, in polygon_area
    points_1 = solve([Abs(x + y) - 10, Abs(x - y) - 10], [x, y])
  File "/home/zhenlan/anaconda3/lib/python3.10/site-packages/sympy/solvers/solvers.py", line 1007, in solve
    raise NotImplementedError('solving %s when the argument '
NotImplementedError: solving Abs(x + y) when the argument is not real or imaginary.
100%|██████████| 2/2 [00:23<00:00, 11.82s/it]
  0%|          | 0/10 [00:23<?, ?it/s]

CODE RESULTS -1
BOXED []
BOXED 1000





In [20]:
output  = raw_output
code = output.split('```')[1][7:]

In [23]:
        with open('code.py', 'w') as fout:
            fout.write(code)

In [28]:
batcmd = 'timeout 7 ' + sys.executable + ' code.py'

In [46]:
eval(subprocess.check_output(batcmd, shell=True).decode('utf8'))

5

In [30]:
!timeout 7 /home/zhenlan/anaconda3/bin/python code.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
process_output(raw_output)

CODE RESULTS -1
BOXED []
BOXED 1000


Traceback (most recent call last):
  File "/home/zhenlan/Desktop/Projects/IMO/Code/code.py", line 31, in <module>
    result = polygon_area()
  File "/home/zhenlan/Desktop/Projects/IMO/Code/code.py", line 8, in polygon_area
    points_1 = solve([Abs(x + y) - 10, Abs(x - y) - 10], [x, y])
  File "/home/zhenlan/anaconda3/lib/python3.10/site-packages/sympy/solvers/solvers.py", line 1007, in solve
    raise NotImplementedError('solving %s when the argument '
NotImplementedError: solving Abs(x + y) when the argument is not real or imaginary.


(0, -1)

In [19]:
'timeout 7 ' + sys.executable + ' code.py'

'timeout 7 /home/zhenlan/anaconda3/bin/python code.py'

In [14]:
print(raw_output)

The solution is as follows:
```python
from sympy import symbols, solve, simplify, Abs

def polygon_area():
    """The points $\left(x, y\right)$ satisfying $((\vert x + y \vert - 10)^2 + ( \vert x - y \vert - 10)^2)((\vert x \vert - 8)^2 + ( \vert y \vert - 8)^2) = 0$ enclose a convex polygon. What is the area of this convex polygon? The answer should be given as a non-negative modulo 1000."""
    x, y = symbols('x y')
    
    # Points satisfying (|x + y| - 10)^2 + (|x - y| - 10)^2 = 0
    points_1 = solve([Abs(x + y) - 10, Abs(x - y) - 10], [x, y])
    
    # Points satisfying (|x| - 8)^2 + (|y| - 8)^2 = 0
    points_2 = solve([Abs(x) - 8, Abs(y) - 8], [x, y])

    # Combine the points
    points = points_1 + points_2

    # Calculate the area of the polygon formed by these points
    area = 0
    for i in range(len(points)):
        x1, y1 = points[i]
        if i == len(points) - 1:
            x2, y2 = points[0]
        else:
            x2, y2 = points[i + 1]
        area += (x1 

In [14]:
import numpy as np
from collections import Counter

# Self-consistency aggregation: one final answer per problem, in `df` row order.
final_answers = []

for a, b in zip(total_answers, total_results):
    # a: answers from executing the generated code; b: answers parsed from text.
    a = np.array(a)
    b = np.array(b)
    # Where code execution failed (-1), fall back to the text answer for that
    # same sample (which may itself be -1).
    a[a < 0] = b[a < 0]

    # Majority vote over the valid (non-negative) answers only. Counting the
    # failures and then indexing the runner-up raises IndexError whenever every
    # sample for a problem failed, because there is no second entry.
    votes = Counter(value for value in a.tolist() if value >= 0)

    # With no valid sample at all, fall back to 0: it is a legal non-negative
    # answer, so the output stays well-formed and the run is not aborted.
    ans = votes.most_common(1)[0][0] if votes else 0

    final_answers.append(ans)
    print(ans)


504
18
155
416
310
200
449
36
478
396


In [15]:
df['answer'] = final_answers

In [16]:
df

Unnamed: 0,id,problem,answer
0,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",504
1,246d26,Each of the three-digits numbers $111$ to $999...,18
2,2fc4ad,Let the `sparkle' operation on positive intege...,155
3,430b63,What is the minimum value of $5x^2+5y^2-8xy$ w...,416
4,5277ed,There exists a unique increasing geometric seq...,310
5,739bc9,For how many positive integers $m$ does the eq...,200
6,82e2a0,Suppose that we roll four 6-sided fair dice wi...,449
7,8ee6f3,"The points $\left(x, y\right)$ satisfying $((\...",36
8,bedda4,Let $ABCD$ be a unit square. Let $P$ be the po...,478
9,d7e9c9,A function $f: \mathbb N \to \mathbb N$ satisf...,396


In [17]:
df[['id','answer']].to_csv("submission.csv", header=True, index=False)

In [18]:
df[['id','answer']].head()

Unnamed: 0,id,answer
0,229ee8,504
1,246d26,18
2,2fc4ad,155
3,430b63,416
4,5277ed,310


In [19]:
# Local (non-private) run only: score the predictions against the known answers.
if not PRIVATE:
    # Reload train.csv to get the reference `answer` column back: the in-memory
    # frame had that column overwritten with the model's predictions.
    df = pd.read_csv('../Data/ai-mathematical-olympiad-prize/train.csv')
    df = df.assign(model_answer=final_answers)
    # Row-wise comparison of reference answer and prediction.
    df = df.assign(match=df['answer'].eq(df['model_answer']))
    print(f'{df.match.sum()} matches in {len(df)} examples')

0 matches in 10 examples
