# Text Classification for Code Generation Tasks vs Logical Reasoning Tasks
### Vishakha Dikshit and Ashley Ziegler

## Imports & Load in Data

In [6]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import vertexai.preview.generative_models as generative_models
from transformers import pipeline
import os
from openai import OpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

In [None]:
# Load the training split from the local CSV; the trailing expression makes
# the notebook display the first rows for a quick sanity check.
train = pd.read_csv("full_train.csv")
train.head()

Unnamed: 0.1,Unnamed: 0,question,class
0,7766,What can you store silicone in?,reasoning
1,9211,The toddler thought it would be fun to try eat...,reasoning
2,6782,Billy was a forgiving person. When Marna conf...,reasoning
3,987,What southern U.S. state is know for having ma...,reasoning
4,7755,where do you go to see most statues?,reasoning


In [None]:
# Encode the string labels as integers (reasoning -> 0, code -> 1) and make
# sure the resulting column has a numeric dtype, then preview the result.
train["class"] = pd.to_numeric(
    train["class"].replace({"reasoning": 0, "code": 1})
)
train.head()

Unnamed: 0.1,Unnamed: 0,question,class
0,7766,What can you store silicone in?,0
1,9211,The toddler thought it would be fun to try eat...,0
2,6782,Billy was a forgiving person. When Marna conf...,0
3,987,What southern U.S. state is know for having ma...,0
4,7755,where do you go to see most statues?,0


## Tokenizer

In [None]:
# Load the cased BERT tokenizer (same checkpoint name as the classifier).
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

# Tokenize every training question into NumPy arrays padded to a common
# length. truncation=True caps each sequence at the tokenizer's maximum model
# length so an unusually long question cannot exceed what BERT accepts.
tokenized_data = tokenizer(
    list(train["question"]),
    return_tensors="np",
    padding=True,
    truncation=True,
)
# Convert the tokenizer output to a plain dict, the form passed to model.fit.
tokenized_data = dict(tokenized_data)

tokenizer_config.json: 100%|██████████| 49.0/49.0 [00:00<00:00, 30.4kB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 216kB/s]
vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 1.82MB/s]
tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 1.83MB/s]


## Pretrained BERT model

In [None]:
# Load pretrained BERT with a sequence-classification head. The load log
# reports that the classifier weights are newly initialized, so the model
# has to be fine-tuned before its predictions are meaningful.
model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased")
# No loss is passed to compile here.
# NOTE(review): the string "adam" selects the optimizer with its default
# settings; fine-tuning examples usually pass an explicit, smaller learning
# rate - confirm the default is intended.
model.compile(optimizer="adam")

model.safetensors: 100%|██████████| 436M/436M [03:44<00:00, 1.95MB/s] 
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune on the tokenized questions using the integer class labels. No
# epochs or batch size are given, so the library defaults apply.
model.fit(tokenized_data, train["class"])

In [None]:
# Persist the fine-tuned classifier to the local "BERTModel" directory.
model.save_pretrained("BERTModel")

## Testing data

In [None]:
# Load the held-out split and apply the same label encoding as the training
# data (reasoning -> 0, code -> 1), then preview the first rows.
test = pd.read_csv("full_test.csv")
test["class"] = pd.to_numeric(
    test["class"].replace({"reasoning": 0, "code": 1})
)
test.head()

Unnamed: 0.1,Unnamed: 0,question,class
0,4824,The country didn't want to join the union. The...,0
1,5913,What destination is a sailor typically most ex...,0
2,3092,Bob pulled a rock out of the ground. He wante...,0
3,3385,Someone expressing anger will shake their fist...,0
4,661,Where would you see a performer at a gathering...,0


In [None]:
# Tokenize the test questions the same way as the training questions.
# truncation=True caps each sequence at the tokenizer's maximum model length
# so an unusually long question cannot exceed what BERT accepts.
tokenized_test = tokenizer(
    list(test["question"]),
    return_tensors="np",
    padding=True,
    truncation=True,
)
# Convert the tokenizer output to a plain dict, the form passed to
# model.predict.
tokenized_test = dict(tokenized_test)

In [None]:
# Run the classifier over the tokenized test questions. The returned object
# exposes the raw class scores as `.logits`, which the next cell reads.
predict = model.predict(tokenized_test)



In [None]:
# Raw per-class scores for each test question (logits, despite the name).
pred_probs = predict.logits
# The predicted label is the index of the highest score in each row.
pred_classes = np.argmax(pred_probs, axis=1)

# Fraction of test questions whose predicted label matches the true label.
score = accuracy_score(y_true=test["class"], y_pred=pred_classes)
print(score)

0.7918731417244796


In [None]:
# Reload the fine-tuned classifier from the local "BERTModel" directory,
# rebinding `model` to the restored weights.
model = TFAutoModelForSequenceClassification.from_pretrained("BERTModel")

Some layers from the model checkpoint at BERTModel were not used when initializing TFBertForSequenceClassification: ['dropout_151']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at BERTModel.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


## Fine-Tuned Gemini Model

Please note that this section requires authentication and probably won't work without it; however, we have attached a video showing it working.

In [None]:
# Connect to the Vertex AI project and open a chat session against the tuned
# Gemini endpoint. Requires Google Cloud credentials for this project.
vertexai.init(project="879759828929", location="us-central1")
# Bind the Gemini model to its own name: reusing `model` here would overwrite
# the BERT classifier, which the main cell still calls via `model.predict`.
gemini_model = GenerativeModel(
    "projects/879759828929/locations/us-central1/endpoints/7605214177187069952",
)
chat = gemini_model.start_chat()


# Sampling settings passed with every chat message.
generation_config = {
    "max_output_tokens": 2048,
    "temperature": 1,
    "top_p": 1,
}

# Apply the same blocking threshold to every listed harm category.
safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
}

# Smoke test: send one yes/no reasoning question and print the raw response.
print(chat.send_message(
    ["""A soccer game with multiple males playing. are some men playing a sport? give yes or no answer"""],
    generation_config=generation_config,
    safety_settings=safety_settings
))

## Mistral Model for Prompt Parsing

Again, this requires authentication.

In [5]:
import os

from huggingface_hub import login
from transformers import pipeline

# Authenticate with the Hugging Face Hub using a token taken from the
# HF_TOKEN environment variable, so that no secret is stored in the notebook.
# If the variable is unset, None is passed to login(); presumably that falls
# back to an interactive prompt - confirm against the huggingface_hub docs.
# NOTE: a token was previously hard-coded in this cell; treat it as leaked
# and revoke it.
login(token=os.environ.get("HF_TOKEN"))

# Text-generation pipeline that generate_text uses to expand prompts.
pipe = pipeline("text-generation", model="mistralai/Mistral-7B-v0.1")

def generate_text(prompt, max_new_tokens=50):
    """Run *prompt* through the Mistral text-generation pipeline.

    Args:
        prompt: Text handed to the module-level ``pipe`` pipeline.
        max_new_tokens: Upper bound on the number of tokens generated on top
            of the prompt. A budget of new tokens is used instead of a total
            ``max_length`` so that long prompts still leave room for output.

    Returns:
        The ``generated_text`` field of the first (and only requested)
        sequence. NOTE(review): with the pipeline's default settings this
        presumably includes the prompt itself - confirm before parsing it.
    """
    results = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
    )
    return results[0]["generated_text"]

## GPT for Code Generation

In [8]:


# Build the OpenAI client. The key is read from the OPENAI_API_KEY
# environment variable so that no secret has to be pasted into the notebook.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)
# Smoke test: send a single user message to confirm the credentials and the
# model name are accepted; the call raises if the request fails.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "this is a test",
        }
    ],
    model="gpt-4-1106-preview",
)


## Instructor Model

In [None]:
def generate_instruction(task_type):
    """Return the instruction prefix to prepend to a prompt of *task_type*.

    Args:
        task_type: Either ``"Math"`` (code-generation tasks) or
            ``"Reasoning"`` (yes/no reasoning tasks).

    Returns:
        The instruction string for the given task type.

    Raises:
        ValueError: If *task_type* is not one of the supported values.
            Failing here avoids returning None, which would only surface
            later as an unrelated TypeError when the caller concatenates the
            result with the prompt.
    """
    if task_type == "Math":
        return "Please write code for the following problem: "
    if task_type == "Reasoning":
        return "Please answer this in one word with Yes or No: "
    raise ValueError(f"wrong prompt type: {task_type!r}")

## Main Code

In [None]:
# Route one sample prompt end to end: classify it with the fine-tuned BERT
# model, prepend the matching instruction, expand it with the Mistral
# pipeline, and send it to GPT (code tasks) or the Gemini chat (reasoning).
sample_prompt = "A soccer game with multiple males playing. are some men playing a sport? give yes or no answer"
tokens = tokenizer(
    [sample_prompt],
    return_tensors="np",
    padding=True,
    truncation=True,
)
tokens = dict(tokens)
# `model` must be the BERT sequence classifier at this point.
p = model.predict(tokens)
# predict() returns an output object; the class scores are in `.logits`, one
# row per input. Take the single row's arg-max as a plain int so it can index
# the label list below.
best = int(np.argmax(p.logits, axis=-1)[0])
# Index order follows the training label encoding: 0 = reasoning, 1 = code.
task_type = ["Reasoning", "Math"][best]
prompt = generate_instruction(task_type) + sample_prompt
if task_type == "Math":
    annotated_prompt = generate_text(prompt)
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": annotated_prompt,
            }
        ],
        model="gpt-4-1106-preview",
    )
    # Show the generated answer, mirroring the reasoning branch, instead of
    # silently discarding the response.
    print(chat_completion.choices[0].message.content)
elif task_type == 'Reasoning':
    annotated_prompt = '""' + generate_text(prompt) + '""'
    print(chat.send_message(
        [annotated_prompt],
        generation_config=generation_config,
        safety_settings=safety_settings
    ))


reasoning


Evaluations are done by us in all cases, so we don't have any code for the metrics.