In [None]:
!pip install transformers
!pip install accelerate

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Single source of truth for the checkpoint so the model and the tokenizer
# can never be loaded from two different ids by accident.
MODEL_ID = "Qwen/CodeQwen1.5-7B-Chat"

# Device name used by code that moves input tensors next to the model.
device = "cuda"

# torch_dtype="auto" and device_map="auto" leave dtype selection and weight
# placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

def generate_response(messages, max_new_tokens=4096):
    """Generate the assistant reply for a chat-style conversation.

    Relies on the module-level ``model`` and ``tokenizer``.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts, rendered
            into a single prompt with the tokenizer's chat template.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 4096, the value that used to be hard-coded.

    Returns:
        The decoded completion for the single prompt, with the prompt tokens
        sliced off and special tokens skipped.
    """
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Follow wherever device_map="auto" put the model instead of assuming "cuda".
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Unpack the whole encoding so the attention mask travels with the input
    # ids; passing input_ids alone forces generate() to guess the mask.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
    )
    # generate() returns prompt + completion; keep only the newly generated tail.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response


In [None]:
# System instruction for the chat model: reply with Python code only.
# Note that the literal begins with a newline (the text starts on the line after the opening quotes).
SYSTEM_PROMPT = """
You are a coding assistant for data science tasks.
Your task is to only output Python code.
Do not output any other text."""

# One hand-written conversation used to try the model out.
# The user turn is a notebook flattened into "# In[ ]:" cells: four worked
# exercises (schema, problem, step-by-step solution) followed by a fifth
# exercise on 'dataset/orders_data.xlsx' that ends on an open
# "# Solution:" line for the model to continue.
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": "# In[ ]:\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n\n# In[ ]:\n\n\n# You are a professional data scientist. Answer the following questions using pandas and matplotlib.\n\n\n# In[ ]:\n\n\n# # Exercise 1\n\n\n# In[ ]:\n\n\ndf = pd.read_csv('employee.csv')\n\n\n# In[ ]:\n\n\n# Schema of Dataframes:\n# Columns in df with example values:\n# name (Peter), gender (m), DOB (1992/01/17)\n\n\n# In[ ]:\n\n\n# Problem: How many male and female employees are born in 1992?\n\n\n# In[ ]:\n\n\n# Solution: Let's solve this problem step-by-step.\n# Step 1: convert date of birth in to datetime\ndf['DOB'] = pd.to_datetime(df['DOB'])\n# Step 2: get the number of male born in 1992\nnum_male_students = len(df[(df['DOB'].dt.year == 1992) & (df['gender'] == 'm')])\n# Step 3: get the number of female born in that year\nnum_female_students = len(df[(df['DOB'].dt.year == 1992) & (df['gender'] == 'f')])\n\n\n# In[ ]:\n\n\n# # Exercise 2\n\n\n# In[ ]:\n\n\ndf = pd.read_csv('geo.csv')\n\n\n# In[ ]:\n\n\n# Schema of Dataframes:\n# Columns in df with example values:\n# state (WA), capital (Seattle), population (1.4 millon)\n\n\n# In[ ]:\n\n\n# Problem: What is the population of California?\n\n\n# In[ ]:\n\n\n# Solution: Let's solve this problem step-by-step.\nresult = df[df['state'] == 'CA']['population']\n\n\n# In[ ]:\n\n\n# # Exercise 3\n\n\n# In[ ]:\n\n\ndf = pd.read_csv('score.csv')\n\n\n# In[ ]:\n\n\n# Schema of Dataframes:\n# Columns in df with example values:\n# name (John), score (97)\n\n\n# In[ ]:\n\n\n# Problem: Make a new column \"grade\" for letter grades (A: 90+, B: 70-90, C: <70) and plot the number of students in each grade.\n\n\n# In[ ]:\n\n\n# Solution: Let's solve this problem step-by-step.\n# Step 1: Define a function to convert scores to letter grades.\ndef get_grade(score):\n  if score >= 90:\n    return 'A'\n  elif 70 <= score < 90:\n    return 'B'\n  else:\n    return 'C'\n# Step 2: Convert scores to letter 
grades.\ndf['grade'] = df.score.apply(get_grade)\n# Step 3: Count the number of students by grade.\ncount_df = df['grade'].value_counts()\n# Step 4: Visualize in a bar chart.\ncount_df.plot(kind='bar')\n\n\n# In[ ]:\n\n\n# # Exercise 4\n\n\n# In[ ]:\n\n\ndf = pd.read_csv('phones.csv')\n\n\n# In[ ]:\n\n\n# Schema of Dataframes:\n# Columns in df with example values:\n# model (Pixel 6), brand (Google), price (387), release (2022)\n\n\n# In[ ]:\n\n\n# Problem: What is the most expensive phone in each brand.\n\n\n# In[ ]:\n\n\n# Solution: Let's solve this problem step-by-step.\n# Step 1: Group models by their brands.\nmodel_by_brand_df = df.groupby('brand')\n# Step 2: Find the index of rows that have the highest price in each group\nidx = model_by_brand_df['price'].idxmax()\n# Step 3: Get the rows using the index\nexpensive_models_df = df.loc[idx]\n# Step 4: Return the brand name, model and price.\nresult = expensive_models_df[['brand', 'model', 'price']]\n\n\n# In[ ]:\n\n\n# # Exercise 5\n\n\n# In[ ]:\n\n\nimport pandas as pd\nimport numpy as np\nfrom datetime import datetime\n\n\n# In[ ]:\n\n\ndf=pd.read_excel('dataset/orders_data.xlsx')\n\n\n# In[ ]:\n\n\n# Schema of Dataframes:\n# Columns in df with example values:\n# order_no (405-9763961-5211537), order_date (Sun, 18 Jul, 2021, 10:38 pm IST), buyer (Mr.), ship_city (CHANDIGARH,), ship_state (CHANDIGARH), sku (SKU:  2X-3C0F-KNJE), description (100% Leather Elephant Shaped Piggy Coin Bank | Block Printed West Bengal Handicr), quantity (1), item_total (\u20b9449.00), shipping_fee (\u20b960.18), cod (Cash On Delivery), order_status (Delivered to buyer)\n\n\n# In[ ]:\n\n\ndf.head()\n\n\n# In[ ]:\n\n\ndf.info()\n\n\n# In[ ]:\n\n\n# How many items purchased within the month of November were returned to seller?\n\n\n# In[ ]:\n\n\ndef months(x):\n    x=x.split(',')[1][-3:]\n    return x\ndf['Month']=df['order_date'].apply(months)\nlen(df[(df['Month']=='Nov') & (df['order_status']=='Returned to seller')])\n\n\n# In[ 
]:\n\n\n# Convert item total column from string to float\n\n\n# In[ ]:\n\n\ndef total_float(price):\n    try:\n        return float(price.strip('\u20b9').strip(',').replace(',',\"\"))\n    except:\n        try:\n            return float(price)\n        except:\n            return np.NaN\ndf['item_total']=df['item_total'].apply(total_float)\n\n\n# In[ ]:\n\n\n# What is the average price for leather wallets?\n\n\n# In[ ]:\n\n\ndf[df['description'].str.lower().str.contains('wallet') & df['description'].str.lower().str.contains('leather')]['item_total'].mean()\n\n\n# In[ ]:\n\n\n# In what distinct years have the orders been placed in?\n\n\n# In[ ]:\n\n\nlist(pd.to_datetime(df['order_date'].map(lambda x: x[:-4])).dt.year.unique())\n\n\n# In[ ]:\n\n\n# What item is sold the most?\n\n\n# In[ ]:\n\n\ndf[df['sku']==df['sku'].value_counts(ascending=False).index[0]]['description'].values[0]\n\n\n# In[ ]:\n\n\n# What is the city that had cash on delivery as the most common payment method?\n\n\n# In[ ]:\n\n\ndf.groupby('ship_city').count()['cod'].sort_values(ascending=False).index[0][:-1]\n\n\n# In[ ]:\n\n\n# Problem: What are the average shipping fees for each state, starting from highest to lowest? (rounded to 2 decimal places)\n\n\n# In[ ]:\n\n\n# Solution: Let's solve this problem step-by-step.\n"}
]

# Run a single generation on the conversation above and print the completion text.
print(generate_response(messages))

In [None]:
from arcade_nl2code.annotated_dataset import dataset as dataset_module
from pprint import pprint
from tqdm import tqdm

import os  # stdlib; used only to create the output directory below

# Prompt dataset read through arcade_nl2code's dataset module, and the file
# the collected predictions are written to.
DATASET_PATH = "arcade_nl2code/annotated_dataset/dataset/new_tasks/derived_datasets/dataset.schema.originating_dfs.header_description.after_variable_cell.maxp2100.maxp_no_prefix900.maxctxcell-1.e1_2_3_5.step_by_steppreambleexplanation.json"
OUTPUT_PATH = "predictions/codeqwen/predictions.json"

# Create the output directory before the long generation loop so a missing
# folder cannot surface only at the final save. save_dataset is not shown
# here; if it already creates parent directories this call is a no-op.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

preds = []
dataset = dataset_module.load_dataset(DATASET_PATH)

for episode in tqdm(dataset, desc="Episodes"):
    # Copy every episode field except 'turns'; the turns are rebuilt below
    # with one prediction entry per turn.
    episode_prediction = dict(
        metadata={k: v for k, v in episode.items() if k != 'turns'},
        turns=[],
    )

    for turn_example in tqdm(episode['turns'], "Turns", leave=False):
        # Each turn is sent as a fresh two-message conversation: the shared
        # system prompt plus the turn's prompt text.
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": turn_example.input},
        ]
        code_str = generate_response(messages)
        turn_pred_entry = dict(
            metadata=dict(example=turn_example),
            predictions=[code_str],
        )
        episode_prediction['turns'].append(turn_pred_entry)

    preds.append(episode_prediction)

dataset_module.save_dataset(preds, OUTPUT_PATH)