In [51]:
import re
import numpy as np
import pandas as pd

In [52]:
INPUT = "training_data_clean.csv"

ID_COL  = "student_id"
TARGET  = "label"

TEXT_COLS = [
    "In your own words, what kinds of tasks would you use this model for?",
    "Which types of tasks do you feel this model handles best? (Select all that apply.)",
    "For which types of tasks do you feel this model tends to give suboptimal responses? (Select all that apply.)",
    "Think of one task where this model gave you a suboptimal response. What did the response look like, and why did you find it suboptimal?",
    "When you verify a response from this model, how do you usually go about it?"
]

LIKERT_COLS = [
    "How likely are you to use this model for academic tasks?",
    "Based on your experience, how often has this model given you a response that felt suboptimal?",
    "How often do you expect this model to provide responses with references or supporting evidence?",
    "How often do you verify this model's responses?"
]

In [40]:
def reformat_rename(df):
  # Rename columns
  new_names = ["student_id", "tasks_open", "academic_scale", "task_types",
          "suboptimal_scale","suboptimal_types",
          "suboptimal_open", "ref_scale", "verify_scale","verify_open","label"]
  df.columns = new_names

  # Remove parantheses in multiple select options. This is to prepare for the next splitting step
  df['task_types'] = df['task_types'].str.replace(r'\([^)]*\)', '', regex=True)
  df['suboptimal_types'] = df['suboptimal_types'].str.replace(r'\([^)]*\)', '', regex=True)

  # Split task_types into binary variables
  binary_df = df['task_types'].str.get_dummies(sep=',')
  df = pd.concat([df, binary_df], axis=1)
  # Split suboptimal_types into binary variables
  binary_df = df['suboptimal_types'].str.get_dummies(sep=',')
  df = pd.concat([df, binary_df], axis=1)
  df = df.drop(columns=['task_types', 'suboptimal_types'])

  df = rename_duplicate_columns(df)
  df = df.convert_dtypes()

  return df

In [41]:
def clean_data(df):
  # Normalize missing tokens
  # Convert non-breaking spaces to normal spaces, blank-only cells → NaN
  df.replace({u"\u00A0": " "}, regex=True, inplace=True)
  df.replace(r"^\s*$", np.nan, regex=True, inplace=True)

  MISSING_TOKENS = {
     "NA", "N/A", "na", "n/a",
      "None", "none",
      "null", "NULL",
      "Prefer not to say"
  }

  for col in df.columns:
      if df[col].dtype == "object":
          df[col] = df[col].replace(list(MISSING_TOKENS), np.nan)

  # Convert Likert scales to just numbers
  LIKERT_REGEX = re.compile(r"^\s*(\d+)\s*—?.*$")

  for c in LIKERT_COLS:
      if c in df.columns:
          # extract the number; invalid/missing stay NaN
          df[c] = df[c].astype(str).str.extract(LIKERT_REGEX)[0].astype(float)

  # Replace missing Likert values with column median
  medians = df[LIKERT_COLS].median(numeric_only=True)
  df[LIKERT_COLS] = df[LIKERT_COLS].fillna(medians)

  # Fill missing text with "no_response"
  for c in TEXT_COLS:
      if c in df.columns:
          df[c] = df[c].fillna("NA")

  df = reformat_rename(df)

  return df

In [42]:
def rename_duplicate_columns(df):
    # Create a copy to avoid modifying the original
    df_renamed = df.copy()

    # Dictionary to track occurrences of each column name
    seen_name = []
    new_columns = []

    for col in df.columns:
        if col not in seen_name:
            seen_name.append(col)
            new_columns.append(col)  # Keep first occurrence as is
        else:
            new_columns.append(f"sub_{col}")

    df_renamed.columns = new_columns
    return df_renamed

In [43]:
df = pd.read_csv(INPUT, keep_default_na=True, skipinitialspace=True)

In [44]:
df_clean = clean_data(df)

In [45]:
df_clean

Unnamed: 0,student_id,tasks_open,academic_scale,suboptimal_scale,suboptimal_open,ref_scale,verify_scale,verify_open,label,Brainstorming or generating creative ideas,...,Writing or editing essays/reports,sub_Brainstorming or generating creative ideas,sub_Converting content between formats,sub_Data processing or analysis,sub_Drafting professional text,sub_Explaining complex concepts simply,sub_Math computations,sub_NA,sub_Writing or debugging code,sub_Writing or editing essays/reports
0,1,"General purpose tasks, like translating text, ...",3,3,I find that [THIS MODEL] has too much watering...,1,4,I double check with google searches or find fo...,ChatGPT,0,...,1,1,1,0,0,1,0,0,1,0
1,1,Coding,4,3,Many times when i try to get [THIS MODEL] to d...,1,5,I make sure that the code it writes is well wr...,Claude,0,...,0,1,0,0,1,1,1,0,0,0
2,1,"Math, coding sometimes",3,4,[THIS MODEL] likes to put a lot of words and e...,2,3,I make sure the facts its giving is correct. s...,Gemini,0,...,0,1,0,1,1,0,0,0,0,0
3,2,"General first case go-to, catch-all type of m...",5,4,"Gave it the main ideas I wanted included, but...",3,2,Ask model to verify and doublecheck itself mul...,ChatGPT,1,...,1,0,0,1,1,0,0,0,1,1
4,2,Coding/debugging Problem-solving Coming up w...,4,4,Explain a complex concept Just restated the ...,2,1,#NAME?,Claude,0,...,0,1,1,1,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820,274,Mostly code writing.,3,2,"Often, when asking it to create simple scripts...",3,4,I mainly use this model to write code. When it...,Claude,0,...,0,0,0,1,0,0,1,0,0,0
821,274,Mostly for professional or creative writing.,2,3,If asking it to write an email or cover letter...,3,4,"I read any output produced line by line, befor...",Gemini,1,...,1,0,0,0,1,0,0,0,0,1
822,275,I would use this model for philosophical quest...,5,3,"For simple debugging or troubleshooting, it te...",2,3,"If it is a computation question, I usually hav...",ChatGPT,1,...,1,0,0,0,0,0,1,0,1,0
823,275,"Complex coding tasks, complicated logic or new...",2,4,I asked it to explain something about processe...,3,2,I would seek information from Google or a diff...,Claude,0,...,0,0,0,0,0,1,0,0,0,0


In [46]:
df_clean.columns

Index(['student_id', 'tasks_open', 'academic_scale', 'suboptimal_scale',
       'suboptimal_open', 'ref_scale', 'verify_scale', 'verify_open', 'label',
       'Brainstorming or generating creative ideas',
       'Converting content between formats ', 'Data processing or analysis',
       'Drafting professional text ', 'Explaining complex concepts simply',
       'Math computations', 'NA', 'Writing or debugging code',
       'Writing or editing essays/reports',
       'sub_Brainstorming or generating creative ideas',
       'sub_Converting content between formats ',
       'sub_Data processing or analysis', 'sub_Drafting professional text ',
       'sub_Explaining complex concepts simply', 'sub_Math computations',
       'sub_NA', 'sub_Writing or debugging code',
       'sub_Writing or editing essays/reports'],
      dtype='object')

In [47]:
def split_dataset(df):
  # Get unique student_ids and shuffle them
  unique_ids = df['student_id'].unique()
  # np.random.seed(311)
  np.random.shuffle(unique_ids)

  # Calculate split indices
  total_ids = len(unique_ids)
  split_1 = int(total_ids * 0.70)
  split_2 = int(total_ids * 0.85)

  # Partition the unique IDs
  train_ids = unique_ids[:split_1]
  val_ids = unique_ids[split_1:split_2]
  test_ids = unique_ids[split_2:]

  # Split the original DataFrame using the IDs
  df_train = df[df['student_id'].isin(train_ids)].drop(columns=['student_id'])
  df_val = df[df['student_id'].isin(val_ids)].drop(columns=['student_id'])
  df_test = df[df['student_id'].isin(test_ids)].drop(columns=['student_id'])

  return df_train, df_val, df_test

In [48]:
train, val, test = split_dataset(df_clean)

  np.random.shuffle(unique_ids)


In [50]:
train.to_csv("train_new.csv", index=False)
val.to_csv("val_new.csv", index=False)
test.to_csv("test_new.csv", index=False)