In [None]:
import pandas as pd
import numpy as np
import csv
import re
from sklearn.model_selection import GroupShuffleSplit, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.naive_bayes import CategoricalNB
import matplotlib.pyplot as plt


In [None]:
INPUT = "training_data_clean.csv"

ID_COL  = "student_id"
TARGET  = "label"

TEXT_COLS = [
    "In your own words, what kinds of tasks would you use this model for?",
    "Which types of tasks do you feel this model handles best? (Select all that apply.)",
    "For which types of tasks do you feel this model tends to give suboptimal responses? (Select all that apply.)",
    "Think of one task where this model gave you a suboptimal response. What did the response look like, and why did you find it suboptimal?",
    "When you verify a response from this model, how do you usually go about it?"
]

LIKERT_COLS = [
    "How likely are you to use this model for academic tasks?",
    "Based on your experience, how often has this model given you a response that felt suboptimal?",
    "How often do you expect this model to provide responses with references or supporting evidence?",
    "How often do you verify this model's responses?"
]

MODIFIED_TEXT_FEATS = [
    "Writing or debugging code",
    "Math computations",
    "Explaining complex concepts simply",
    "Drafting professional text (e.g., emails, résumés)",
    "Data processing or analysis",
    "Brainstorming or generating creative ideas",
    "Writing or editing essays/reports",
    "Converting content between formats (e.g., LaTeX)"
]

In [None]:
def reformat_rename(df):
  # Rename columns
  new_names = ["student_id", "tasks_open", "academic_scale", "task_types",
          "suboptimal_scale","suboptimal_types",
          "suboptimal_open", "ref_scale", "verify_scale","verify_open","label"]
  df.columns = new_names

  # Remove parantheses in multiple select options. This is to prepare for the next splitting step
  df['task_types'] = df['task_types'].str.replace(r'\([^)]*\)', '', regex=True)
  df['suboptimal_types'] = df['suboptimal_types'].str.replace(r'\([^)]*\)', '', regex=True)

  for feat in MODIFIED_TEXT_FEATS:
    # Create a one-hot flag per row: whether the multi-select text contains the option
    # Use string containment on the column (handle NaN -> empty string) and cast to int
    df[f'{feat}1'] = df['task_types'].fillna('').astype(str).str.contains(feat, regex=False, na=False).astype(int)
    df[f'{feat}2'] = df['suboptimal_types'].fillna('').astype(str).str.contains(feat, regex=False, na=False).astype(int)

  df = df.drop(columns=['task_types', 'suboptimal_types'])

  df = df.convert_dtypes()

  return df

In [None]:
def clean_data(df):
  # Normalize missing tokens
  # Convert non-breaking spaces to normal spaces, blank-only cells → NaN
  df.replace({u"\u00A0": " "}, regex=True, inplace=True)
  df.replace(r"^\s*$", np.nan, regex=True, inplace=True)

  # Convert Likert scales to just numbers
  LIKERT_REGEX = re.compile(r"^\s*(\d+)\s*—?.*$")

  for c in LIKERT_COLS:
      if c in df.columns:
          # extract the number; invalid/missing stay NaN
          df[c] = df[c].astype(str).str.extract(LIKERT_REGEX)[0].astype(float)

  # Replace missing Likert values with column median
  medians = df[LIKERT_COLS].median(numeric_only=True)
  df[LIKERT_COLS] = df[LIKERT_COLS].fillna(medians)

  # Fill missing text with "no_response"
  for c in TEXT_COLS:
      if c in df.columns:
          df[c] = df[c].fillna("no_response")

  df = reformat_rename(df)

  return df

In [None]:
df = pd.read_csv(INPUT, keep_default_na=True, skipinitialspace=True)

In [None]:
df = clean_data(df)

In [None]:
# Count the number of null values in each feature column
# df.isna().sum()

In [None]:
def split_dataset(df):
  # split (70:15:15) while keeping student_id groups intact
  gss = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=311)
  train_val_idx, test_idx = next(gss.split(df, groups=df[ID_COL]))
  train_val_df = df.iloc[train_val_idx].reset_index(drop=True)
  test_df = df.iloc[test_idx].reset_index(drop=True)

  # then split train_val into train/val by groups
  val_ratio_within_trainval = 0.15 / 0.85
  gss2 = GroupShuffleSplit(n_splits=1, test_size=val_ratio_within_trainval, random_state=311)
  train_idx, val_idx = next(gss2.split(train_val_df, groups=train_val_df[ID_COL]))
  train_df = train_val_df.iloc[train_idx].reset_index(drop=True)
  val_df = train_val_df.iloc[val_idx].reset_index(drop=True)

  return train_df, val_df, test_df

In [None]:
train_df, valid_df, test_df = split_dataset(df)
print("train:", len(train_df))
print("valid:  ", len(valid_df))
print("test: ", len(test_df))

train: 573
valid:   126
test:  126


In [None]:
def get_train_data(df, t_train_onehot: bool):

    if t_train_onehot:
        t_train = pd.get_dummies(df['label'])
    else:
        t_train = df['label']
    x_train = df.drop(columns=['label', 'student_id'])

    return x_train, t_train

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
stops = ENGLISH_STOP_WORDS
stops = list(stops)


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
stops = nlp.Defaults.stop_words


In [None]:
stops = list(stops)

In [None]:
from collections import Counter, defaultdict
word_doc_count = defaultdict(int)

In [None]:
def create_vocab(df: pd.DataFrame, features: list[str]):
    """
    Returns a list of vocab words.
    Parameters:
    - df: cleaned dataset
    - col: list of open-ended features
    """
    min_df = 5
    vocab = set()

    skip_list = ['THIS', 'MODEL', '', '-', ' ', 'in', 'and', 'the', 'on', 'at',
                 'of', 'suboptimal', 'verify', 'open', 'types', 'ref', 'student',
                 'id'
                 ] + stops

    # words/symbols to avoid adding to vocab
    skip_words = {word.lower() for word in skip_list if word.strip()}

    # For each open ended feature, split the string into words and add to vocab
    for feat in features:
        df[feat].apply(lambda line: add_to_vocab(line, skip_words))

    vocab = {word for word, count in word_doc_count.items() if count >= min_df}

    return list(vocab)


def add_to_vocab(line: str, skip_words: set):
    """
    Helper function to split a given line and add to vocab
    """
    line = line.lower().strip()
    words = re.split(r'[ ,.?:[[\\\]/{()}///"";]+', line)
    for word in words:
          if word in skip_words or word.isdigit():
            continue
          word_doc_count[word] += 1


def create_bow(df, vocab, features):
    """
    Returns the df with bow for the given list of features and vocab.
    - df: the DataFrame to add bow to
    - vocab: list of unique words
    - features: list of open ended features to turn into bow
    """
    df_copy = df.copy()

    # Initialize a new DataFrame for Bag-of-Words features with zeros
    bow_data = np.zeros((df_copy.shape[0], len(vocab)), dtype=int)
    bow_df = pd.DataFrame(bow_data, columns=vocab, index=df_copy.index)

    # Populate the BoW DataFrame using vectorized operations
    for feat in features:
        # Ensure the feature column is string type and handle NaN values
        text_series = df_copy[feat].fillna('').astype(str).str.lower()
        for word in vocab:
            # Use .str.contains to check for whole words, update the bow_df
            # Use bitwise OR to combine results across multiple text features
            bow_df[word] = bow_df[word] | text_series.str.contains(r'\b' + re.escape(word) + r'\b', regex=True).astype(int)

    # Drop the original open-ended text columns from the df_copy
    df_copy = df_copy.drop(columns=features)

    # Concatenate the original df_copy (without text columns) with the new bow_df
    final_df = pd.concat([df_copy, bow_df], axis=1)

    # Assert the correct number of columns
    expected_cols = df.shape[1] - len(features) + len(vocab)
    assert(final_df.shape[1] == expected_cols)

    return final_df


In [None]:
open_features = ['tasks_open', 'suboptimal_open', 'verify_open']
vocab = create_vocab(train_df, open_features)
print("Length of vocab:", len(vocab))

Length of vocab: 704


In [None]:
vocab

In [None]:
train_bow_df = create_bow(train_df, vocab, open_features)
print("bow_df shape: ", train_bow_df.shape)

bow_df shape:  (573, 726)


In [None]:
valid_bow_df = create_bow(valid_df, vocab, open_features)
print("bow_df shape for valid df: ", valid_bow_df.shape)

bow_df shape for valid df:  (126, 726)


In [None]:
test_bow_df = create_bow(test_df, vocab, open_features)
print("bow_df shape for test df: ", test_bow_df.shape)

bow_df shape for test df:  (126, 726)


In [None]:
x_train, t_train = get_train_data(train_bow_df, False)
print("x_train shape: ", x_train.shape)
print("t_train shape: ", t_train.shape)

x_train shape:  (573, 724)
t_train shape:  (573,)


In [None]:
x_valid, t_valid = get_train_data(valid_bow_df, False)
x_test, t_test = get_train_data(test_bow_df, False)

In [None]:
# RFC with default hyperparams on x_train, t_train
rfc = RandomForestClassifier(random_state=311)
rfc = rfc.fit(X=x_train, y=t_train)
# Check accuracy on x_valid, t_valid
rfc_accuracy = rfc.score(X=x_valid, y=t_valid)
print("Valid accuracy for the base RFC: ", rfc_accuracy)
print("Train accuracy for the base RFC: ", rfc.score(X=x_train, y=t_train))

Valid accuracy for the base RFC:  0.6984126984126984
Train accuracy for the base RFC:  0.9947643979057592


In [None]:
# Tune hyperparams using RandomSearchCV
param_dist = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 5, 8, 10, 13, 15, 20, 25, 30],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [25, 30, 35, 40, 45],
    'min_samples_leaf': [10, 15, 20, 25, 30, 35, 40]
}

In [None]:
search = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=param_dist,
    n_iter=300,
    cv=3,
    n_jobs=-1,
    random_state=311
)
search = search.fit(x_train, t_train)

In [None]:
search.best_params_


{'n_estimators': 50,
 'min_samples_split': 40,
 'min_samples_leaf': 10,
 'max_features': 'sqrt',
 'max_depth': 8,
 'criterion': 'entropy'}

In [None]:
# RFC with custom bow with all features (same as below except hyperparams)
rfc2 = RandomForestClassifier(
    random_state=311,
    criterion='entropy',
    n_estimators=200,
    max_depth=8,
    # max_features=,
    min_samples_split=40,
    min_samples_leaf=10
    )
rfc2 = rfc2.fit(X=x_train, y=t_train)
# Check accuracy on x_valid, t_valid
rfc_accuracy = rfc2.score(X=x_valid, y=t_valid)
print("Valid accuracy for the base RFC: ", rfc_accuracy)
print("Train accuracy for the base RFC: ", rfc2.score(X=x_train, y=t_train))

Valid accuracy for the base RFC:  0.7063492063492064
Train accuracy for the base RFC:  0.7382198952879581


In [None]:
rfc2.score(X=x_test, y=t_test)

0.6428571428571429