In [1]:
import re, numpy as np, pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
INPUT  = "/mnt/training_data_clean.csv"
OUTPUT = "/mnt/training_data_with_split.csv"


In [3]:
ID_COL = "student_id"
TARGET = "label"

In [4]:
TEXT_COLS = [
    "In your own words, what kinds of tasks would you use this model for?",
    "Which types of tasks do you feel this model handles best? (Select all that apply.)",
    "For which types of tasks do you feel this model tends to give suboptimal responses? (Select all that apply.)",
    "Think of one task where this model gave you a suboptimal response. What did the response look like, and why did you find it suboptimal?",
    "When you verify a response from this model, how do you usually go about it?"
]

In [5]:
LIKERT_COLS = [
    "How likely are you to use this model for academic tasks?",
    "Based on your experience, how often has this model given you a response that felt suboptimal?",
    "How often do you expect this model to provide responses with references or supporting evidence?",
    "How often do you verify this model's responses?"
]

In [8]:
df = pd.read_csv(INPUT, keep_default_na=True, skipinitialspace=True)

df.replace({u"\u00A0": " "}, regex=True, inplace=True)  # non-breaking spaces -> normal space
df.replace(r"^\s*$", np.nan, regex=True, inplace=True)  # empty or whitespace-only -> NaN

MISSING_TOKENS = {"NA", "N/A", "na", "n/a", "None", "none", "null", "NULL", "Prefer not to say"}
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = df[col].replace(list(MISSING_TOKENS), np.nan)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [10]:
LIKERT_REGEX = re.compile(r"^\s*(\d+)\s*(?:—)?.*$")  # only matches em dash


In [11]:
for c in TEXT_COLS:
    if c in df.columns:
        df[c] = df[c].fillna("no_response")


In [9]:
likert_cols = [
    "How likely are you to use this model for academic tasks?",
    "Based on your experience, how often has this model given you a response that felt suboptimal?",
    "How often do you expect this model to provide responses with references or supporting evidence?",
    "How often do you verify this model's responses?"
]

# Regex to extract the leading number before the em dash —
LIKERT_REGEX = re.compile(r"^\s*(\d+)\s*—?.*$")

for c in likert_cols:
    if c in df.columns:
        # Extract the number and convert to float
        df[c] = df[c].astype(str).str.extract(LIKERT_REGEX)[0].astype(float)

medians = df[likert_cols].median(numeric_only=True)
df[likert_cols] = df[likert_cols].fillna(medians)

# Check a few rows to confirm
df[likert_cols].head()

Unnamed: 0,How likely are you to use this model for academic tasks?,"Based on your experience, how often has this model given you a response that felt suboptimal?",How often do you expect this model to provide responses with references or supporting evidence?,How often do you verify this model's responses?
0,3.0,3.0,1.0,4.0
1,4.0,3.0,1.0,5.0
2,3.0,4.0,2.0,3.0
3,5.0,4.0,3.0,2.0
4,4.0,4.0,2.0,1.0


In [13]:
import pandas as pd

train_df = pd.read_csv("/mnt/train_final.csv")
val_df   = pd.read_csv("/mnt/val_final.csv")
test_df  = pd.read_csv("/mnt/test_final.csv")

print(train_df.shape, val_df.shape, test_df.shape)
print(train_df.columns)

(576, 26) (123, 26) (126, 26)
Index(['tasks_open', 'academic_scale', 'suboptimal_scale', 'suboptimal_open',
       'ref_scale', 'verify_scale', 'verify_open', 'label',
       'Brainstorming or generating creative ideas',
       'Converting content between formats ', 'Data processing or analysis',
       'Drafting professional text ', 'Explaining complex concepts simply',
       'Math computations', 'NA', 'Writing or debugging code',
       'Writing or editing essays/reports',
       'Brainstorming or generating creative ideas.1',
       'Converting content between formats .1',
       'Data processing or analysis.1', 'Drafting professional text .1',
       'Explaining complex concepts simply.1', 'Math computations.1', 'NA.1',
       'Writing or debugging code.1', 'Writing or editing essays/reports.1'],
      dtype='object')


In [14]:
# 1) Text columns in the new data
TEXT_COLS = ["tasks_open", "suboptimal_open", "verify_open"]

# 2) Numeric columns (4 scales + all the select-all task dummies)
NUMERIC_COLS = [
    "academic_scale",
    "suboptimal_scale",
    "ref_scale",
    "verify_scale",
    # “best tasks” 0/1
    "Brainstorming or generating creative ideas",
    "Converting content between formats ",
    "Data processing or analysis",
    "Drafting professional text ",
    "Explaining complex concepts simply",
    "Math computations",
    "NA",
    "Writing or debugging code",
    "Writing or editing essays/reports",
    # “suboptimal tasks” 0/1
    "Brainstorming or generating creative ideas.1",
    "Converting content between formats .1",
    "Data processing or analysis.1",
    "Drafting professional text .1",
    "Explaining complex concepts simply.1",
    "Math computations.1",
    "NA.1",
    "Writing or debugging code.1",
    "Writing or editing essays/reports.1",
]

# Fill missing text with a token
for df in (train_df, val_df, test_df):
    for col in TEXT_COLS:
        df[col] = df[col].fillna("no_response")

def make_combined_text(df):
    return df[TEXT_COLS].astype(str).agg(" ".join, axis=1)

train_df["combined_text"] = make_combined_text(train_df)
val_df["combined_text"]   = make_combined_text(val_df)
test_df["combined_text"]  = make_combined_text(test_df)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Targets
y_train = train_df["label"]
y_val   = val_df["label"]
y_test  = test_df["label"]

# 2.1 Text → TF-IDF (you can tweak these hyperparams)
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 1),
    min_df=3,
    sublinear_tf=True,
)

X_train_text = tfidf.fit_transform(train_df["combined_text"])
X_val_text   = tfidf.transform(val_df["combined_text"])
X_test_text  = tfidf.transform(test_df["combined_text"])

# 2.2 Numeric features → StandardScaler
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse compatibility

X_train_num = scaler.fit_transform(train_df[NUMERIC_COLS].astype(float))
X_val_num   = scaler.transform(val_df[NUMERIC_COLS].astype(float))
X_test_num  = scaler.transform(test_df[NUMERIC_COLS].astype(float))

# 2.3 Combine text + numeric
X_train = hstack([X_train_text, X_train_num])
X_val   = hstack([X_val_text,   X_val_num])
X_test  = hstack([X_test_text,  X_test_num])

print("Train feature shape:", X_train.shape)
print("Val feature shape:",   X_val.shape)
print("Test feature shape:",  X_test.shape)

Train feature shape: (576, 1327)
Val feature shape: (123, 1327)
Test feature shape: (126, 1327)


In [18]:
softmax = LogisticRegression(
    multi_class="multinomial",  # THIS makes it softmax
    solver="lbfgs",
    C=0.5,
    max_iter=1000
)

softmax.fit(X_train, y_train)

# Evaluate on validation
val_pred = softmax.predict(X_val)
print("=== VALIDATION ===")
print("Accuracy:", accuracy_score(y_val, val_pred))
print(classification_report(y_val, val_pred, digits=3))



=== VALIDATION ===
Accuracy: 0.7073170731707317
              precision    recall  f1-score   support

     ChatGPT      0.795     0.854     0.824        41
      Claude      0.612     0.732     0.667        41
      Gemini      0.733     0.537     0.620        41

    accuracy                          0.707       123
   macro avg      0.714     0.707     0.703       123
weighted avg      0.714     0.707     0.703       123



In [19]:
from scipy.sparse import vstack

X_trainval = vstack([X_train, X_val])
y_trainval = np.concatenate([y_train.values, y_val.values])

softmax.fit(X_trainval, y_trainval)

test_pred = softmax.predict(X_test)
print("\n=== TEST ===")
print("Accuracy:", accuracy_score(y_test, test_pred))
print(classification_report(y_test, test_pred, digits=3))
print("\nConfusion matrix (rows=true, cols=pred):\n", confusion_matrix(y_test, test_pred))
from sklearn.metrics import accuracy_score

# Predict on the training set
train_pred = softmax.predict(X_train)

# Compute accuracy
train_acc = accuracy_score(y_train, train_pred)

print("=== TRAINING ACCURACY ===")
print("Training accuracy:", train_acc)




=== TEST ===
Accuracy: 0.7063492063492064
              precision    recall  f1-score   support

     ChatGPT      0.809     0.905     0.854        42
      Claude      0.651     0.667     0.659        42
      Gemini      0.639     0.548     0.590        42

    accuracy                          0.706       126
   macro avg      0.700     0.706     0.701       126
weighted avg      0.700     0.706     0.701       126


Confusion matrix (rows=true, cols=pred):
 [[38  2  2]
 [ 3 28 11]
 [ 6 13 23]]
=== TRAINING ACCURACY ===
Training accuracy: 0.765625
