In [1]:
# Major parts of the code borrowed from: https://github.com/surabhisnath/aut-utility-prediction/blob/main/Main_Analysis_Script.ipynb

In [2]:
import glob
import os
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords, wordnet as wn

# Fetch the NLTK resources this notebook relies on; NLTK reports
# "already up-to-date" for packages that are present locally.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
# English stop words as a set; needs the 'stopwords' corpus downloaded on the previous line.
stop_words = set(stopwords.words('english'))
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /Users/snath/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/snath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/snath/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/snath/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Functions

In [3]:
# Function for reading in Stevenson, 2020 data files and uniting rater_01 and rater_02 files
def read_files(path):
    """
    Read the Stevenson (2020) csv files in ``path`` and unite the rater01 and rater02 files.

    Every csv whose file name contains '_rater01' supplies the base rows. Every
    other csv is treated as a rater02 file, of which only the two id columns
    and the two rater02 score columns are kept. The rater02 scores are then
    left-joined onto the rater01 rows on (response_id, respondent_id), so
    rater01 rows without a rater02 match keep missing rater02 scores.

    :param path: string with path to the directory that holds the csv files
    :return dataset: merged dataset; 'translated_response', 'response_id' and
                'respondent_id' are cast to str
    :raises ValueError: if ``path`` holds no rater01 or no rater02 csv file
    """
    # glob returns matches in arbitrary order; sort so the row order of the
    # merged frame (and anything downstream that keeps "the first" row) is reproducible.
    all_files = sorted(glob.glob(path + "/*.csv"))
    rater1_frames = []
    rater2_frames = []

    for filename in all_files:
        # Sniff the delimiter from the header row: a single parsed column
        # means the file is ';'-separated rather than ','-separated.
        header = pd.read_csv(filename, index_col=None, header=0, nrows=1, encoding='latin1')
        sep = ';' if len(header.columns) == 1 else ','
        df = pd.read_csv(filename, index_col=None, header=0, encoding='latin1', sep=sep)

        # Classify on the file name only, so a '_rater01' in a directory name
        # cannot send every file to the rater01 list.
        if '_rater01' in os.path.basename(filename):
            rater1_frames.append(df)
        else:
            rater2_frames.append(df.loc[:, ['response_id', 'respondent_id', 'originality_rater02', 'utility_rater02']])

    if not rater1_frames or not rater2_frames:
        raise ValueError(
            "read_files needs at least one '_rater01' csv and one rater02 csv in "
            + repr(path)
            + "; found "
            + str(len(rater1_frames))
            + " rater01 and "
            + str(len(rater2_frames))
            + " rater02 file(s)"
        )

    frameR1 = pd.concat(rater1_frames, axis=0, ignore_index=True)
    frameR2 = pd.concat(rater2_frames, axis=0, ignore_index=True)

    df = frameR1.merge(frameR2, on=['response_id', 'respondent_id'],
                       how='left')

    df["translated_response"] = df["translated_response"].astype(str)
    df["response_id"] = df["response_id"].astype(str)
    df["respondent_id"] = df["respondent_id"].astype(str)

    return df


# Ensuring they are adjectives and at least 3 characters
def is_adjective_and_long_enough(word):
    """
    Tell whether ``word`` has at least 3 characters and at least one WordNet adjective synset.

    :param word: string to check
    :return: True when both conditions hold, False otherwise
    """
    # Short words are rejected before WordNet is consulted.
    if len(word) < 3:
        return False
    adjective_synsets = wn.synsets(word, pos=wn.ADJ)
    return len(adjective_synsets) > 0


# Function for dropping invalid answers
def drop_invalid(df):
    """
    Drops all answers that were either empty, had a rating of 0 for at least one score,
    or of which the respondent number was 9999 (indicating an invalid respondent)

    The input frame is not modified. The filters run in this order: zero
    ratings, respondent 9999, empty answers; a row is reported under the first
    filter that removes it.

    :param df: dataset with all columns needed for further steps
    :return dataset, dropped_data:  dataset without invalid data,
                dataset of invalid data (keeps the helper column 'valid':
                0 for rows dropped because of a zero rating, 1 otherwise)
    """
    rating_cols = ['utility_rater01', 'utility_rater02', 'originality_rater01', 'originality_rater02']
    zero_rated = (df[rating_cols] == 0).any(axis=1)

    # assign() returns a new frame, so the caller's frame does not gain a
    # 'valid' column. The flag is passed as a plain array to avoid index
    # alignment, which matters when the frame carries repeated index labels.
    df = df.assign(valid=(~zero_rated).astype('int64').to_numpy())

    # Dropping answers rated as 0 by at least one rater
    df_invalid = df[df['valid'] == 0]
    df = df[df['valid'] != 0]

    # Dropping respondent_id that seems to belong to no one.
    # The id may be stored as int, float or str ('9999'), so compare
    # numerically; ids that are not numbers become NaN and never match.
    is_placeholder = pd.to_numeric(df['respondent_id'], errors='coerce') == 9999
    df_strange = df[is_placeholder]
    df = df[~is_placeholder]

    # Dropping empty answers
    # NOTE(review): this only matches the literal string 'nan' (what a missing
    # value becomes after astype(str)); real NaN values are not matched here.
    # Confirm whether 'original_response' is cast to str before this call.
    df_empty = df[df['original_response'] == 'nan']
    df = df[df['original_response'] != 'nan']
    df = df.drop(columns=['valid'])

    df_dropped = pd.concat([df_empty, df_strange, df_invalid], axis=0, ignore_index=True)

    return df, df_dropped


# Function for cleaning valid responses
def clean_response(dataset, col_response):
    """
    Normalise the free-text responses held in one column of ``dataset``.

    Each response is lower-cased, has every character that is neither a word
    character nor whitespace replaced by a space, loses isolated single word
    characters, and has its whitespace collapsed to single spaces. The column
    is overwritten on ``dataset`` itself.

    :param dataset: dataset which include column(s) of responses
    :param col_response: column name of responses to be cleaned
    :return dataset: the same dataset object, with the column cleaned
    """
    def _tidy(text):
        lowered = text.lower()
        no_signs = re.sub(r'[^\w\s]', ' ', lowered)      # punctuation / symbols -> space
        no_loose = re.sub(r'\b\w\b', ' ', no_signs)      # stand-alone single characters -> space
        # strip the ends, then collapse every run of whitespace to one space
        return ' '.join(no_loose.strip().split())

    dataset[col_response] = [_tidy(raw) for raw in dataset[col_response]]
    return dataset

## Loading data

In [5]:
# Reading in files where encoding needs to be detected (change file paths as needed)
# NOTE: the two LLM loads below are disabled. Re-enabling them also needs
# `import chardet`, which the import cell of this notebook does not contain.
# with open('../data/Nath-2024-LLM.csv', 'rb') as f:
#     result = chardet.detect(f.read())
# df1 = pd.read_csv('../data/Nath-2024-LLM.csv', encoding=result['encoding'])

# with open('../data/Hubert-2024-LLM.csv', 'rb') as f:
#     result = chardet.detect(f.read())
# df2 = pd.read_csv('../data/Hubert-2024-LLM.csv', encoding=result['encoding'])

# Loading the rest of the data files (change file paths as needed)
df3 = pd.read_csv('../data/Nath-2024-human.csv')
df4 = pd.read_csv('../data/Stevenson-2022-human.csv')
# Stevenson 2020 is a directory of per-rater csv files that read_files merges into one frame
df5 = read_files('../data/Stevenson-2020-human')
# df6 = pd.read_excel('../data/additional-LLM.xlsx')

# Uniting all data files in one dataframe
# (row-wise concat of the three human datasets only; each frame keeps its own
# index labels, so labels can repeat in the result)
df = pd.concat([df3, df4, df5], axis=0)

In [7]:
# Sanity check: does the Nath-2024-human frame (df3) have a "research_id" column?
"research_id" in df3.columns

True

## Cleaning Data

In [5]:
# Cast to str so clean_response can call string methods on every row
# (missing responses become the literal string 'nan').
df['translated_response'] = df['translated_response'].astype(str)
df, df_dropped = drop_invalid(df)
df = clean_response(df, 'translated_response')

# Surabhi additions
df = df[["object", "translated_response", "utility_rater01", "utility_rater02"]]
# Keep only responses rated by both raters. This has to run BEFORE the
# de-duplication: drop_duplicates keeps the first occurrence, so an unrated
# first occurrence would otherwise shadow a fully rated duplicate and the
# response would disappear from the dataset altogether.
df = df.dropna(subset=["utility_rater01", "utility_rater02"])
df = df.drop_duplicates(subset=['object', 'translated_response']).reset_index(drop=True)
df = df.rename(columns={"object": "target object", "translated_response": "alternate use", "utility_rater01": "utility_r1", "utility_rater02": "utility_r2"})
# Mean of the two raters' utility scores
df["mean_utility"] = (df["utility_r1"] + df["utility_r2"])/2
df = df.sort_values(by=["target object"])
df = df.reset_index(drop=True)

In [None]:
# Display the cleaned dataframe
df

## Save csv

In [7]:
# Write the cleaned dataset to disk; index=False leaves the row index out of the file
df.to_csv("../data/aut.csv", index=False)