In [None]:
import math
import operator
import os
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from scipy.stats import norm
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [None]:
# Load the adjudicated corpus. The file is tab-separated and is read without a
# header row, so the four column names are supplied explicitly.
data = pd.read_csv(
    'adjudicated.txt',
    sep='\t',
    header=None,
    names=['ID', 'Adjudicated', 'Label', 'Text'],
)
data.head()

Unnamed: 0,ID,Adjudicated,Label,Text
0,1,adjudicated,Adolescent,Summary: Helen Hunt Jackson is probably most f...
1,2,adjudicated,Adult,Summary: Dr. Woodson describes the internal mi...
2,3,adjudicated,Child,"Summary: In the summer, Don and Joyce stay on ..."
3,4,adjudicated,Young Adult,"Summary: ""But the Knyght was a little less tha..."
4,5,adjudicated,Adult,Summary: The young Niel Herbert idolizes Maria...


In [None]:
# Split the corpus 60/20/20 into train/dev/test. The fixed random_state keeps
# the partition identical across kernel restarts.
train_data, temp_data = train_test_split(data, test_size=0.4, random_state=42)
dev_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs('splits', exist_ok=True)

# Write each split as a headerless, tab-separated file (no index column).
# NOTE(review): to_csv applies its default CSV quoting, while load_ordinal_data
# splits lines on raw tabs; texts containing double quotes may therefore be read
# back with quoting artifacts -- confirm whether this matters for the features.
train_data.to_csv('splits/train.txt', sep='\t', index=False, header=False)
dev_data.to_csv('splits/dev.txt', sep='\t', index=False, header=False)
test_data.to_csv('splits/test.txt', sep='\t', index=False, header=False)

In [None]:
# Number of rows in each split, in train / dev / test order.
num_training_data, num_dev_data, num_test_data = (
    frame.shape[0] for frame in (train_data, dev_data, test_data)
)

print(num_training_data, num_dev_data, num_test_data)

300 100 100


In [None]:
# Fetch the Punkt tokenizer data used by nltk.word_tokenize. Calling
# nltk.download from the kernel avoids depending on whichever `python` happens
# to be first on the shell PATH, and matches the nltk.download calls used for
# the other corpora in this notebook. The trailing semicolon hides the
# boolean return value from the cell output.
nltk.download('punkt');

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Rewritten from the original loader, which mistakenly treated the header line of the data file
# as the first row of actual data.

def load_ordinal_data(filename, ordering):
    """Load a tab-separated file of labelled texts for ordinal classification.

    Every non-blank line is treated as a data row (no header line is skipped).
    The line is split on tabs; column 2 (0-based) is the label and column 3 is
    the text. Columns 0 and 1 are not used.

    Args:
        filename: Path of a UTF-8 encoded, tab-separated file.
        ordering: Ordinal labels from lowest to highest, e.g.
            ["Child", "Adolescent", "Young Adult", "Adult"].

    Returns:
        A tuple (X, Y, orig_Y):
            X: list with the text column of each data point, exactly as read
               (a trailing newline on the line is kept).
            Y: list of len(ordering) lists, one per threshold. Y[i][j] is 1
               when data point j's label ranks strictly above ordering[i] and
               0 otherwise, so the last list is always all zeros.
            orig_Y: list with the whitespace-stripped label of each data point.

    Raises:
        ValueError: if a label does not appear in ``ordering``.
        IndexError: if a non-blank line has fewer than four columns.
    """
    X = []
    orig_Y = []
    Y = [[] for _ in ordering]

    with open(filename, encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            # Blank lines (typically a trailing newline at the end of the file)
            # carry no data; skipping them avoids an IndexError on cols[2].
            if not line.strip():
                continue

            cols = line.split("\t")
            label = cols[2].strip()
            text = cols[3]

            try:
                index = ordering.index(label)
            except ValueError:
                raise ValueError(
                    "%s, line %d: label %r is not one of %r"
                    % (filename, line_number, label, list(ordering))
                ) from None

            X.append(text)
            # Cumulative ("is the label above threshold i?") binary encoding.
            for i in range(len(ordering)):
                Y[i].append(1 if index > i else 0)
            orig_Y.append(label)

    return X, Y, orig_Y

In [None]:
# Sanity check: run load_ordinal_data on the full corpus and inspect a small sample.
ordering=["Child", "Adolescent", "Young Adult", "Adult"]

# What load_ordinal_data returns:
# X: A list containing the text of each data point.
# Y: A list of len(ordering) lists, one per ordinal threshold. Y[i][j] is 1 when data point j's
#    label ranks strictly above ordering[i] and 0 otherwise. For example, a data point labelled
#    "Adolescent" contributes 1 to Y[0] (it is above "Child") and 0 to Y[1], Y[2] and Y[3].
#    The last list is always all zeros; training only uses the first len(ordering) - 1 lists.
# orig_Y: A list containing the original label of each data point.
check_X, check_Y, check_orig_Y = load_ordinal_data("adjudicated.txt", ordering)

# Show sizes and the first few labels rather than echoing every text in the corpus.
print(len(check_X), [len(threshold_labels) for threshold_labels in check_Y])
check_orig_Y[:5], [threshold_labels[:5] for threshold_labels in check_Y]

In [None]:
class OrdinalClassifier:
    """Ordinal classifier made of one binary logistic regression per threshold.

    For K ordered labels, K-1 binary models are kept in ``self.log_regs``.
    Model ``i`` is fit on ``trainY[i]``, and ``test`` uses its positive-class
    probability as P(label ranks above ``ordinal_values[i]``).
    """

    def __init__(self, ordinal_values, feature_method, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY):
        """Store the labels and turn the raw inputs into sparse feature matrices.

        Args:
            ordinal_values: Labels ordered from lowest to highest.
            feature_method: Callable mapping one raw input to a dict
                ``{feature: value}``.
            trainX, devX, testX: Sequences of raw inputs given to
                ``feature_method``.
            trainY, devY, testY: Per-threshold binary targets; element ``i``
                holds the targets used for threshold ``i``.
            orig_trainY, orig_devY, orig_testY: Original label of each data
                point (``orig_testY`` is what ``test`` scores against).
        """
        self.ordinal_values=ordinal_values
        self.feature_vocab = {}
        self.feature_method = feature_method
        # A feature must occur in at least this many training documents to
        # enter the vocabulary.
        self.min_feature_count=2
        self.log_regs = [None]* (len(self.ordinal_values)-1)

        self.trainY=trainY
        self.devY=devY
        self.testY=testY

        self.orig_trainY=orig_trainY
        self.orig_devY=orig_devY
        self.orig_testY=orig_testY

        # The vocabulary is built from the training split only; dev and test
        # reuse it, so features unseen in training are dropped.
        self.trainX = self.process(trainX, training=True)
        self.devX = self.process(devX, training=False)
        self.testX = self.process(testX, training=False)

    def featurize(self, data):
        """Apply ``feature_method`` to every item and return the list of feature dicts."""
        return [self.feature_method(text) for text in data]

    def process(self, X_data, training = False):
        """Featurize ``X_data`` and return it as a sparse (documents x features) matrix.

        Args:
            X_data: Sequence of raw inputs.
            training: When True, (re)build ``self.feature_vocab`` from this
                data, keeping features found in at least
                ``self.min_feature_count`` documents.

        Returns:
            A SciPy CSR matrix with one row per item of ``X_data`` and one
            column per vocabulary feature.
        """
        data = self.featurize(X_data)

        if training:
            feature_doc_count = Counter()
            for feats in data:
                for feat in feats:
                    feature_doc_count[feat] += 1

            # Build a fresh mapping so that processing training data a second
            # time cannot leave stale features sharing ids with new ones.
            kept = [
                feat
                for feat, doc_count in feature_doc_count.items()
                if doc_count >= self.min_feature_count
            ]
            self.feature_vocab = {feat: fid for fid, feat in enumerate(kept)}

        F = len(self.feature_vocab)
        D = len(data)
        X = sparse.dok_matrix((D, F))
        for idx, feats in enumerate(data):
            for feat, value in feats.items():
                fid = self.feature_vocab.get(feat)
                if fid is not None:
                    X[idx, fid] = value

        # DOK is convenient for incremental construction; convert to CSR once
        # here instead of on every fit/score/predict call.
        return X.tocsr()

    def train(self):
        """Fit one logistic regression per threshold, choosing C by dev accuracy.

        For each threshold the model with the highest dev accuracy among
        C in [0.1, 1, 10, 100] is stored in ``self.log_regs``; ties keep the
        first (smallest) C.
        """
        for idx in range(len(self.ordinal_values) - 1):
            best_dev_accuracy = None
            best_model = None
            for C in [0.1, 1, 10, 100]:
                log_reg = linear_model.LogisticRegression(C = C, max_iter=1000)
                log_reg.fit(self.trainX, self.trainY[idx])
                development_accuracy = log_reg.score(self.devX, self.devY[idx])
                # Accept the first candidate unconditionally so a model is
                # always stored, even when every dev accuracy is 0.
                if best_model is None or development_accuracy > best_dev_accuracy:
                    best_dev_accuracy = development_accuracy
                    best_model = log_reg

            self.log_regs[idx] = best_model

    def test(self):
        """Return the accuracy of the trained models on the test split.

        Class probabilities are derived from the threshold probabilities
        p_k = P(label above ordinal_values[k]):
            P(class 0)   = 1 - p_0
            P(class k)   = p_{k-1} - p_k      for 0 < k < K-1
            P(class K-1) = p_{K-2}
        The predicted class is the argmax, compared against ``orig_testY``.
        """
        # preds[k][j]: positive-class probability of threshold model k for test point j.
        preds = np.array([
            log_reg.predict_proba(self.testX)[:, 1] for log_reg in self.log_regs
        ])

        class_probs = np.vstack([1 - preds[:1], preds[:-1] - preds[1:], preds[-1:]])
        predictions = np.argmax(class_probs, axis=0)

        n_points = preds.shape[1]
        gold = np.array([
            self.ordinal_values.index(self.orig_testY[data_point])
            for data_point in range(n_points)
        ])

        cor = int(np.sum(predictions == gold))
        return cor / n_points

In [None]:
# Experiment log (presumably test accuracy): adding the lemmatizer moved
# .48 -> .44, and adding stopword removal then moved .44 -> .47.
# Download the NLTK resources used by the featurizer: WordNet and the stopword lists.
for nltk_package in ('wordnet', 'stopwords'):
    nltk.download(nltk_package)

# Lazily-filled cache so the lemmatizer and the stopword set are built once
# instead of on every call to binary_bow_featurize.
_BOW_RESOURCES = {}


def _get_bow_resources():
    """Return the shared (lemmatizer, stop_words) pair, creating it on first use."""
    if not _BOW_RESOURCES:
        _BOW_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _BOW_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _BOW_RESOURCES['lemmatizer'], _BOW_RESOURCES['stop_words']


def binary_bow_featurize(text):
    """Return binary bag-of-words features for one text.

    The text is tokenized with ``nltk.word_tokenize``; each token is
    lowercased, dropped if it is an English stopword, and otherwise
    lemmatized with ``WordNetLemmatizer``.

    Args:
        text: Raw text of one document.

    Returns:
        A dict mapping every surviving lemma to 1 (presence, not counts).
    """
    lemmatizer, stop_words = _get_bow_resources()
    feats = {}

    for word in nltk.word_tokenize(text):
        word = word.lower()
        if word not in stop_words:
            feats[lemmatizer.lemmatize(word)] = 1

    return feats

def confidence_intervals(accuracy, n, significance_level):
    """Two-sided normal-approximation confidence interval for an accuracy.

    Args:
        accuracy: Observed proportion of correct predictions.
        n: Number of evaluated examples.
        significance_level: Despite its name, this is used as the confidence
            level: passing .95 puts (1 - .95) / 2 in each tail, i.e. a 95%
            interval.

    Returns:
        A tuple (lower, upper). The bounds are not clipped to [0, 1].
    """
    tail_mass = (1 - significance_level) / 2
    # ppf of the lower tail is negative; flip the sign to get the z-score.
    z_score = -1 * norm.ppf(tail_mass)
    standard_error = math.sqrt((accuracy * (1 - accuracy)) / n)
    half_width = standard_error * z_score
    return accuracy - half_width, accuracy + half_width

def run(trainingFile, devFile, testFile, ordinal_values):
    """Train the bag-of-words ordinal classifier and print its test accuracy.

    Loads the three splits with ``load_ordinal_data``, builds an
    ``OrdinalClassifier`` on ``binary_bow_featurize`` features, trains it, and
    prints the test accuracy together with its 95% confidence interval.

    Args:
        trainingFile: Path of the training split.
        devFile: Path of the development split.
        testFile: Path of the test split.
        ordinal_values: Labels ordered from lowest to highest.

    Returns:
        None; the result is only printed.
    """
    loaded_splits = [
        load_ordinal_data(path, ordinal_values)
        for path in (trainingFile, devFile, testFile)
    ]
    (trainX, trainY, orig_trainY), (devX, devY, orig_devY), (testX, testY, orig_testY) = loaded_splits

    simple_classifier = OrdinalClassifier(
        ordinal_values, binary_bow_featurize,
        trainX, trainY, devX, devY, testX, testY,
        orig_trainY, orig_devY, orig_testY,
    )
    simple_classifier.train()
    accuracy = simple_classifier.test()

    # Every per-threshold label list has one entry per test point, so the
    # first one gives the test-set size.
    n_test_points = len(testY[0])
    lower, upper = confidence_intervals(accuracy, n_test_points, .95)
    print("Test accuracy for best dev model: %.3f, 95%% CIs: [%.3f %.3f]\n" % (accuracy, lower, upper))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


- BoW: lemmatization, stopword removal
- Feature engineering: TF-IDF vectorization, word embeddings, POS tags
- Architecture: Bi-LSTM, attention mechanisms, BERT
- Self-training

In [None]:
# Paths of the train/dev/test splits written earlier in the notebook.
trainingFile, devFile, testFile = (
    'splits/train.txt',
    'splits/dev.txt',
    'splits/test.txt',
)

run(trainingFile, devFile, testFile, ordering)

Test accuracy for best dev model: 0.470, 95% CIs: [0.372 0.568]

