In [None]:
import math
import operator
import os
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from scipy.stats import norm
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [None]:
# The annotation file is tab-separated and has no header row, so the column
# names are supplied explicitly.
column_names = ['ID', 'Adjudicated', 'Label', 'Text']
data = pd.read_csv('adjudicated.txt', sep='\t', header=None, names=column_names)
data.head()

Unnamed: 0,ID,Adjudicated,Label,Text
0,1,adjudicated,Adolescent,Summary: Helen Hunt Jackson is probably most f...
1,2,adjudicated,Adult,Summary: Dr. Woodson describes the internal mi...
2,3,adjudicated,Child,"Summary: In the summer, Don and Joyce stay on ..."
3,4,adjudicated,Young Adult,"Summary: ""But the Knyght was a little less tha..."
4,5,adjudicated,Adult,Summary: The young Niel Herbert idolizes Maria...


In [None]:
# 60/20/20 train/dev/test split: hold out 40% first, then halve the held-out
# part. The fixed random_state keeps the partition identical across re-runs.
train_data, temp_data = train_test_split(data, test_size=0.4, random_state=42)
dev_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists; otherwise a fresh checkout fails on the first write.
os.makedirs('splits', exist_ok=True)

# Written without header/index so the files keep the same four-column layout
# as the source file.
train_data.to_csv('splits/train.txt', sep='\t', index=False, header=False)
dev_data.to_csv('splits/dev.txt', sep='\t', index=False, header=False)
test_data.to_csv('splits/test.txt', sep='\t', index=False, header=False)

In [None]:
# Row counts of the three splits (kept as module-level names for later cells).
num_training_data, num_dev_data, num_test_data = (
    len(split) for split in (train_data, dev_data, test_data)
)

print(num_training_data, num_dev_data, num_test_data)

300 100 100


In [None]:
# Download the tokenizer data through the kernel's own nltk rather than
# shelling out to whichever `python` happens to be first on PATH.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Replaced with the original function from OrdinalRegression.ipynb
def load_ordinal_data(filename, ordering):
    """Read a tab-separated annotation file and build ordinal threshold targets.

    Every non-blank line must contain at least four tab-separated columns; the
    third column is the label and the fourth is the text. Blank lines are
    skipped.

    Args:
        filename: path of the UTF-8 encoded, tab-separated file.
        ordering: list of labels sorted from lowest to highest category.

    Returns:
        A tuple ``(X, Y, orig_Y)`` where
        * ``X`` is the list of texts (fourth column, unmodified),
        * ``Y`` is a list of ``len(ordering)`` lists; ``Y[i][j]`` is 1 when the
          label of row ``j`` is strictly above ``ordering[i]`` and 0 otherwise
          (so the list for the highest category is all zeros),
        * ``orig_Y`` is the list of stripped label strings.

    Raises:
        ValueError: if a label does not occur in ``ordering``.
        IndexError: if a non-blank line has fewer than four columns.
    """
    X = []
    Y = [[] for _ in ordering]
    orig_Y = []

    with open(filename, encoding="utf-8") as file:
        for line in file:
            # A stray empty line (e.g. at end of file) carries no record.
            if not line.strip():
                continue

            cols = line.split("\t")
            label = cols[2].strip()
            text = cols[3]

            index = ordering.index(label)

            X.append(text)
            for i, threshold_targets in enumerate(Y):
                threshold_targets.append(1 if index > i else 0)
            orig_Y.append(label)

    return X, Y, orig_Y

In [None]:
# Sanity-check load_ordinal_data on the full file.
ordering=["Child", "Adolescent", "Young Adult", "Adult"]
check_X, check_Y, check_orig_Y = load_ordinal_data("adjudicated.txt", ordering)

# One target list per entry of `ordering`, each aligned with the texts.
assert len(check_Y) == len(ordering)
assert all(len(targets) == len(check_X) for targets in check_Y)
assert len(check_orig_Y) == len(check_X)
# "Above threshold i+1" implies "above threshold i", and nothing is above the top category.
assert all(lower >= upper
           for lower_targets, upper_targets in zip(check_Y, check_Y[1:])
           for lower, upper in zip(lower_targets, upper_targets))
assert not any(check_Y[-1])
print("done loading data")

#X: A list containing the text of each data point.
#Y: A list with one inner list per category in `ordering` (len(ordering) lists in total). Y[i][j] is 1 if data point j has a label strictly above ordering[i] and 0 otherwise. For example, a data point labelled "Adolescent" contributes 1 to Y[0] (above "Child") and 0 to Y[1], Y[2] and Y[3]. The list for the highest category is always all zeros; OrdinalClassifier only uses the first len(ordering) - 1 lists.
#orig_Y: A list containing the original label of each data point.

done loading data


In [None]:
class OrdinalClassifier:
    """Ordinal classifier made of one binary logistic regression per threshold.

    For ordered categories c_0 < c_1 < ... < c_{K-1}, model k (k = 0..K-2)
    estimates P(label > c_k). `test` turns these threshold probabilities into
    per-category scores by differencing adjacent thresholds.
    """

    # Inverse regularisation strengths tried for every threshold model.
    C_VALUES = (0.1, 1, 10, 100)

    def __init__(self, ordinal_values, feature_method, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY, min_feature_count=2):
        """Store the labels and featurize the three splits.

        Args:
            ordinal_values: category labels sorted from lowest to highest.
            feature_method: callable mapping a text to a {feature: value} dict.
            trainX, devX, testX: lists of raw texts.
            trainY, devY, testY: threshold targets; ``Y[k][j]`` is 1 when data
                point ``j`` lies above ``ordinal_values[k]``.
            orig_trainY, orig_devY, orig_testY: original label strings.
            min_feature_count: minimum number of training documents a feature
                must occur in to enter the vocabulary.
        """
        self.ordinal_values=ordinal_values
        self.feature_vocab = {}
        self.feature_method = feature_method
        self.min_feature_count=min_feature_count
        self.log_regs = [None]* (len(self.ordinal_values)-1)

        self.trainY=trainY
        self.devY=devY
        self.testY=testY

        self.orig_trainY=orig_trainY
        self.orig_devY=orig_devY
        self.orig_testY=orig_testY

        # The training split must be processed first: it builds the vocabulary
        # that the dev and test splits are projected onto.
        self.trainX = self.process(trainX, training=True)
        self.devX = self.process(devX, training=False)
        self.testX = self.process(testX, training=False)

    def featurize(self, data):
        """Apply ``feature_method`` to every text and return the list of results."""
        return [self.feature_method(text) for text in data]

    def process(self, X_data, training = False):
        """Featurize ``X_data`` and return it as a sparse (documents x features) matrix.

        With ``training=True`` the feature vocabulary is (re)built from this
        data, keeping features seen in at least ``min_feature_count``
        documents. Features outside the vocabulary are dropped.
        """
        data = self.featurize(X_data)

        if training:
            # Each feature dict has unique keys, so this is a document count.
            feature_doc_count = Counter()
            for feats in data:
                feature_doc_count.update(feats.keys())

            fid = 0
            for feat, doc_count in feature_doc_count.items():
                if doc_count >= self.min_feature_count:
                    self.feature_vocab[feat] = fid
                    fid += 1

        F = len(self.feature_vocab)
        D = len(data)
        X = sparse.dok_matrix((D, F))
        for idx, feats in enumerate(data):
            for feat, value in feats.items():
                column = self.feature_vocab.get(feat)
                if column is not None:
                    X[idx, column] = value

        return X


    def train(self):
        """Fit one logistic regression per threshold, choosing C by dev accuracy."""
        for idx in range(len(self.ordinal_values) - 1):
            # Start below any achievable accuracy so a model is always kept,
            # even when every candidate scores 0 on the dev set.
            best_dev_accuracy=-1.0
            best_model=None
            for C in self.C_VALUES:

                log_reg = linear_model.LogisticRegression(C = C, max_iter=1000)
                log_reg.fit(self.trainX, self.trainY[idx])
                development_accuracy = log_reg.score(self.devX, self.devY[idx])
                # Strict comparison: on ties the smaller C (seen first) wins.
                if development_accuracy > best_dev_accuracy:
                    best_dev_accuracy=development_accuracy
                    best_model=log_reg

            self.log_regs[idx]=best_model

    def test(self):
        """Return the accuracy of the trained models on the test split.

        The predicted category is the arg-max of
        P(c_0) = 1 - P(y > c_0), P(c_k) = P(y > c_{k-1}) - P(y > c_k) and
        P(c_{K-1}) = P(y > c_{K-2}).
        """
        num_classes = len(self.ordinal_values)

        # preds[k][j] = P(label of test point j is above ordinal_values[k])
        preds = np.array([model.predict_proba(self.testX)[:,1] for model in self.log_regs])

        cor=tot=0
        for data_point in range(len(preds[0])):

            ordinal_preds=np.zeros(num_classes)
            ordinal_preds[0]=1-preds[0][data_point]
            for ordinal in range(1, num_classes-1):
                ordinal_preds[ordinal]=preds[ordinal-1][data_point]-preds[ordinal][data_point]
            ordinal_preds[num_classes-1]=preds[len(preds)-1][data_point]

            prediction=np.argmax(ordinal_preds)
            if prediction == self.ordinal_values.index(self.orig_testY[data_point]):
                cor+=1
            tot+=1

        return cor/tot

In [None]:
def binary_bow_featurize(text):
    """Return a binary bag-of-words dict: every lowercased token maps to 1."""
    lowered_tokens = (token.lower() for token in nltk.word_tokenize(text))
    return dict.fromkeys(lowered_tokens, 1)

def confidence_intervals(accuracy, n, significance_level):
    """Normal-approximation confidence interval for an accuracy estimate.

    Args:
        accuracy: observed proportion of correct predictions.
        n: number of evaluated data points.
        significance_level: despite the name, the confidence level
            (e.g. .95 for a 95% interval); (1 - level) / 2 is used as the
            probability mass of each tail.

    Returns:
        ``(lower, upper)`` bounds, i.e. accuracy -/+ z * standard error.
    """
    tail_mass = (1 - significance_level) / 2
    z_score = -1 * norm.ppf(tail_mass)
    std_err = math.sqrt((accuracy * (1 - accuracy)) / n)
    half_width = std_err * z_score
    return accuracy - half_width, accuracy + half_width

def run(trainingFile, devFile, testFile, ordinal_values):
    """Train the bag-of-words ordinal classifier and print its test accuracy.

    Loads the three split files, fits an OrdinalClassifier with
    binary_bow_featurize features, and prints the test accuracy with a 95%
    confidence interval.
    """
    train_split, dev_split, test_split = (
        load_ordinal_data(path, ordinal_values)
        for path in (trainingFile, devFile, testFile)
    )
    trainX, trainY, orig_trainY = train_split
    devX, devY, orig_devY = dev_split
    testX, testY, orig_testY = test_split

    model = OrdinalClassifier(
        ordinal_values, binary_bow_featurize,
        trainX, trainY, devX, devY, testX, testY,
        orig_trainY, orig_devY, orig_testY,
    )
    model.train()
    test_accuracy = model.test()

    # Every threshold list has one entry per test document, so its length is n.
    ci_low, ci_high = confidence_intervals(test_accuracy, len(testY[0]), .95)
    print("Test accuracy for best dev model: %.3f, 95%% CIs: [%.3f %.3f]\n" % (test_accuracy, ci_low, ci_high))

In [None]:
# Split files written by the train/dev/test split cell.
trainingFile, devFile, testFile = (
    'splits/train.txt',
    'splits/dev.txt',
    'splits/test.txt',
)

run(trainingFile, devFile, testFile, ordering)


Test accuracy for best dev model: 0.480, 95% CIs: [0.382 0.578]



# Additional Features in binary_bow_featurize

In [None]:
# Hand-picked lexicon of words associated with children's books.
# NOTE: entries are capitalised, while binary_bow_featurize_additional
# lowercases tokens before looking them up, so these entries only match if the
# lookup lowercases them as well.
# NOTE: multi-word entries ("Fairy tale", "Talking animals", "Magic wand") are
# compared against single tokens and therefore presumably never match.
# NOTE: some entries are repeated ("Adventure", "Quest", "Mystery").
childrens_book_words = [
    "Adventure", "Magic", "Fairy", "Princess", "Dragon", "Castle", "Forest", "Treasure", "Pirate", "Quest",
    "Hero", "Monster", "Unicorn", "Mermaid", "Wizard", "Fairy tale", "Mystery", "Animal", "Friendship", "Brave",
    "Journey", "Enchantment", "Quest", "Spell", "Fairies", "Prince", "Witch", "Ogre", "Knight", "Talking animals",
    "Pixie", "Elf", "Adventure", "Enchanted", "Magic wand", "Wand", "Secret", "Mysterious", "Courage", "Mystery",
    "Wondrous", "Happy", "Smile", "Laughter", "Fun", "Joy", "Playful", "Excitement", "Wonderland", "Dream"
]

# Hand-picked lexicon of words associated with adult books (capitalised, see
# the casing note above).
# NOTE: "Mystery" and "Secret" also appear in childrens_book_words; the
# featurizer checks this list first, so those words count as adult.
# NOTE: some entries are repeated ("Intrigue", "Betrayal").
adult_book_words = [
    "Intrigue", "Conspiracy", "Betrayal", "Romance", "Drama", "Suspense", "Thriller", "Murder", "Mystery", "Detective",
    "Investigation", "Crime", "Passion", "Lust", "Seduction", "Politics", "Power", "Corruption", "Deception", "Scandal",
    "War", "Conflict", "Espionage", "Espouse", "Affair", "Affection", "Tension", "Tragedy", "Legacy", "Destiny",
    "Ambition", "Ambiguous", "Temptation", "Devotion", "Obsession", "Revenge", "Manipulation", "Complication", "Betrayal",
    "Sacrifice", "Redemption", "Intrigue", "Deceit", "Guilt", "Consequences", "Secret", "Revelation", "Truth", "Despair", "Hope"
]
def binary_bow_featurize_additional(text):
    """Binary bag-of-words features with two lexicon indicator features.

    Tokens found in ``adult_book_words`` are collapsed into the single feature
    ``'adult_feat'`` and tokens found in ``childrens_book_words`` into
    ``'child_feat'``; every other lowercased token is its own feature.
    A word present in both lexicons counts as adult (adult is checked first).

    Args:
        text: raw document text.

    Returns:
        dict mapping feature name to 1.
    """
    # The lexicons are stored capitalised while tokens are lowercased below,
    # so the lexicons are lowercased too; otherwise no token would ever match.
    # Sets also give O(1) membership tests.
    adult_vocab = {entry.lower() for entry in adult_book_words}
    child_vocab = {entry.lower() for entry in childrens_book_words}

    feats = {}
    for word in nltk.word_tokenize(text):
        word = word.lower()
        if word in adult_vocab:
            feats['adult_feat'] = 1
        elif word in child_vocab:
            feats['child_feat'] = 1
        else:
            feats[word] = 1
    return feats



def run_additional(trainingFile, devFile, testFile, ordinal_values):
    """Train the ordinal classifier with the lexicon-augmented features.

    Same pipeline as ``run`` but featurizes with
    ``binary_bow_featurize_additional``; prints the test accuracy with a 95%
    confidence interval.
    """
    trainX, trainY, orig_trainY=load_ordinal_data(trainingFile, ordinal_values)
    devX, devY, orig_devY=load_ordinal_data(devFile, ordinal_values)
    testX, testY, orig_testY=load_ordinal_data(testFile, ordinal_values)

    # Use the featurizer defined above; the previously referenced
    # `binary_bow_featurize_adult` does not exist in this notebook.
    simple_classifier = OrdinalClassifier(ordinal_values, binary_bow_featurize_additional, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY)
    simple_classifier.train()
    accuracy=simple_classifier.test()

    lower, upper=confidence_intervals(accuracy, len(testY[0]), .95)
    print("Test accuracy for word features model: %.3f, 95%% CIs: [%.3f %.3f]\n" % (accuracy, lower, upper))

run_additional(trainingFile, devFile, testFile, ordering)


# Word Embeddings

We also tried a second classifier, `OrdinalClassifierWithEmbeddings`, which extends the `OrdinalClassifier` class and uses word-embedding vectors as features.
It has a new featurize function, `word_embedding_featurize`, that uses pretrained word2vec embeddings (`word2vec-google-news-300`) instead of the bag-of-words features of our original model.
We ran the same accuracy and confidence-interval analysis; it turned out not to be our best-performing model, but there is a lot of room for improvement.

The pretrained model is loaded through gensim; see https://radimrehurek.com/gensim/models/word2vec.html



In [None]:
import gensim.downloader as api

model_name = "word2vec-google-news-300"

# Show only the catalogue entry of the model we use; api.info() without a name
# prints the metadata of every corpus and model gensim can download.
print(api.info(model_name))

# Downloads the model on first use (cached afterwards) and loads it in memory.
word2vec_model = api.load(model_name)
print("Done laoding word2vec_model from word2vec-google-news-300 !!!!!!!!!")


In [None]:
from gensim.downloader import load
import numpy as np
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Set for O(1) membership checks inside word_embedding_featurize.
stop_words = set(stopwords.words('english'))

# The previous cell already loads this model; loading it again only repeats a
# slow, memory-hungry step. Load here only if that cell was not run.
if "word2vec_model" not in globals():
    word2vec_model = load("word2vec-google-news-300")

trainX, trainY, orig_trainY=load_ordinal_data(trainingFile, ordering)
devX, devY, orig_devY=load_ordinal_data(devFile, ordering)
testX, testY, orig_testY=load_ordinal_data(testFile, ordering)

def word_embedding_featurize(text, word_embedding_model):
    """Return the mean embedding of a text's non-stopword, in-vocabulary tokens.

    The text is lowercased and tokenized; tokens in ``stop_words`` or missing
    from ``word_embedding_model`` are ignored. When no token remains, a zero
    vector of length ``word_embedding_model.vector_size`` is returned.
    """
    tokens = nltk.word_tokenize(text.lower())
    vectors = [
        word_embedding_model[token]
        for token in tokens
        if token not in stop_words and token in word_embedding_model
    ]
    if not vectors:
        return np.zeros(word_embedding_model.vector_size)
    return np.mean(vectors, axis=0)

class OrdinalClassifierWithEmbeddings(OrdinalClassifier):
    """OrdinalClassifier whose features are averaged word embeddings.

    Each document is represented by one dense vector (the mean embedding
    returned by ``word_embedding_featurize``) instead of a sparse
    bag-of-words row, so no feature vocabulary is built.
    """

    def __init__(self, ordinal_values, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY):
        # Forward the caller's ordinal_values rather than the notebook-level
        # `ordering`, so the class honours its own argument.
        super().__init__(ordinal_values, word_embedding_featurize, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY)

    def featurize(self, data):
        """Return one embedding vector per text, using the global word2vec_model."""
        # One embedding per document: concatenating the same vector twice only
        # duplicates every column without adding information.
        return [self.feature_method(text, word2vec_model) for text in data]

    # Overrides OrdinalClassifier.process
    def process(self, X_data, training = False):
        """Return the dense (documents x embedding dimensions) feature matrix.

        ``training`` is accepted for compatibility with the base class but is
        unused: embedding dimensions are fixed, so there is no vocabulary to
        fit. The vocabulary logic of the base class does not apply here
        because the vector components are values, not feature names.
        """
        data = self.featurize(X_data)
        if not data:
            # np.vstack rejects an empty sequence; keep the column count.
            return np.zeros((0, word2vec_model.vector_size))
        return np.vstack(data)


def run_with_embeddings(trainingFile, devFile, testFile, ordinal_values):
    """Train the embedding-based ordinal classifier and print its test accuracy.

    Loads the three split files given as arguments (instead of relying on
    notebook-level variables), fits an OrdinalClassifierWithEmbeddings for the
    given ``ordinal_values`` and prints the test accuracy with a 95%
    confidence interval.
    """
    trainX, trainY, orig_trainY = load_ordinal_data(trainingFile, ordinal_values)
    devX, devY, orig_devY = load_ordinal_data(devFile, ordinal_values)
    testX, testY, orig_testY = load_ordinal_data(testFile, ordinal_values)

    classifier_with_embeddings = OrdinalClassifierWithEmbeddings(ordinal_values, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY)
    classifier_with_embeddings.train()
    accuracy_with_embeddings = classifier_with_embeddings.test()
    lower, upper = confidence_intervals(accuracy_with_embeddings, len(testY[0]), 0.95)
    print("Test accuracy for model with word embeddings: %.3f, 95%% CIs: [%.3f %.3f]\n" % (accuracy_with_embeddings, lower, upper))

run_with_embeddings(trainingFile, devFile, testFile, ordering)


The word2vec embedding model exhibits a similar phenomenon. It maps each word in the input text (ignoring stopwords and out-of-vocabulary words) to a pretrained word2vec embedding vector, capturing the semantic meaning of words within the data context. The vectors of a text are averaged, and the resulting vector serves as the input features for the `OrdinalClassifierWithEmbeddings`.


In [None]:

# trainingFile = "/splits/train.txt"
# devFile = "/splits/dev.txt"
# testFile = "/splits/test.txt"


# run(trainingFile, devFile, testFile, ordering)
