In [48]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# An implementation of Softmax Regression (aka Multinomial logistic regression) classifier. 
# train and test functions require a pair of lists (document, label), where label is in the format (0,..., 1, ...0) indicating
# to which class the label belongs
# initiated with n for n-gram 

# class computes n-grams

class SoftmaxRegression:
    
    def __init__(self, num_grams, num_features):
        self.num_grams = num_grams
        self.num_features = num_features  
        self.features_built = False
        
    def train(self, training_set):
        print("Building features.")
        self.K = len(training_set[1][0])
        self.grams_features = self.build_features(training_set[0]) # extends grams_features as necessary
        print("Features built.")
        print("Now converting documents to input matrix.")
        X = self.build_input(training_set[0])
        Y = training_set[1]
      
        self.theta = self.initialize_parameters()
        
        self.batch_gradient_descent(X, Y, 2000)
    
    def batch_gradient_descent(self, X, Y, num_iterations = 2000, learning_rate = 0.3):
        m = X.shape[1]
        n = self.num_features  
        print("Initiating batch gradient descent with " , num_iterations, " iterations.")
        for i in range(num_iterations):
            if i % 50 == 0:
                print("completed " , i , " iterations")
            H = self.softmax(np.dot(X, self.theta))
            grads = self.grads(X, Y, H)
            self.theta = self.theta - learning_rate * grads
            
    def test(self, test_set):
        test_X = self.build_input(test_set)
        test_Y = self.softmax(np.dot(test_X, self.theta)) 
        max_indices = np.argmax(test_Y, axis = 1)
        return max_indices
    
    def build_input(self, corpus):
        X = []
        for document in corpus:
            X.append(self.doc_to_gram_vec(document))
        return np.array(X)
    
    def build_features(self, corpus):
        all_grams = []
        for n in range(1, self.num_grams):
            ngrams = []
            
            for doc in corpus:
                ngrams.extend(self.compute_grams(doc, n))
                
            fdist = FreqDist(ngram for ngram in ngrams)
            for phrase in fdist.most_common(self.num_features // self.num_grams):
                all_grams.append(phrase[0]) # add common n-gram
        grams_dict = dict(zip(all_grams, range(len(all_grams)))) # converts into dictionary with positions
        self.features_built = True
        return grams_dict
    
    def doc_to_gram_vec(self, doc): # given document, returns vector representing all features
        assert self.features_built
        doc_vec = np.zeros(self.num_features) 
        doc_grams = []
        for n in range(1, self.num_grams):
            doc_grams.extend(self.compute_grams(doc, n))
            
        for gram in doc_grams:
            if gram in self.grams_features:
                doc_vec[self.grams_features[gram]] = 1
        
        return doc_vec
        
    def compute_grams(self, doc, num_grams):  # given a document, and selected n num_grams, computes all n_grams
        tokens = word_tokenize(doc)
        if num_grams == 1:
            return tokens
        else:
            return ngrams(tokens, num_grams)        
    
    def initialize_parameters(self):
        theta = np.random.randn(self.num_features, self.K) * 0.01
        return theta
    
    def linear_forward(self, W, x, b): # forward pass, representing a single layer
        assert shape(x)[0] == shape(W)[1]
        assert shape(x) == shape(b)
        return np.dot(W,x) + b
    
    def forward_pass(self, X):
        np.dot(X, self.theta)
        
    
    def cost(self, x, y, theta, LAMBDA = 0): 
        J = softmax(np.dot(theta.T, x))[y]
    
    def grads(self, X, Y, H, LAMBDA = 1.5): # grads will be a matrix
        grads = -np.dot(X.T, (Y-H))
        m = X.shape[0]
        grads = (1/m) * grads      
        return grads
    
    def update_parameters(self, grads, learning_rate):
        self.theta = self.theta - (learning_rate * grads)
        
    def sigmoid(self, x):
        z = np.exp(x)
        z = z / (z+1)
        return z
    
    def predict(self, x):
        probs = softmax(np.dot(self.theta.T, x))
        return argmax(probs)
    
    def softmax(self, Z): # Given matrix Z, returns softmax treating each row as a vector
        Z = np.exp(Z)
        denoms = np.sum(Z, axis = 1)
        denoms = denoms.reshape(Z.shape[0], 1)
        return Z / denoms
        
        
        
        

In [2]:
t = np.zeros(5)
s = np.zeros(5)
s[2] = 1
t[3] = 1
w = []
w.append(t)
w.append(s)
w = np.array(w)
q = np.sum(w, axis = 1).reshape(w.shape[0], 1)
j = w / q
maxes = np.argmax(j, axis = 1)
maxes

array([3, 2], dtype=int64)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

The following is a pre-processing step. Used from the NB multinomial code.

In [33]:
import pandas as pd
from pandas import DataFrame

def preprocessing_SST(dictionary_of_phrases_filepath, sentences_filepath, phrases_sentiments_filepath, splits_filepath):
    dictionaryDF = pd.read_table(dictionary_of_phrases_filepath, sep = '|', names = ("phrase", "phrase id"))
    sentencesDF = pd.read_table(sentences_filepath, sep = '\t')
    sentimentsDF = pd.read_table(phrases_sentiments_filepath, sep = '|')
    splitsDF = pd.read_csv(splitsFP)
    
    phrases = dict()
    for row in range(len(dictionaryDF)):
        phrase = dictionaryDF['phrase'][row]
        phraseId = dictionaryDF['phrase id'][row]
        sentiment = valToLabel(sentimentsDF['sentiment values'][phraseId])
        words = phrase.split()
        phrases[phrase] = {
            "id" : phraseId,
            "sentiment" : sentiment
        }
    train_docs = list()
    dev_docs = list()
    test_docs = list()
    for id in sentencesDF['sentence_index']:
        sentence = sentencesDF['sentence'][id - 1]
        if (splitsDF['splitset_label'][id - 1] == 1):
            train_docs.append(sentence)
        elif (splitsDF['splitset_label'][id - 1] == 2):
            test_docs.append(sentence)
        else: 
            dev_docs.append(sentence)  
            
    training = pairsToPairOfLists(makeInputTuples(train_docs, phrases))
    #training[1] = labeled_data_finegrained(training[1])
    test = pairsToPairOfLists(makeInputTuples(test_docs, phrases))
    #test[1] = labeled_data_finegrained(test[1])
    dev = pairsToPairOfLists(makeInputTuples(dev_docs, phrases))
    #dev[1] = labeled_data_finegrained(dev[1])
    
    # MAKE SURE TO CLEAN UP THE LISTS 
    return training, dev, test

def normalize(doc): # given document, returns normalized, negation-tracked version
    terminators = {';', '.', '?', '!', '\n', ':', ','}
    negations = {'not', 'no', 'neither', 'never', 'n\'t'}
    sentence = doc.split()
    normalized_doc = ''
    neg_flag = ''
    for word in sentence:
        #print('Considering word ', word)
        word = neg_flag + word
        if word in negations:
            neg_flag = '__NOT__'
        if word[-1] in terminators:
            neg_flag = ''
            word = word[0:-1]
        normalized_doc = normalized_doc + ' ' + word
    return normalized_doc

def makeInputTuples(docs, phrases_dictionary): # given documents, returns a tuple (docs, labels) where docs is all documents with a label and labels are corresponding labels
    doc_label_pairs = []
    for doc in docs:
        label = docToLabel(doc, phrases_dictionary)
        if label == 'Not found':
            continue
        else:
            doc = normalize(doc)
            doc_label_pairs.append((doc, label))
    return doc_label_pairs


def docToLabel(doc, phrases_dictionary): # given doc, either returns 'Not found' or the appropriate label
    if doc not in phrases_dictionary:
        return 'Not found'
    else:
        return phrases_dictionary[doc]['sentiment']
    
def valToLabel(val):
    
    if (val <= 0.2):
        label = 'very negative'
    elif (val <= 0.4):
        label = 'negative'
    elif (val <= 0.6):
        label = 'neutral'
    elif (val <= 0.8):
        label = 'positive'
    else:
        label = 'very positive'
    return label

def pairsToPairOfLists(list_of_pairs):
    list1 = []
    list2 = []
    for pair in list_of_pairs:
        list1.append(pair[0])
        list2.append(pair[1])
    return [list1, list2]

def labeled_data_finegrained(labels):  #output Y
    conversion = {
        'very negative' : 0,
        'negative' : 1,
        'neutral' : 2,
        'positive' : 3,
        'very positive' : 4
    }
    Y = list()
    for label in labels:
        y = [0, 0, 0, 0, 0]
        y[conversion[label]] = 1
        Y.append(y)
    return Y

def maxIndicesToLabels(max_indices):
    conversion = {
        0 : 'very negative',
        1 : 'negative',
        2 : 'neutral',
        3 : 'positive',
        4 : 'very positive'
    }
    labels = []
    for index in max_indices:
        labels.append(conversion[index])
    return labels
        
print("done")
    




done


In [4]:
def polarity_error_nb(predictions, labels):
   
    total = 0
    polarity_matches = 0
    for i in range(len(labels)):
        if labels[i] == 'neutral':
            continue
        total += 1
        if ((labels[i] == 'positive' or labels[i] == 'very positive') 
            and (predictions[i] == 'positive' or predictions[i] == 'very positive')):
            polarity_matches += 1
        if ((labels[i] == 'negative' or labels[i] == 'very negative') 
            and (predictions[i] == 'negative' or predictions[i] == 'very negative')):
            polarity_matches += 1
    return 1 - polarity_matches / total    
    

def fine_grained_error(predictions, labels):
    matches = 0
    for i in range(len(labels)):
        if labels[i] == predictions[i]:
            matches += 1
    return 1 - matches / len(labels)

In [34]:
dictionaryFP = './stanfordSentimentTreebank/dictionary.txt'
sentencesFP = './stanfordSentimentTreebank/datasetSentences.txt'
sentimentsFP = './stanfordSentimentTreebank/sentiment_labels.txt'
splitsFP = './stanfordSentimentTreebank/datasetSplit.txt'

train, dev, test = preprocessing_SST(dictionaryFP, sentencesFP, sentimentsFP, splitsFP)
print('Done preprocessing and splitting.')


Done preprocessing and splitting.


In [57]:
softmax = SoftmaxRegression(3, 1800)
softmax.train([train[0], labeled_data_finegrained(train[1])])


Building features.
Features built.
Now converting documents to input matrix.
Initiating batch gradient descent with  2000  iterations.
completed  0  iterations
completed  50  iterations
completed  100  iterations
completed  150  iterations
completed  200  iterations
completed  250  iterations
completed  300  iterations
completed  350  iterations
completed  400  iterations
completed  450  iterations
completed  500  iterations
completed  550  iterations
completed  600  iterations
completed  650  iterations
completed  700  iterations
completed  750  iterations
completed  800  iterations
completed  850  iterations
completed  900  iterations
completed  950  iterations
completed  1000  iterations
completed  1050  iterations
completed  1100  iterations
completed  1150  iterations
completed  1200  iterations
completed  1250  iterations
completed  1300  iterations
completed  1350  iterations
completed  1400  iterations
completed  1450  iterations
completed  1500  iterations
completed  1550  ite

In [58]:
answers = softmax.test(test[0])


In [59]:
labels = maxIndicesToLabels(answers)

In [26]:
train_answers = maxIndicesToLabels(softmax.test(train[0]))

In [60]:
polarity_error_nb(labels, test[1])

0.34076615208690675

In [53]:
answers

array([2, 3, 1, ..., 0, 3, 4], dtype=int64)

In [61]:
softmax.theta

array([[-0.21181405,  0.05186096,  0.11044354,  0.20229144, -0.14033383],
       [-0.01493069, -0.10321929, -0.35139686,  0.16178347,  0.31833489],
       [ 0.08796039, -0.02300391, -0.00071382,  0.00483279, -0.09672029],
       ..., 
       [ 0.01779515,  0.00899588,  0.00743604, -0.0035459 , -0.00808997],
       [ 0.00909743, -0.0092661 , -0.00228983,  0.00644285,  0.01114028],
       [ 0.01552446,  0.00431155, -0.00384072, -0.01954182,  0.00473385]])