In [16]:
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.datasets import load_digits 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import string
import re 
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [34]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s).

    Returns
    -------
    Same shape as ``x``: element-wise sigmoid, each value in [0, 1].
    """
    x = np.asarray(x)
    # exp() is only ever applied to a non-positive number, so it cannot
    # overflow no matter how large |x| is.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x)      x < 0: e^x / (1 + e^x)   (same function)
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # [()] turns a 0-d result back into a NumPy scalar and leaves real arrays as-is.
    return result[()]

def logPredict(X, w):
    """Return the sigmoid of the linear scores ``np.dot(X, w)``."""
    linearScores = np.dot(X, w)
    return sigmoid(linearScores)

def logAvgCost(X, y, w, regularization=None, lmda=0):
    """Average binary cross-entropy of the logistic model, plus an optional penalty.

    Parameters
    ----------
    X : array of input features, one row per sample.
    y : array of 0/1 targets; its first dimension is the sample count.
    w : weight array passed to ``logPredict`` together with ``X``.
    regularization : 'l1', 'l2' or anything else (no penalty).
    lmda : regularization strength handed to ``l1Cost`` / ``l2Cost``.

    Returns
    -------
    (sum of per-sample losses + penalty) / y.shape[0]
    """
    predict = logPredict(X, w)
    # A saturated prediction (exactly 0 or 1) would make log() return -inf and
    # the product with a zero target 0 * -inf = nan. Keeping predictions one
    # machine epsilon away from the boundaries keeps the cost finite.
    eps = np.finfo(float).eps
    predict = np.clip(predict, eps, 1.0 - eps)
    cost = -(y * np.log(predict) + (1.0 - y) * np.log(1.0 - predict))
    totalCost = cost.sum()
    if regularization == 'l1':
        totalCost += l1Cost(lmda, w)
    elif regularization == 'l2':
        totalCost += l2Cost(lmda, w)
    # Note: the penalty is added before dividing, so it is scaled by 1/n as well.
    return totalCost / y.shape[0]

def l1Cost(lmda, w):
    """L1 penalty: ``lmda`` times the sum of absolute weights."""
    magnitudes = np.abs(w)
    return lmda * magnitudes.sum()

def l2Cost(lmda, w):
    """L2 penalty: ``lmda`` times the sum of squared weights."""
    squares = np.square(w)
    return lmda * squares.sum()

def l1Grad(lmda, w):
    """(Sub)gradient of the L1 penalty ``lmda * sum(|w|)`` with respect to ``w``.

    Non-zero weights get ``lmda * sign(w)``. At a weight that is exactly zero
    the penalty is not differentiable; there a random subgradient drawn
    uniformly from [-lmda, lmda) is used.

    Parameters
    ----------
    lmda : scalar regularization strength.
    w : array-like of weights (any shape).

    Returns
    -------
    Float array with the same shape as ``w``.
    """
    w = np.asarray(w)
    gradient = np.asarray(lmda * np.sign(w), dtype=float)
    # Select the zero weights with a boolean mask on w itself. Testing the
    # gradient for zeros would flag every entry when lmda == 0, and indexing
    # with (row, col) coordinate pairs would address whole rows instead of
    # single elements.
    atZero = (w == 0)
    nZero = int(np.count_nonzero(atZero))
    if nZero:
        # One independent draw per zero weight, scaled into [-lmda, lmda).
        gradient[atZero] = lmda * (2.0 * np.random.rand(nZero) - 1.0)
    return gradient

def l2Grad(lmda, w):
    """Gradient of the L2 penalty ``lmda * sum(w**2)``: ``2 * lmda * w``."""
    doubledStrength = lmda * 2
    return doubledStrength * w

def logAvgGrad(X, y, w, regularization=None, lmda=0):
    """Gradient of the averaged logistic cost with respect to ``w``.

    Computes ``X.T · (prediction - y)``, adds the L1 or L2 penalty gradient
    when ``regularization`` is 'l1' or 'l2', and divides the total by the
    number of rows in ``y``.
    """
    residual = logPredict(X, w) - y
    gradient = np.dot(X.T, residual)

    # Pick the penalty term, if any; other values mean "no regularization".
    if regularization == 'l1':
        penalty = l1Grad(lmda, w)
    elif regularization == 'l2':
        penalty = l2Grad(lmda, w)
    else:
        penalty = None

    if penalty is not None:
        gradient += penalty

    nSamples = y.shape[0]
    return gradient / nSamples

def updateWeightsGD(X, y, w, lr, regularization=None, lmda=0):
    """Take one gradient-descent step and return the new weights.

    The step is ``lr`` times the gradient from ``logAvgGrad``; ``w`` itself is
    not modified.
    """
    step = lr * logAvgGrad(X, y, w, regularization, lmda)
    return w - step

def trainGD(X, y, lr, numIters, logFreq, regularization=None, lmda=0):
    """Train a binary logistic model with plain gradient descent.

    Weights start as uniform random values of shape (X.shape[1], 1). Each
    iteration evaluates the cost at the current weights, then updates them.
    Every ``logFreq``-th iteration (starting at 0) the cost is printed and
    recorded.

    Returns
    -------
    (w, costs) : final weights and the list of recorded costs.
    """
    w = np.random.rand(X.shape[1], 1)
    costs = []
    for iteration in range(numIters):
        # Cost is measured before the update of the same iteration.
        currentCost = logAvgCost(X, y, w, regularization, lmda)
        w = updateWeightsGD(X, y, w, lr, regularization, lmda)
        if iteration % logFreq != 0:
            continue
        print("iter:", iteration, "cost:", currentCost)
        costs.append(currentCost)
    return w, costs

def trainGDMulticlass(X, y, lr, numIters, logFreq, regularization=None, lmda=0):
    """Train one-vs-rest logistic classifiers with plain gradient descent.

    One binary classifier is trained per distinct value in ``y``. Classifier
    ``k`` corresponds to the k-th entry of ``np.unique(y)`` (sorted labels), so
    labels do not have to be the integers 0..K-1.

    Parameters
    ----------
    X : feature array, one row per sample.
    y : label array; each distinct value is treated as a class.
    lr : learning rate.
    numIters : number of gradient-descent iterations.
    logFreq : print and record the cost every ``logFreq`` iterations.
    regularization : 'l1', 'l2' or None.
    lmda : regularization strength.

    Returns
    -------
    ws : array of shape (K, X.shape[1], 1), one weight column per class.
    costs : array of the recorded costs, averaged over the K classifiers.
    """
    y = np.asarray(y)
    uniqueYs = np.unique(y)
    numClasses = uniqueYs.shape[0]
    # One 0/1 target array per class: 1 where the sample belongs to the class.
    ys = np.array([(y == label).astype(float) for label in uniqueYs])
    ws = np.random.rand(numClasses, X.shape[1], 1)
    costs = []
    for i in range(numIters):
        cost = 0
        # Index by class position, not by label value, so arbitrary labels work.
        for k in range(numClasses):
            cost += logAvgCost(X, ys[k], ws[k], regularization, lmda)
            ws[k] = updateWeightsGD(X, ys[k], ws[k], lr, regularization, lmda)
        cost /= numClasses
        if i % logFreq == 0:
            print("iter:", i, "cost:", cost)
            costs.append(cost)
    costs = np.array(costs)
    return ws, costs

def updateWeightsAdaGrad(X, y, w, lr, S, epsilon, regularization=None, lmda=0):
    """Take one AdaGrad step and return the new weights.

    ``S`` accumulates squared gradients and is updated IN PLACE, so the caller's
    array carries the history across calls. Each weight's step size is
    ``lr / sqrt(S + epsilon)``.
    """
    grad = logAvgGrad(X, y, w, regularization, lmda)
    # In-place on purpose: callers rely on S being mutated.
    S += grad * grad
    perWeightRate = lr / np.sqrt(S + epsilon)
    return w - perWeightRate * grad

def trainAdaGrad(X, y, lr, numIters, logFreq, epsilon, regularization=None, lmda=0):
    """Train a binary logistic model with AdaGrad.

    Weights start as uniform random values of shape (X.shape[1], 1) and the
    squared-gradient accumulator starts at zero. Each iteration evaluates the
    cost at the current weights, then updates them. Every ``logFreq``-th
    iteration (starting at 0) the cost is printed and recorded.

    Returns
    -------
    (w, costs) : final weights and the list of recorded costs.
    """
    nFeatures = X.shape[1]
    w = np.random.rand(nFeatures, 1)
    S = np.zeros((nFeatures, 1))
    costs = []
    for iteration in range(numIters):
        # Cost is measured before the update of the same iteration.
        currentCost = logAvgCost(X, y, w, regularization, lmda)
        w = updateWeightsAdaGrad(X, y, w, lr, S, epsilon, regularization, lmda)
        if iteration % logFreq != 0:
            continue
        print("iter:", iteration, "cost:", currentCost)
        costs.append(currentCost)
    return w, costs

def trainAdaGradMulticlass(X, y, lr, numIters, logFreq, epsilon, regularization=None, lmda=0):
    """Train one-vs-rest logistic classifiers with AdaGrad.

    One binary classifier is trained per distinct value in ``y``. Classifier
    ``k`` corresponds to the k-th entry of ``np.unique(y)`` (sorted labels), so
    labels do not have to be the integers 0..K-1.

    Parameters
    ----------
    X : feature array, one row per sample.
    y : label array; each distinct value is treated as a class.
    lr : base learning rate.
    numIters : number of iterations.
    logFreq : print and record the cost every ``logFreq`` iterations.
    epsilon : small constant added inside the square root of the accumulator.
    regularization : 'l1', 'l2' or None.
    lmda : regularization strength.

    Returns
    -------
    ws : array of shape (K, X.shape[1], 1), one weight column per class.
    costs : array of the recorded costs, averaged over the K classifiers.
    """
    y = np.asarray(y)
    uniqueYs = np.unique(y)
    numClasses = uniqueYs.shape[0]
    # One 0/1 target array per class: 1 where the sample belongs to the class.
    ys = np.array([(y == label).astype(float) for label in uniqueYs])
    ws = np.random.rand(numClasses, X.shape[1], 1)
    # Per-class squared-gradient accumulators; Ss[k] is a view, so the in-place
    # update inside updateWeightsAdaGrad persists between iterations.
    Ss = np.zeros((numClasses, X.shape[1], 1))
    costs = []
    for i in range(numIters):
        cost = 0
        # Index by class position, not by label value, so arbitrary labels work.
        for k in range(numClasses):
            cost += logAvgCost(X, ys[k], ws[k], regularization, lmda)
            ws[k] = updateWeightsAdaGrad(X, ys[k], ws[k], lr, Ss[k],
                                         epsilon, regularization, lmda)
        cost /= numClasses
        if i % logFreq == 0:
            print("iter:", i, "cost:", cost)
            costs.append(cost)
    costs = np.array(costs)
    return ws, costs

def logPredictMulticlass(X, w):
    """Predict the class index of each sample from one-vs-rest weights.

    Parameters
    ----------
    X : feature array, one row per sample.
    w : stacked per-class weights, indexed by class along the first axis
        (e.g. shape (K, n_features, 1)).

    Returns
    -------
    Array of winning class indices along the first axis of ``w``; with
    (n_features, 1) weight columns the result has shape (n_samples, 1).
    """
    # Use the argument, not a notebook-global weight array, to count classes.
    w = np.asarray(w)
    # The sigmoid is monotonic, so the class with the largest raw score
    # X·w[k] is also the class with the largest predicted probability.
    # Comparing raw scores avoids ties between probabilities that have
    # saturated to exactly 1.0.
    scores = np.array([np.dot(X, w[k]) for k in range(w.shape[0])])
    return np.argmax(scores, axis=0)

def logMulticlassAccuracy(X, w, y):
    """Fraction of samples whose predicted class index equals the label in ``y``.

    ``y`` may be a column vector of shape (n, 1) or a flat array of shape (n,).
    """
    # Flatten both sides before comparing: a (n, 1) prediction compared with a
    # (n,) label array would otherwise broadcast to an (n, n) matrix and
    # produce a meaningless count.
    predictions = np.ravel(logPredictMulticlass(X, w))
    labels = np.ravel(y)
    return np.sum(predictions == labels) / labels.shape[0]

In [18]:
# Handwritten digits (scikit-learn's bundled load_digits set, not the full MNIST)
# Load dataset
digits = load_digits()
digX = digits.images
digY = digits.target
# Reshape images into 1D vectors
digX = np.reshape(digX,
    (digX.shape[0], digX.shape[1] * digX.shape[2]))
# Normalize every image by its own maximum pixel value.
# The per-row maxima are computed once (vectorized) rather than once per image.
digRowMax = np.amax(digX, axis=1, keepdims=True)
# An all-zero image would divide by zero; leave such rows unchanged.
digRowMax[digRowMax == 0] = 1.0
digX = digX / digRowMax

# Split training and test sets
digXTrain, digXTest, digYTrain, digYTest = \
    train_test_split(digX, digY, test_size=0.20)

# Ensure dimensionality matches: labels become column vectors of shape (n, 1)
digYTrain = np.expand_dims(digYTrain, axis=1)
digYTest = np.expand_dims(digYTest, axis=1)

In [19]:
# Gradient Descent Training (digits, L1-regularized)
ws, costs = trainGDMulticlass(
    digXTrain,
    digYTrain,
    lr=1,
    numIters=1000,
    logFreq=100,
    regularization='l1',
    lmda=0.1,
)

# Evaluation on both splits
print("Accuracy (Training):", logMulticlassAccuracy(digXTrain, ws, digYTrain))
print("Accuracy (Test):", logMulticlassAccuracy(digXTest, ws, digYTest))

iter: 0 cost: 8.745287319308373
iter: 100 cost: 0.08047161580376203
iter: 200 cost: 0.0612862602482915
iter: 300 cost: 0.053385599839098206
iter: 400 cost: 0.04883555817425237
iter: 500 cost: 0.04577769706669313
iter: 600 cost: 0.04353033336717747
iter: 700 cost: 0.04178374964997139
iter: 800 cost: 0.04037374487353195
iter: 900 cost: 0.0392017332186961
Accuracy (Training): 0.9756437021572721
Accuracy (Test): 0.9527777777777777


In [25]:
# Sanity check: display the shape of the training-label array.
digYTrain.shape

(1437, 1)

In [20]:
# AdaGrad Training (digits, L1-regularized)
ws, costs = trainAdaGradMulticlass(
    digXTrain,
    digYTrain,
    lr=1,
    numIters=1000,
    logFreq=100,
    epsilon=1e-8,
    regularization='l1',
    lmda=0.1,
)

# Evaluation on both splits
print("Accuracy (Training):", logMulticlassAccuracy(digXTrain, ws, digYTrain))
print("Accuracy (Test):", logMulticlassAccuracy(digXTest, ws, digYTest))

iter: 0 cost: 8.881569781496816
iter: 100 cost: 0.05327439542098885
iter: 200 cost: 0.043159124027085737
iter: 300 cost: 0.03869798508779089
iter: 400 cost: 0.03605231149446609
iter: 500 cost: 0.034250503898755266
iter: 600 cost: 0.03292081115693914
iter: 700 cost: 0.03189008862229752
iter: 800 cost: 0.031062098164134817
iter: 900 cost: 0.0303797038947066
Accuracy (Training): 0.9860821155184412
Accuracy (Test): 0.9583333333333334


In [21]:
# Sentiment classification
# on the Twitter US Airline Sentiment Dataset
tweets = pd.read_csv('./Tweets.csv', sep=',')

# Process labels
textYRaw = np.array(tweets['airline_sentiment'])
textY = np.zeros(textYRaw.shape)
# Convert y categories into numerical values
# (any value other than 'neutral'/'positive' keeps the initial 0, the 'negative' code)
textY[textYRaw == 'negative'] = 0
textY[textYRaw == 'neutral'] = 1
textY[textYRaw == 'positive'] = 2
# Column vector of shape (n, 1), as the training functions expect
textY = np.expand_dims(textY, axis=1)

# Clean text
textXRaw = tweets['text'].tolist()
textXCleaned = []
for rawText in textXRaw:
    # Set all lower case
    processed = rawText.lower()
    # Remove twitter tags (whitespace-separated tokens starting with '@')
    processed = " ".join(token for token in processed.split() if token[0] != '@')
    # Remove numbers
    processed = re.sub(r'\d+', '', processed)
    # Remove punctuation
    processed = processed.translate(str.maketrans("", "", string.punctuation))
    # Remove leading/trailing whitespace
    processed = processed.strip()
    textXCleaned.append(processed)

# Split into training and test sets
textXTrainText, textXTestText, textYTrain, textYTest = \
    train_test_split(textXCleaned, textY, test_size=0.20)

# Use the top 500 most frequently appearing
# words for classification
# Bag of words approach:
vectorizer = CountVectorizer(max_features=500)
textXTrainCounts = vectorizer.fit_transform(textXTrainText)
# tf-idf: term frequency-inverse document frequency
tfTransformer = TfidfTransformer()
textXTrain = tfTransformer.fit_transform(textXTrainCounts)
textXTrain = textXTrain.toarray()
textXTestCounts = vectorizer.transform(textXTestText)
# Only transform the test set: the IDF weights must come from the training
# data. Re-fitting here would leak test statistics and put the train and test
# features on different scales.
textXTest = tfTransformer.transform(textXTestCounts)
textXTest = textXTest.toarray()

# Outputs of this cell: textXTrain, textYTrain, textXTest, textYTest

# Our method of vectorizing text input from scratch
# Assume cleaned input (no punctuation, same case, etc.)
def vectorizeText(XCleaned, nFeatures=500):
    """Build bag-of-words count vectors over the most frequent tokens.

    Each text is tokenized with ``nltk.word_tokenize``, English stop words are
    dropped, and the remaining tokens are lemmatized with WordNet. The
    ``nFeatures`` most frequent tokens across all texts form the vocabulary.

    Parameters
    ----------
    XCleaned : sequence of cleaned strings.
    nFeatures : maximum vocabulary size (default 500).

    Returns
    -------
    XVectors : float array of shape (len(XCleaned), n_selected); entry [i, j]
        is how often vocabulary token j occurs in text i.
    textFeatures : tuple of the selected tokens, ordered from least to most
        frequent (ties broken alphabetically); column j of ``XVectors``
        belongs to ``textFeatures[j]``.
    """
    # A set makes the per-token stop-word check O(1) instead of a list scan.
    stopWords = set(stopwords.words('english'))
    wordnetLemmatizer = WordNetLemmatizer()

    # Tokenize, remove stop words, then lemmatize
    XTokenized = []
    for text in XCleaned:
        tokens = nltk.word_tokenize(text)
        tokens = [wordnetLemmatizer.lemmatize(token)
                  for token in tokens if token not in stopWords]
        XTokenized.append(tokens)

    allTokens = [token for sublist in XTokenized for token in sublist]
    if not allTokens:
        # Nothing to count: return an empty vocabulary instead of failing
        # on the zip(*...) unpacking below.
        return np.zeros((len(XTokenized), 0)), ()

    unique, count = np.unique(np.array(allTokens), return_counts=True)
    # Sort by (count, token) ascending; the most frequent tokens end up last.
    count, unique = zip(*sorted(zip(count, unique)))

    # Clamp the start index at 0: a negative start would silently drop
    # features when there are fewer unique tokens than nFeatures.
    firstKept = max(len(unique) - nFeatures, 0)
    textFeatures = tuple(str(token) for token in unique[firstKept:])

    # Count occurrences of each vocabulary token per text
    columnOf = {token: col for col, token in enumerate(textFeatures)}
    XVectors = np.zeros((len(XTokenized), len(textFeatures)))
    for row, tokens in enumerate(XTokenized):
        for token in tokens:
            col = columnOf.get(token)
            if col is not None:
                XVectors[row, col] += 1
    return XVectors, textFeatures

In [35]:
# Gradient Descent Training (tweets, L1-regularized)
ws, costs = trainGDMulticlass(
    textXTrain,
    textYTrain,
    lr=1,
    numIters=1000,
    logFreq=100,
    regularization='l1',
    lmda=0.1,
)

# Evaluation on both splits
print("Accuracy (Training):", logMulticlassAccuracy(textXTrain, ws, textYTrain))
print("Accuracy (Test):", logMulticlassAccuracy(textXTest, ws, textYTest))

iter: 0 cost: 1.2034470489406448
iter: 100 cost: 0.5955468521719501
iter: 200 cost: 0.5072853176584807
iter: 300 cost: 0.47582463026668703
iter: 400 cost: 0.4568390093746529
iter: 500 cost: 0.4427773408488948
iter: 600 cost: 0.4314913154220242
iter: 700 cost: 0.42208832703203747
iter: 800 cost: 0.4140851131919014
iter: 900 cost: 0.40717051362315587
Accuracy (Training): 0.733521174863388
Accuracy (Test): 0.7346311475409836


In [36]:
# AdaGrad Training (tweets, L1-regularized)
ws, costs = trainAdaGradMulticlass(
    textXTrain,
    textYTrain,
    lr=1,
    numIters=1000,
    logFreq=100,
    epsilon=1e-8,
    regularization='l1',
    lmda=0.1,
)

# Evaluation on both splits
print("Accuracy (Training):", logMulticlassAccuracy(textXTrain, ws, textYTrain))
print("Accuracy (Test):", logMulticlassAccuracy(textXTest, ws, textYTest))

iter: 0 cost: 1.1932035901298061
iter: 100 cost: 0.31398756617613877
iter: 200 cost: 0.3125809926971453
iter: 300 cost: 0.31225770007029
iter: 400 cost: 0.3121337699920735
iter: 500 cost: 0.31207295911594385
iter: 600 cost: 0.31203850843729625
iter: 700 cost: 0.3120172871040808
iter: 800 cost: 0.31200327996720234
iter: 900 cost: 0.3119938143894478
Accuracy (Training): 0.8037909836065574
Accuracy (Test): 0.7780054644808743
