In [103]:
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.datasets import load_digits 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import string
import re 
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated element-wise by NumPy."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

def logPredict(X, w):
    """Return sigmoid(X . w): the logistic model's output for inputs X and weights w."""
    logits = np.dot(X, w)
    return sigmoid(logits)

def logAvgCost(X, y, w, regularization=None, lmda=0):
    """Average (optionally regularized) cross-entropy cost of the logistic model.

    Parameters:
        X: input matrix passed to logPredict.
        y: binary targets; y.shape[0] is used as the number of samples.
        w: model weights.
        regularization: 'l1' or 'l2' to add the matching penalty; any other
            value (including None) adds no penalty.
        lmda: regularization strength handed to l1Cost / l2Cost.

    Returns:
        (summed cross-entropy + penalty) / y.shape[0].
    """
    predict = logPredict(X, w)
    # A saturated sigmoid returns exactly 0.0 or 1.0; log(0) is -inf and
    # 0 * -inf is nan, which would poison the whole sum.  Keep the
    # probabilities strictly inside (0, 1) before taking logs.
    tiny = np.finfo(float).eps
    predict = np.clip(predict, tiny, 1.0 - tiny)
    cost = y * np.log(predict) + (1.0 - y) * np.log(1.0 - predict)
    cost = - cost
    totalCost = cost.sum()
    if regularization == 'l1':
        totalCost += l1Cost(lmda, w)
    elif regularization == 'l2':
        totalCost += l2Cost(lmda, w)
    return totalCost / y.shape[0]

def l1Cost(lmda, w):
    """L1 penalty: lmda times the sum of the absolute values of w."""
    absoluteSum = np.abs(w).sum()
    return lmda * absoluteSum

def l2Cost(lmda, w):
    """L2 penalty: lmda times the sum of the squared entries of w."""
    squaredSum = np.square(w).sum()
    return lmda * squaredSum

def l1Grad(lmda, w):
    """(Sub)gradient of the L1 penalty lmda * sum(|w|).

    Non-zero weights get lmda * sign(w).  |w| is not differentiable at 0, so
    each weight that is exactly zero gets an independent random subgradient
    drawn uniformly from [-lmda, lmda).

    Parameters:
        lmda: regularization strength.
        w: weight array of any shape; it is not modified.

    Returns:
        A new float array with the same shape as w.
    """
    w = np.asarray(w)
    gradient = np.asarray(lmda * np.sign(w), dtype=float)
    # A boolean mask addresses exactly the zero-weight entries whatever the
    # rank of w.  Indexing with [row, col] coordinate pairs would be read by
    # NumPy as a list of ROW indices and clobber unrelated rows.
    zeroMask = (w == 0)
    numZeros = np.count_nonzero(zeroMask)
    if numZeros:
        # Scale by lmda so the draw stays inside the subdifferential and
        # vanishes when regularization is switched off (lmda == 0).
        gradient[zeroMask] = lmda * ((2.0 * np.random.rand(numZeros)) - 1.0)
    return gradient

def l2Grad(lmda, w):
    """Gradient of the L2 penalty lmda * sum(w^2), i.e. 2 * lmda * w."""
    scale = lmda * 2
    return scale * w

def logAvgGrad(X, y, w, regularization=None, lmda=0):
    """Average gradient of the (optionally regularized) logistic cost w.r.t. w.

    Computes X^T (logPredict(X, w) - y), adds the 'l1' or 'l2' penalty
    gradient when requested, and divides the total by y.shape[0].
    """
    residual = logPredict(X, w) - y
    gradient = np.dot(X.T, residual)
    if regularization == 'l1':
        gradient += l1Grad(lmda, w)
    elif regularization == 'l2':
        gradient += l2Grad(lmda, w)
    sampleCount = y.shape[0]
    return gradient / sampleCount

def updateWeightsGD(X, y, w, lr, regularization=None, lmda=0):
    """One gradient-descent step: return w - lr * logAvgGrad(...)."""
    step = lr * logAvgGrad(X, y, w, regularization, lmda)
    return w - step

def trainGD(X, y, lr, numIters, logFreq, regularization=None, lmda=0):
    """Train a binary logistic model with plain gradient descent.

    Weights start as uniform random values of shape (X.shape[1], 1).  The
    cost is evaluated before every update; every logFreq-th iteration it is
    printed and recorded.

    Returns:
        (w, costs): the final weights and the list of recorded costs.
    """
    w = np.random.rand(X.shape[1], 1)
    costs = []
    for i in range(numIters):
        # Cost of the weights going INTO this iteration.
        cost = logAvgCost(X, y, w, regularization, lmda)
        w = updateWeightsGD(X, y, w, lr, regularization, lmda)
        if i % logFreq != 0:
            continue
        print("iter:", i, "cost:", cost)
        costs.append(cost)
    return w, costs

def trainGDMulticlass(X, y, lr, numIters, logFreq, regularization=None, lmda=0):
    """Train one-vs-rest logistic classifiers with plain gradient descent.

    One binary classifier is trained per distinct label in y.  Classifier k
    corresponds to np.unique(y)[k], i.e. classifiers are ordered by sorted
    label value.

    Parameters:
        X: input matrix; X.shape[1] is the number of features.
        y: class labels, one per row of X.
        lr: learning rate.
        numIters: number of gradient-descent iterations.
        logFreq: print and record the cost every logFreq-th iteration.
        regularization: None, 'l1' or 'l2'.
        lmda: regularization strength.

    Returns:
        (ws, costs): ws has shape (numClasses, X.shape[1], 1); costs is an
        array of the recorded costs, each averaged over the classifiers.
    """
    y = np.asarray(y)
    uniqueYs = np.unique(y)
    numClasses = uniqueYs.shape[0]
    # One 0/1 target array per class, with the same shape and dtype as y.
    ys = np.array([(y == uniqueY).astype(y.dtype) for uniqueY in uniqueYs])
    ws = np.random.rand(numClasses, X.shape[1], 1)
    costs = []
    for i in range(numIters):
        cost = 0
        # Address classifiers by POSITION, not by label value: label values
        # may be floats (not valid indices) or may not run 0..numClasses-1.
        for classIdx in range(numClasses):
            cost += logAvgCost(X, ys[classIdx], ws[classIdx], regularization, lmda)
            ws[classIdx] = updateWeightsGD(X, ys[classIdx],
                ws[classIdx], lr, regularization, lmda)
        cost /= numClasses
        if i % logFreq == 0:
            print("iter:", i, "cost:", cost)
            costs.append(cost)
    costs = np.array(costs)
    return ws, costs

def updateWeightsAdaGrad(X, y, w, lr, S, epsilon, regularization=None, lmda=0):
    """One AdaGrad step.

    S holds the running sum of squared gradients and is updated IN PLACE so
    the caller's array keeps accumulating across calls.  Each weight is
    moved by lr / sqrt(S + epsilon) times its gradient.

    Returns:
        The updated weights (w itself is not modified).
    """
    grad = logAvgGrad(X, y, w, regularization, lmda)
    # In-place on purpose: the accumulator lives in the caller.
    S += np.square(grad)
    stepSizes = lr / np.sqrt(S + epsilon)
    return w - stepSizes * grad

def trainAdaGrad(X, y, lr, numIters, logFreq, epsilon, regularization=None, lmda=0):
    """Train a binary logistic model with AdaGrad.

    Weights start as uniform random values of shape (X.shape[1], 1) and the
    squared-gradient accumulator starts at zero.  The cost is evaluated
    before every update; every logFreq-th iteration it is printed and
    recorded.

    Returns:
        (w, costs): the final weights and the list of recorded costs.
    """
    numFeatures = X.shape[1]
    w = np.random.rand(numFeatures, 1)
    S = np.zeros((numFeatures, 1))
    costs = []
    for i in range(numIters):
        # Cost of the weights going INTO this iteration.
        cost = logAvgCost(X, y, w, regularization, lmda)
        # S is mutated in place by updateWeightsAdaGrad.
        w = updateWeightsAdaGrad(X, y, w, lr, S, epsilon, regularization, lmda)
        if i % logFreq != 0:
            continue
        print("iter:", i, "cost:", cost)
        costs.append(cost)
    return w, costs

def trainAdaGradMulticlass(X, y, lr, numIters, logFreq, epsilon, regularization=None, lmda=0):
    """Train one-vs-rest logistic classifiers with AdaGrad.

    One binary classifier is trained per distinct label in y.  Classifier k
    corresponds to np.unique(y)[k], i.e. classifiers are ordered by sorted
    label value.  Each classifier has its own squared-gradient accumulator.

    Parameters:
        X: input matrix; X.shape[1] is the number of features.
        y: class labels, one per row of X.
        lr: base learning rate.
        numIters: number of iterations.
        logFreq: print and record the cost every logFreq-th iteration.
        epsilon: added to the accumulator inside the square root.
        regularization: None, 'l1' or 'l2'.
        lmda: regularization strength.

    Returns:
        (ws, costs): ws has shape (numClasses, X.shape[1], 1); costs is an
        array of the recorded costs, each averaged over the classifiers.
    """
    y = np.asarray(y)
    uniqueYs = np.unique(y)
    numClasses = uniqueYs.shape[0]
    # One 0/1 target array per class, with the same shape and dtype as y.
    ys = np.array([(y == uniqueY).astype(y.dtype) for uniqueY in uniqueYs])
    ws = np.random.rand(numClasses, X.shape[1], 1)
    Ss = np.zeros((numClasses, X.shape[1], 1))
    costs = []
    for i in range(numIters):
        cost = 0
        # Address classifiers by POSITION, not by label value: label values
        # may be floats (not valid indices) or may not run 0..numClasses-1.
        for classIdx in range(numClasses):
            cost += logAvgCost(X, ys[classIdx], ws[classIdx], regularization, lmda)
            # Ss[classIdx] is a view, so the in-place accumulation done by
            # updateWeightsAdaGrad lands in Ss.
            ws[classIdx] = updateWeightsAdaGrad(X, ys[classIdx],
                ws[classIdx], lr, Ss[classIdx], epsilon, regularization, lmda)
        cost /= numClasses
        if i % logFreq == 0:
            print("iter:", i, "cost:", cost)
            costs.append(cost)
    costs = np.array(costs)
    return ws, costs

def logPredictMulticlass(X, w):
    """Predict a class index for every row of X with one-vs-rest weights.

    Parameters:
        X: input matrix.
        w: stacked per-class weights; w[k] is the weight array of classifier k.

    Returns:
        For each row of X, the index k of the classifier with the highest
        logPredict output (argmax over the class axis).
    """
    # Iterate over the weights that were PASSED IN; the class count must not
    # come from a notebook-level global.
    predictions = np.array([logPredict(X, classWeights) for classWeights in w])
    return np.argmax(predictions, axis=0)

def logMulticlassAccuracy(X, w, y):
    """Fraction of rows of X whose predicted class index equals the label in y.

    Parameters:
        X: input matrix.
        w: stacked per-class weights, as accepted by logPredictMulticlass.
        y: true class indices, one per row of X, as a 1-D array or a column
            vector.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    # Flatten both sides before comparing: an (n, 1) prediction column
    # compared with an (n,) label vector would broadcast to an (n, n)
    # matrix and yield a meaningless count.
    predictions = np.ravel(logPredictMulticlass(X, w))
    labels = np.ravel(y)
    return np.sum(predictions == labels) / labels.shape[0]

In [4]:
# Handwritten digits: scikit-learn's bundled digits dataset (load_digits),
# not the full MNIST set
# Load dataset
digits = load_digits()
digX = digits.images
digY = digits.target
# Reshape images into 1D vectors
digX = np.reshape(digX,
    (digX.shape[0], digX.shape[1] * digX.shape[2]))
# Normalize every image by its own maximum pixel value.
# The row maxima are computed once (not once per row), and the division
# builds a new array, so re-running the cell never re-normalizes data that
# is shared with `digits`.
digX = digX / np.amax(digX, axis=1, keepdims=True)

# Split training and test sets
digXTrain, digXTest, digYTrain, digYTest = \
    train_test_split(digX, digY, test_size=0.20)

# Make the labels column vectors so they line up with the (n, 1) predictions
digYTrain = np.expand_dims(digYTrain, axis=1)
digYTest = np.expand_dims(digYTest, axis=1)

In [5]:
# Gradient Descent Training (one-vs-rest, L1-regularized)
ws, costs = trainGDMulticlass(
    digXTrain, digYTrain,
    lr=1, numIters=1000, logFreq=100,
    regularization='l1', lmda=0.1,
)

# Evaluation on the training and the held-out split
print("Accuracy (Training):", logMulticlassAccuracy(digXTrain, ws, digYTrain))
print("Accuracy (Test):", logMulticlassAccuracy(digXTest, ws, digYTest))

iter: 0 cost: 9.171039675266258
iter: 100 cost: 0.0780419604743979
iter: 200 cost: 0.059681815843701816
iter: 300 cost: 0.05203538584802823
iter: 400 cost: 0.04761466811195066
iter: 500 cost: 0.04464678293552123
iter: 600 cost: 0.04247749907692624
iter: 700 cost: 0.04080040349360599
iter: 800 cost: 0.03945171188530569
iter: 900 cost: 0.03833619130635034
Accuracy (Training): 0.97633959638135
Accuracy (Test): 0.95


In [6]:
# AdaGrad Training (one-vs-rest, L1-regularized)
ws, costs = trainAdaGradMulticlass(
    digXTrain, digYTrain,
    lr=1, numIters=1000, logFreq=100, epsilon=1e-8,
    regularization='l1', lmda=0.1,
)

# Evaluation on the training and the held-out split
print("Accuracy (Training):", logMulticlassAccuracy(digXTrain, ws, digYTrain))
print("Accuracy (Test):", logMulticlassAccuracy(digXTest, ws, digYTest))

iter: 0 cost: 8.854384281421028
iter: 100 cost: 0.05260605045081612
iter: 200 cost: 0.04265014119521066
iter: 300 cost: 0.03833243396120719
iter: 400 cost: 0.03577810068697231
iter: 500 cost: 0.034046193080419576
iter: 600 cost: 0.03278032242059598
iter: 700 cost: 0.03180727542773276
iter: 800 cost: 0.03103504109167335
iter: 900 cost: 0.030402170707007153
Accuracy (Training): 0.9860821155184412
Accuracy (Test): 0.9583333333333334


In [127]:
# Sentiment classification
# on the Twitter US Airline Sentiment Dataset
tweets = pd.read_csv('./Tweets.csv', sep=',')

# Process labels
yRaw = np.array(tweets['airline_sentiment'])
# Integer class ids: the labels are discrete classes, and NumPy only accepts
# integer values as array indices (the default float64 zeros would not work
# wherever a label is used to index per-class arrays).
y = np.zeros(yRaw.shape, dtype=int)
# Convert y categories into numerical values
# (rows with any other sentiment string keep the initial value 0)
y[yRaw == 'negative'] = 0
y[yRaw == 'neutral'] = 1
y[yRaw == 'positive'] = 2
y = np.expand_dims(y, axis=1)

# Clean text
XRaw = tweets['text'].tolist()
# Translation table that deletes every ASCII punctuation character;
# built once instead of once per tweet.
punctuationTable = str.maketrans("", "", string.punctuation)
XCleaned = []
for rawText in XRaw:
    # Set all lower case
    processed = rawText.lower()
    # Remove twitter tags (whitespace-separated tokens starting with '@')
    processed = " ".join(
        token for token in processed.split() if not token.startswith('@'))
    # Remove numbers
    processed = re.sub(r'\d+', '', processed)
    # Remove punctuation
    processed = processed.translate(punctuationTable)
    # Trim leading and trailing whitespace
    processed = processed.strip()
    XCleaned.append(processed)

# Use the top 500 most frequently appearing
# words for classification
# Bag of words approach:
vectorizer = CountVectorizer(max_features=500)
XCounts = vectorizer.fit_transform(XCleaned)
# tf-idf: term frequency-inverse document frequency
tfTransformer = TfidfTransformer()
X = tfTransformer.fit_transform(XCounts)
# Convert the transformer output to a dense NumPy array
X = X.toarray()

# Our method of vectorizing text input from scratch
# Assume cleaned input (no punctuation, same case, etc.)
def vectorizeText(XCleaned):
    """Tokenize cleaned documents and select the most frequent tokens.

    Each string in XCleaned is tokenized with nltk.word_tokenize, English
    stop words are dropped and the remaining tokens are lemmatized with
    WordNetLemmatizer.  Token occurrences are then counted over all
    documents and the nFeatures (500) entries at the end of the
    ascending (count, token) ordering are kept in `textFeatures`.

    NOTE(review): the visible body ends without a return statement, so as
    shown the function returns None and `textFeatures` is never used.  The
    definition may continue beyond this excerpt -- confirm against the
    original notebook.
    """
    # Tokenize text 
    XTokenized = [] 
    stopWords = stopwords.words('english')
    wordnetLemmatizer = WordNetLemmatizer()
    for i in range(len(XCleaned)):
        tokens = nltk.word_tokenize(XCleaned[i])
        # Remove stop words 
        tokens = [j for j in tokens if not j in stopWords]
        # Lemmatize 
        tokens = [wordnetLemmatizer.lemmatize(word) for word in tokens]
        XTokenized.append(tokens)

    # Flatten the per-document token lists into one corpus-wide array
    allTokens = [token for sublist in XTokenized for token in sublist]
    allTokens = np.array(allTokens)
    # Distinct tokens and how often each one occurs
    unique, count = np.unique(allTokens, return_counts=True)
    # Sort the (count, token) pairs ascending, so the most frequent tokens end up last
    count, unique = zip(*sorted(zip(count, unique)))

    # Number of features to consider 
    nFeatures = 500
    # Keep the last nFeatures tokens of the ascending ordering (highest counts)
    textFeatures = unique[len(unique)-nFeatures:]

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'