In [1]:
import numpy as np
import torch
import torchvision
import matplotlib.pyplot as plt
from time import time
from torchvision import datasets, transforms
from torch import nn, optim

transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                              ])

trainset = datasets.MNIST('TrainSet', download=True, train=True, transform=transform)
valset = datasets.MNIST('ValSet', download=True, train=False, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=64, shuffle=True)

dataiter = iter(trainloader)
images, Y = dataiter.next()
images = images.numpy()
X = np.resize(images,(64,784))
Y = Y.numpy()

In [2]:
newY = np.zeros(shape=(64,10))
for i in range(Y.shape[0]):
    newY[i,Y[i]] = 1


In [12]:
def getSigmoid(T, x):
    numerator = np.exp(np.matmul(x,T))
    denomenator = numerator + 1 
    sig = np.divide(numerator,denomenator)
    return sig

def getLoss(T, Y, x):
    m = np.prod(Y.shape)
    activation = getSigmoid(T,x)
    loss= np.square(np.subtract(activation, Y))
    print(loss)
    loss = np.sum(loss)
    print(loss)
    loss = np.multiply(loss, 1/(2 * m))
    return loss

def getGradient(weights, Y, X):
    activation = getSigmoid(weights,X)
    z = np.square(np.subtract(activation, Y))
    gradient = np.matmul(X.transpose(), z)
    print(z)
    return gradient
    

In [14]:
theta = np.random.rand(784,10)
grad = getLoss(theta, newY, X)
print(grad)

[[1.84139381e-235 5.35008298e-223 1.66960994e-230 4.12226733e-221
  2.94813532e-229 6.48606254e-224 1.64540403e-233 2.97100982e-229
  1.78374349e-215 1.00000000e+000]
 [5.01607210e-237 1.13023212e-242 6.14313229e-241 5.30410242e-237
  1.97758608e-251 1.11459873e-239 2.46495320e-247 8.38190510e-245
  1.00000000e+000 9.53030652e-237]
 [1.24062296e-212 4.25507688e-217 1.00000000e+000 1.69027843e-220
  1.03021055e-225 1.03753788e-215 6.78772286e-212 1.49653265e-225
  6.15536875e-210 4.40026991e-214]
 [9.05706369e-260 1.75849859e-257 6.81507583e-260 2.61263553e-263
  4.26610061e-270 1.00000000e+000 5.35660680e-266 6.30700168e-260
  1.80628445e-247 8.35203445e-254]
 [2.93708461e-274 3.12950387e-272 5.56303091e-276 2.07745522e-271
  7.89864887e-277 1.51302006e-269 8.27759281e-280 1.70236833e-279
  1.52737090e-259 1.00000000e+000]
 [1.00000000e+000 1.13092082e-198 2.53206659e-194 6.14566051e-187
  3.50572502e-198 6.44402871e-195 2.94850781e-185 5.44725540e-201
  5.23292272e-194 8.07420692e-189

In [24]:
import numpy as np

num_iterations = 100
learning_rate = .01

# Initialize the model parameters
# Weights are represented as a matrix with one row for each sample and one column for each pixel
# Bias is a scalar value
weights = np.random.rand(X.shape[0], X.shape[1])
bias = np.random.rand()

# Define the sigmoid function
def sigmoid(x):
  return 1 / (1 + np.exp(-x))

# Implement the logistic regression model
def predict(x, weights, bias):
  return sigmoid(np.dot(x, weights.T) + bias)

# Calculate the loss of the model on the training set
def loss(predictions, y):
  return np.mean(-(y * np.log(predictions) + (1 - y) * np.log(1 - predictions)))

# Train the model using gradient descent
for i in range(num_iterations):
  # Make predictions on the training set
  predictions = predict(X, weights, bias)

  # Calculate the loss of the model on the training set
  l = loss(predictions, Y)

  # Calculate the gradient of the loss with respect to the model parameters
  gradient_weights = np.dot(X.T, (predictions - Y)) / X.shape[0]
  gradient_bias = np.mean(predictions) - Y

  # Update the model parameters using the gradient and a learning rate
  weights -= learning_rate * gradient_weights.transpose()
  bias -= learning_rate * gradient_bias

predict(X[0], weights, bias)

  return np.mean(-(y * np.log(predictions) + (1 - y) * np.log(1 - predictions)))
  return np.mean(-(y * np.log(predictions) + (1 - y) * np.log(1 - predictions)))


array([1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 5.00166940e-113, 1.00000000e+000, 5.59477905e-111,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       3.58661229e-106, 1.97144794e-113, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.92839245e-107, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1

In [7]:
import numpy as np

# Number of classes
K = 10

# Number of features
N = 784

# Initialize weights
weights = np.zeros((K,N))

#Training Iterations
trainIters = 100

# sigmoid
def sigmoid(x):
  return 1 / (1 + np.exp(-x))

# Training method
def one_vs_all_logreg(X, y, K, weights, learning_rate):
    
    # Number of training examples
    m = X.shape[0]
    
    for i in range(trainIters):
    # Initialize the cost
      J = 0
      
      # Compute the hypothesis
      h = np.dot(X, weights.T)
      
      # Compute the cost
      for k in range(K):
          temp = -y[:,k] * np.log(sigmoid(h[:,k])) - (1 - y[:,k]) * np.log(1 - sigmoid(h[:,k]))
          J += np.sum(temp)
          g = np.dot(X.T, (sigmoid(h)[:,k] - y[:,k]))
          g = g/m
          

          #update weights 
          weights[k] = weights[k] - learning_rate*g

          return weights
          
      J = J/m
      #J += np.sum(weights**2)
      print(J)
      
    


theta = one_vs_all_logreg(X, newY, K, weights, .1)

In [17]:
def predict(X,Weights):
    return argmax(sigmoid(np.dot(X, weights.T)))

print(predict(X[0,:], theta))

0.5
