### Logistic Regression

[Reference 1](https://towardsdatascience.com/introduction-to-logistic-regression-66248243c148)  
[Reference 2](https://towardsdatascience.com/building-a-logistic-regression-in-python-301d27367c24)  
[ML Glossary](https://ml-cheatsheet.readthedocs.io/en/latest/logistic_regression.html)  

#### Sigmoid Function

$$f(x)=\frac{1}{1+e^{-(x)}}$$  

which maps predicted values to probabilities  

#### Hypothesis Representation

$$Z=\beta_0+\beta_1X$$

$$h_\theta(x)=\text{sigmoid}(Z)=\frac{1}{1+e^{-(\beta_0+\beta_1X)}}$$

#### Cost Function

$$\text{Cost}(h_\theta(x),y)=
    \begin{cases}
    -\log(h_\theta(x)) & \quad \text{if } y = 1\\
    -\log(1-h_\theta(x)) & \quad \text{if } y = 0
    \end{cases}
$$

which, averaged over all $m$ training examples, can be written as a single expression:

$$J(\theta)=-\frac{1}{m}\sum_{i=1}^{m} \Big[ y^{(i)}\log(h_\theta(x^{(i)})) + (1-y^{(i)})\log(1-h_\theta(x^{(i)})) \Big]$$

which should be minimized

#### Gradient Descent

$$\theta_j := \theta_j - \alpha\frac{\partial}{\partial\theta_j}J(\theta)$$

In this case:

$$\theta_j := \theta_j - \frac{\alpha}{m}\sum^m_{i=1}(h_\theta(x^{(i)})-y^{(i)})x_j^{(i)}$$

In [1]:
# import necessary libraries
using Random

In [2]:
# synthetic data set: 100 samples with 5 Float32 features each, and integer
# labels drawn from {0, 1} independently of the features
X_data = rand(Float32, (100, 5))
Y_data = rand(0:1, 100)
println("Shape of X_data: ", size(X_data))
println("Shape of Y_data: ", size(Y_data))

Shape of X_data: (100, 5)
Shape 1of Y_data: (100,)


In [3]:
"""
    sigmoid(Z::Number) -> Float32

Evaluate the logistic function `1 / (1 + e^(-Z))` for a single number.
`Z` is converted to `Float32` before the exponential is taken, and the
result is returned as a `Float32`.
"""
function sigmoid(Z::Number)::Float32
    z32 = Float32(Z)
    return 1 / (1 + exp(-z32))
end

sigmoid (generic function with 1 method)

In [4]:
# test sigmoid function: a large positive input should saturate to 1
sigmoid(1234)

1.0f0

In [5]:
sigmoid(0)

0.5f0

In [6]:
sigmoid(0.123)

0.5307113f0

In [7]:
typeof(X_data)

Array{Float32,2}

In [8]:
using Statistics

"""
    scale(X::Array) -> Array

Standardize each column of the 2-d array `X`: subtract the column mean and
divide by the column standard deviation (as computed by `std(X, dims=1)`).

A column whose standard deviation is zero (constant column) or `NaN`
(e.g. a single-row input) is only centered, i.e. it is divided by 1 instead.
Without this guard such a column would turn into `NaN` and poison every
later computation that uses the scaled data.

Fails an assertion if `X` is not 2-dimensional.
"""
function scale(X::Array)::Array
    @assert ndims(X) == 2
    u = mean(X, dims=1) # per-column mean, shape 1×n
    s = std(X, dims=1)  # per-column standard deviation, shape 1×n
    # fall back to a divisor of 1 where dividing would produce NaN
    divisor = map(v -> (iszero(v) || isnan(v)) ? one(v) : v, s)
    return (X .- u) ./ divisor
end

scale (generic function with 1 method)

In [9]:
# standardize the features, then sanity-check the result:
# every column should have mean ≈ 0 and standard deviation ≈ 1
X_data_scaled = scale(X_data)
println("Mean: ", mean(X_data_scaled, dims=1))
println("Std: ", std(X_data_scaled, dims=1))

Mean: Float32[3.576279f-8 7.152557f-9 3.4570693f-8 1.2129546f-7 -6.198883f-8]
Std: Float32[1.0 1.0 1.0 0.9999999 1.0]


In [10]:
# update sigmoid function
"""
    sigmoid(Z::Array) -> Array

Element-wise logistic function: every entry `z` of `Z` is mapped to
`1 / (1 + e^(-z))`. The result has the same shape as `Z`.
"""
function sigmoid(Z::Array)::Array
    negated = -Z
    return 1 ./ (1 .+ exp.(negated))
end

sigmoid (generic function with 2 methods)

In [11]:
# test it: apply the array method to every entry of the scaled feature matrix
sigmoid(X_data_scaled)

100×5 Array{Float32,2}:
 0.262469  0.430346  0.449475  0.512657  0.432395
 0.218255  0.781578  0.283412  0.228895  0.354363
 0.620761  0.264099  0.596806  0.429298  0.189896
 0.782045  0.514932  0.652579  0.216128  0.611644
 0.137294  0.206546  0.573674  0.480437  0.364577
 0.189417  0.73545   0.733499  0.384777  0.396689
 0.808771  0.186872  0.346732  0.278399  0.361494
 0.827221  0.258122  0.495954  0.850765  0.258863
 0.193749  0.428589  0.846818  0.720241  0.734614
 0.563925  0.209013  0.844589  0.509234  0.336402
 0.280219  0.272508  0.730328  0.704026  0.794778
 0.759372  0.247167  0.467897  0.783003  0.836474
 0.634904  0.589769  0.504559  0.521567  0.526058
 ⋮                                       
 0.209341  0.351337  0.543357  0.633214  0.826779
 0.686036  0.487483  0.239373  0.674853  0.453041
 0.808186  0.76354   0.733492  0.569827  0.520172
 0.197301  0.388686  0.710178  0.375428  0.328935
 0.801899  0.217809  0.519876  0.577182  0.483369
 0.521042  0.767945  0.169486  0.7

In [12]:
# initialize the coefficient vector from a standard normal distribution:
# one weight per feature column plus one extra entry. cost, predict_proba
# and learn! append the column of ones as the LAST column of X, so the
# last entry of beta acts as the intercept.
beta = randn(size(X_data)[2]+1)
println(size(beta))

(6,)


In [13]:
"""
    cost(X::Array, y::Array, beta::Array) -> AbstractFloat

Mean binary cross-entropy of a logistic-regression model.

# Arguments
- `X`: 2-d array with one sample per row and one feature per column.
- `y`: 1-d array of labels, one per row of `X`.
- `beta`: 1-d coefficient array with `size(X, 2) + 1` entries; the last
  entry multiplies the appended column of ones (the intercept).

# Returns
`-(1/m) * Σ [ y*log(p) + (1-y)*log(1-p) ]` with `p = sigmoid(z)` and
`z = [X 1] * beta`, where `m` is the number of rows of `X`.

The value is computed through the algebraically identical form
`softplus(z) - y*z`, with `softplus(z) = max(z, 0) + log1p(exp(-|z|))`.
Going through `log(p)` / `log(1 - p)` directly yields `NaN` (from
`0 * log(0)`) or `Inf` as soon as a probability saturates to exactly 0 or 1.

Fails an assertion when the dimensions or sizes of the inputs do not match.
"""
function cost(X::Array, y::Array, beta::Array)::AbstractFloat
    @assert ndims(X) == 2
    @assert ndims(y) == 1
    @assert ndims(beta) == 1
    @assert size(X) == (size(y)[1], size(beta)[1]-1)
    m = size(X)[1]
    X_extended = hcat(X, ones(m))   # intercept column goes last
    z = X_extended * beta           # linear predictor, one entry per sample
    # per-sample loss: softplus(z) - y*z, evaluated without overflow
    losses = max.(z, 0) .+ log1p.(exp.(.-abs.(z))) .- y .* z
    return sum(losses) / m
end

cost (generic function with 1 method)

In [14]:
# try it: cost of the randomly initialized coefficients on the scaled data
cost(X_data_scaled, Y_data, beta)

1.2060107989382696

In [15]:
# prediction function
"""
    predict_proba(X::Array, beta::Array) -> Array

Return, for every row of `X`, the value `sigmoid([x 1] ⋅ beta)` as a 1-d array.

# Arguments
- `X`: 2-d array with one sample per row.
- `beta`: 1-d coefficient array with `size(X, 2) + 1` entries; the last
  entry multiplies the appended column of ones (the intercept).

Fails an assertion when the dimensions or sizes of the inputs do not match.
"""
function predict_proba(X::Array, beta::Array)::Array
    @assert ndims(X) == 2
    @assert ndims(beta) == 1
    @assert size(X)[2] == size(beta)[1]-1
    X_extended = hcat(X, ones(size(X)[1]))
    # matrix * vector already yields a 1-d array, so no flattening
    # (and no element-by-element splatting) is needed
    return sigmoid(X_extended * beta)
end

predict_proba (generic function with 1 method)

In [16]:
predict_proba(X_data_scaled, beta)

100-element Array{Float64,1}:
 0.7158651629426267
 0.6778311512661751
 0.48559152038790354
 0.9565381582600937
 0.844693045745
 0.8347089586825082
 0.7550244417751586
 0.09660030689390488
 0.9568948936778026
 0.846371377274259
 0.9660385052646239
 0.8821993763583131
 0.692905908620481
 ⋮
 0.9674160237556951
 0.2835392736987466
 0.6489047323612341
 0.8598429865192947
 0.7018694831578305
 0.015301314652217627
 0.833960704947854
 0.22954285418176149
 0.964364617292489
 0.8308707823703232
 0.925056402614478
 0.8580497619197903

In [17]:
"""
    predict(X::Array, beta::Array) -> Array

Return a 1-d array of class labels for the rows of `X`: `1` where the
probability from `predict_proba(X, beta)` is at least `0.5`, `0` otherwise.

Input validation (dimensions and matching sizes) is performed by
`predict_proba`, which this function delegates to instead of repeating
the same computation.
"""
function predict(X::Array, beta::Array)::Array
    probabilities = predict_proba(X, beta)
    return map(p -> p >= 0.5 ? 1 : 0, probabilities)
end

predict (generic function with 1 method)

In [18]:
predict(X_data_scaled, beta)

100-element Array{Int64,1}:
 1
 1
 0
 1
 1
 1
 1
 0
 1
 1
 1
 1
 1
 ⋮
 1
 0
 1
 1
 1
 0
 1
 0
 1
 1
 1
 1

In [19]:
# inplace learning function (Gradient Descent)
"""
    learn!(X::Array, y::Array, beta::Array, alpha::AbstractFloat)

Perform one gradient-descent step and overwrite `beta` in place:

    beta ← beta - alpha * (1/m) * [X 1]' * (predict_proba(X, beta) - y)

where `m` is the number of rows of `X`.

# Arguments
- `X`: 2-d array with one sample per row.
- `y`: 1-d array of labels, one per row of `X`.
- `beta`: 1-d coefficient array with `size(X, 2) + 1` entries (intercept
  last); modified in place.
- `alpha`: learning rate.

Returns `nothing`. Fails an assertion when the dimensions or sizes of the
inputs do not match.
"""
function learn!(X::Array, y::Array, beta::Array, alpha::AbstractFloat)
    @assert ndims(X) == 2
    @assert ndims(y) == 1
    @assert ndims(beta) == 1
    @assert size(X) == (size(y)[1], size(beta)[1]-1)
    m = size(X)[1]
    residuals = predict_proba(X, beta) .- y
    X_extended = hcat(X, ones(m))
    # matrix' * vector is already a 1-d array matching beta's shape
    gradient = (X_extended' * residuals) ./ m
    beta .-= alpha .* gradient
    return nothing
end

learn! (generic function with 1 method)

In [20]:
# try it: print the coefficients before taking a gradient step
println(beta)

[-0.2847056981407118, -0.4377948464434017, 0.6514609110387156, -1.0704197309901735, 1.3742379679395351, 1.0673453103388324]


In [21]:
# one in-place gradient-descent step with learning rate 0.01;
# learn! overwrites beta, so the printed values differ slightly from before
learn!(X_data_scaled, Y_data, beta, 0.01)
println(beta)

[-0.2834869286157294, -0.43693803301083645, 0.6507196071029732, -1.0682575866647863, 1.3720747411239878, 1.0659762979742713]


In [22]:
# define the logistic regression function
"""
    train(X::Array, y::Array; learning_rate=0.01, max_iter=10, return_all=false) -> Array

Fit logistic-regression coefficients by gradient descent.

Coefficients are drawn with `Random.randn` (one per column of `X` plus one
for the intercept, stored last) and then updated `max_iter` times with
`learn!`.

# Keywords
- `learning_rate`: step size passed to `learn!`.
- `max_iter`: number of gradient steps (must be `>= 0`).
- `return_all`: when `false`, return the final coefficient vector; when
  `true`, return a `(max_iter + 1) × (size(X, 2) + 1)` matrix whose first
  row holds the initial coefficients and whose row `i + 1` holds the
  coefficients after step `i`.

Fails an assertion when the dimensions or sizes of the inputs do not match
or when `max_iter` is negative.
"""
function train(
    X::Array,
    y::Array;
    learning_rate::AbstractFloat=0.01,
    max_iter::Integer=10,
    return_all::Bool=false,
)::Array
    @assert ndims(X) == 2
    @assert ndims(y) == 1
    @assert size(X)[1] == size(y)[1]
    @assert max_iter >= 0
    beta = Random.randn(size(X)[2]+1)
    if !return_all
        for _ in 1:max_iter
            learn!(X, y, beta, learning_rate)
        end
        return beta
    end
    # Preallocate the history and copy beta into it row by row: learn!
    # mutates beta in place, so storing a reshape (which shares memory)
    # would silently overwrite earlier rows.
    history = similar(beta, max_iter + 1, length(beta))
    history[1, :] = beta
    for i in 1:max_iter
        learn!(X, y, beta, learning_rate)
        history[i + 1, :] = beta
    end
    return history
end

train (generic function with 1 method)

In [23]:
# accuracy function
"""
    accuracy(y_pred::Array, y_real::Array) -> AbstractFloat

Fraction of positions at which `y_pred` and `y_real` hold equal values.
Both inputs must be 1-d arrays of the same size; otherwise an assertion
fails.
"""
function accuracy(y_pred::Array, y_real::Array)::AbstractFloat
    @assert ndims(y_pred) == ndims(y_real) == 1
    @assert size(y_pred) == size(y_real)
    hits = count(pair -> pair[1] == pair[2], zip(y_pred, y_real))
    return hits / length(y_pred)
end

accuracy (generic function with 1 method)

In [24]:
# baseline: training-set accuracy of the global beta, which so far has
# received only a single small gradient step
accuracy(predict(X_data_scaled, beta), Y_data)

0.46

In [25]:
# train a fresh set of coefficients (100 steps, learning rate 0.5) and
# measure accuracy on the same data the model was trained on
weights = train(X_data_scaled, Y_data, max_iter=100, learning_rate=0.5)
accuracy(predict(X_data_scaled, weights), Y_data)

0.53

Since it is a linear model, we should not expect it to reach a high accuracy on a completely randomly generated dataset.  
Still, we can see an improvement over the untrained coefficients.