# Logistic regression

In this notebook we will study **Logistic Regression**.
We will make some interactive graphs that let us see how it works.

We will use interactive Jupyter widgets and the libraries **matplotlib** and **bqplot** for visualizations

To obtain more info you can read these posts [SPANISH]:


**Author**: Pablo González Carrizo ([unmonoqueteclea](https://twitter.com/unmonoqueteclea))

**Web**: https://unmonoqueteclea.github.io

## Importing dependencies

In [1]:
import math
import numpy as np
from bqplot import ( LinearScale, Axis, Scatter, Lines, Label, Figure)
from ipywidgets import HBox, VBox, Layout
import pandas as pd
from scipy import special
from sklearn import preprocessing 

## Defining sigmoid function

In [2]:
def sigmoid(z):
    return(1 / (1 + np.exp(-z)))

In [3]:
x_values = np.arange(-8,8)
test_sigmoid=[sigmoid(z) for z in x_values ]

Plotting the sigmoid function with bqplot

In [4]:
#Scalers
sc_x = LinearScale()
sc_y = LinearScale()
#Axis
ax_x = Axis(scale=sc_x, label='')
ax_y = Axis(scale=sc_y, orientation='vertical', tick_format='0.2f', label='')
#Creating the graph
line = Lines(x=x_values,y=test_sigmoid,scales={'x': sc_x, 'y': sc_y},colors=['blue'])
fig = Figure(marks=[line], axes=[ax_x, ax_y],layout=Layout(width='100%'), title="Sigmoid function")
#Displaying the graph
VBox([fig])

A Jupyter Widget

## Creating points

In [5]:
#(CLASS Y = 1) Positions of points with y = 1
posX1 = np.array([10,45,23,12,3 ,18,30,35, 5,32])
posY1 = np.array([12,16,20,60,80,99,54, 9,40,65])
#Creating matrix from positions
X1 = np.c_[np.ones(posX1.shape[0]),posX1,posY1]
#(CLASS Y = 0) Positions of points with y = 0
posX2 = np.array([67,53,90,87,71,59,95,80,65,80])
posY2 = np.array([34,67,54,8, 78,87,80,50,60,90])
#Creating matrix from positions
X2 = np.c_[np.ones(posX2.shape[0]),posX2,posY2]

X=np.concatenate([X1,X2])
#Classes (1 or 0)
y=np.concatenate([np.ones(posX1.shape[0]),np.zeros(posX2.shape[0])])
m = y.size # Number of training examples

## Plot function

In [6]:
def plot_points(x1,x2,y1,y2,title="",boundary=None):
    #Scalers
    sc_x = LinearScale(min=0,max=100)
    sc_y = LinearScale(min=0,max=100)
    #Axis
    ax_x = Axis(scale=sc_x, label='')
    ax_y = Axis(scale=sc_y, orientation='vertical', tick_format='0.2f', label='')
    #Creating plot
    scatt =  Scatter(x=x1, y=y1, scales={'x': sc_x, 'y': sc_y}, colors=['red'])
    scatt2 = Scatter(x=x2, y=y2, scales={'x': sc_x, 'y': sc_y}, colors=['blue'])
    if(boundary is None):
        fig = Figure(marks=[scatt,scatt2], axes=[ax_x, ax_y],layout=Layout(width='100%'), title=title)
    else:
        lines = Lines(x=boundary[0],y=boundary[1],scales={'x': sc_x, 'y': sc_y},colors=['green'])
        fig = Figure(marks=[scatt,scatt2,lines], axes=[ax_x, ax_y],layout=Layout(width='100%'), title=title)
    return fig

## Displying plot with all the points

In [7]:
fig = plot_points(posX1,posX2,posY1,posY2,title="")
HBox([fig])

A Jupyter Widget

## Hypotesis and Cost function

In [19]:
def h(mytheta,myX): 
    #The expit function, also known as the logistic function, 
    #is defined as expit(x) = 1/(1+exp(-x)). 
    #It is the inverse of the logit function.
    return special.expit(np.dot(myX,mytheta))

#Cost function
def computeCost(mytheta,myX,myy,regularization = 0): 
    term1 = np.dot( -np.array(myy).T , np.log(h(mytheta,myX)) )
    term2 = np.dot( (1-np.array(myy)).T , np.log(1-h(mytheta,myX)) )
    regterm = (regularization/2) * np.sum(np.dot(mytheta[1:].T,mytheta[1:])) #Skip theta0
    return float( (1./m) * ( np.sum(term1 - term2) + regterm ) )

In [20]:
initial_theta = np.zeros((X.shape[1],1))
computeCost(initial_theta,X,y) #Computing initial cost

0.6931471805599454

# Representing cost function

In [21]:
h = np.arange(0.0001,1,0.001)
y=0
# Computing cost
cost1 = [-math.log(h_value,10) for h_value in h]
cost2=[-math.log(1-h_value,10) for h_value in h]
#Scalers
sc_x = LinearScale()
sc_y = LinearScale()
#Axis
ax_x = Axis(scale=sc_x, label='h(x)')
ax_y = Axis(scale=sc_y, orientation='vertical', tick_format='0.2f', label='Cost')
#Creating the graph
line1 = Lines(x=h,y=cost1,scales={'x': sc_x, 'y': sc_y},colors=['blue'])
fig1 = Figure(marks=[line1], axes=[ax_x, ax_y],layout=Layout(width='100%'), title="y=1")
line2 = Lines(x=h,y=cost2,scales={'x': sc_x, 'y': sc_y},colors=['blue'])
fig2 = Figure(marks=[line2], axes=[ax_x, ax_y],layout=Layout(width='100%'), title="y=0")
#Displaying the graph
HBox([fig1,fig2])

A Jupyter Widget

# Obtaining theta

In [22]:
#This function minimizes our cost function using the "downhill simplex algorithm."
#http://docs.scipy.org/doc/scipy-0.16.0/reference/generated/scipy.optimize.fmin.html
from scipy import optimize
def optimizeTheta(mytheta,myX,myy,mylambda=0.):
    result = optimize.fmin(computeCost, x0=mytheta, args=(myX, myy, mylambda), maxiter=400, full_output=True)
    return result[0], result[1]


In [23]:
theta, mincost = optimizeTheta(initial_theta,X,y)

TypeError: 'numpy.ndarray' object is not callable

In [None]:
def makePrediction(mytheta, myx):
    return h(mytheta,myx) >= 0.5

#Compute the percentage of samples I got correct:
pos_correct = float(np.sum(makePrediction(theta,X1)))
neg_correct = float(np.sum(np.invert(makePrediction(theta,X2))))
tot = len(X1)+len(X2)
prcnt_correct = float(pos_correct+neg_correct)/tot
print("Fraction of training samples correctly predicted: %f." % prcnt_correct)

In [None]:
def decission_boundary(mytheta):
    boundary_xs = np.array([np.min(X[:,1]), np.max(X[:,1])])
    boundary_ys = (-1./theta[2])*(theta[0] + theta[1]*boundary_xs)
    return (boundary_xs,boundary_ys)
    

In [None]:
fig = plot_points(posX1,posX2,posY1,posY2,title="",boundary=decission_boundary(theta))
HBox([fig])