In [1]:
import numpy as np
import scipy as sc
from sklearn.linear_model import LogisticRegression
import plotly.offline as py
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode()
from itertools import product
# Seed NumPy's global RNG so every sampled observation below is reproducible.
# (Assigning `np.random.seed = 0` would overwrite the function, not seed it.)
np.random.seed(0)

In [2]:
# LARGE DOGS AND SMALL HORSES - SET DISTRIBUTION PARAMETERS
# Each feature of each class is described by a (mean, standard deviation) pair.

number_of_observations = 100  # per class

# Dogs
dog_weight_mean, dog_weight_stddev = 80, 10
dog_height_mean, dog_height_stddev = 20, 10

# Horses
horse_weight_mean, horse_weight_stddev = 100, 10
horse_height_mean, horse_height_stddev = 42, 10

In [3]:
# GENERATE SAMPLE OBSERVATIONS
# Draw each feature independently from its normal distribution. The draw
# order (dog weights, dog heights, horse weights, horse heights) is kept
# fixed so the random stream is consumed the same way on every run.

dog_weights = np.random.normal(loc=dog_weight_mean,
                               scale=dog_weight_stddev,
                               size=number_of_observations)
dog_heights = np.random.normal(loc=dog_height_mean,
                               scale=dog_height_stddev,
                               size=number_of_observations)
horse_weights = np.random.normal(loc=horse_weight_mean,
                                 scale=horse_weight_stddev,
                                 size=number_of_observations)
horse_heights = np.random.normal(loc=horse_height_mean,
                                 scale=horse_height_stddev,
                                 size=number_of_observations)

# Class labels: 0 = dog, 1 = horse.
dog_targets = [0 for _ in range(number_of_observations)]
horse_targets = [1 for _ in range(number_of_observations)]

# Dogs first, then horses, for both features.
weights = np.concatenate((dog_weights, horse_weights))
heights = np.concatenate((dog_heights, horse_heights))

# One row per observation: column 0 is weight, column 1 is height.
X = np.column_stack((weights, heights))
y = np.concatenate((dog_targets, horse_targets))

In [4]:
# Sample each class from a 2-D normal over (weight, height). The covariance
# is diagonal (variances only), so the two features are uncorrelated.
dog_center = [dog_weight_mean, dog_height_mean]
dog_covariance = np.diag([dog_weight_stddev**2, dog_height_stddev**2])
X_dogs = np.random.multivariate_normal(mean=dog_center,
                                       cov=dog_covariance,
                                       size=number_of_observations)
y_dogs = [0 for _ in range(number_of_observations)]

horse_center = [horse_weight_mean, horse_height_mean]
horse_covariance = np.diag([horse_weight_stddev**2, horse_height_stddev**2])
X_horses = np.random.multivariate_normal(mean=horse_center,
                                         cov=horse_covariance,
                                         size=number_of_observations)
y_horses = [1 for _ in range(number_of_observations)]

# Stack dogs above horses; these X and y replace any earlier X and y.
X = np.concatenate((X_dogs, X_horses), axis=0)
y = np.hstack((y_dogs, y_horses))



In [5]:
# MAKE MODEL
# Fit a logistic regression with default settings on the (weight, height)
# features; the fitted estimator is the cell's displayed result.
model = LogisticRegression()
model.fit(X=X, y=y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
# PLOT UNDERLYING DISTRIBUTIONS
# Shared weight grid spanning 4 standard deviations below the dog mean to
# 4 standard deviations above the horse mean.
plot_weights = np.linspace(dog_weight_mean - 4*dog_weight_stddev,
                           horse_weight_mean + 4*horse_weight_stddev,
                           number_of_observations*2)


def _weight_density_trace(label, mean, stddev):
    """Return a Scatter trace of the normal pdf N(mean, stddev) over plot_weights."""
    density = sc.stats.norm.pdf(plot_weights, loc=mean, scale=stddev)
    return Scatter(name=label,
                   x=plot_weights,
                   y=density,
                   mode='lines+markers',
                   line=Line(shape='spline'))


dog_weight_distribution = _weight_density_trace('Dog Weight Distribution',
                                                dog_weight_mean,
                                                dog_weight_stddev)
horse_weight_distribution = _weight_density_trace('Horse Weight Distribution',
                                                  horse_weight_mean,
                                                  horse_weight_stddev)

distribution_data = Data([dog_weight_distribution, horse_weight_distribution])

py.iplot(distribution_data)

In [7]:
# PLOT HISTOGRAMS OF SAMPLE
# Column 0 of X holds the weights; split it by class label (0 = dog, 1 = horse).
sampled_weights = X[:, 0]

dog_weight_histogram = Histogram(name='Dog Weights',
                                 x=sampled_weights[y == 0])
horse_weight_histogram = Histogram(name='Horse Weights',
                                   x=sampled_weights[y == 1])

histogram_data = Data([dog_weight_histogram, horse_weight_histogram])

py.iplot(histogram_data)

In [8]:
# PLOT FEATURE SPACE
# Plot the observations the model was actually fit on (X, y), not the
# separately sampled dog_*/horse_* arrays, so the figure and the model
# describe the same data. Column 0 of X is weight, column 1 is height.
is_dog = (y == 0)
is_horse = (y == 1)

dog_feature_space = Scatter(name='Dogs',
                            x=X[is_dog, 0],
                            y=X[is_dog, 1],
                            mode='markers')
horse_feature_space = Scatter(name='Horses',
                              x=X[is_horse, 0],
                              y=X[is_horse, 1],
                              mode='markers')
feature_data = Data([dog_feature_space,
                     horse_feature_space])

# Axis ranges span every observation of both classes, so no point is clipped
# regardless of which class holds the extreme value.
layout = Layout(title='Feature Space',
                xaxis=XAxis(title='Weight (lbs)',
                            range=[X[:, 0].min(),
                                   X[:, 0].max()]),
                yaxis=YAxis(title='Height (in)',
                            range=[X[:, 1].min(),
                                   X[:, 1].max()]))

feature_space_figure = Figure(data=feature_data, layout=layout)

py.iplot(feature_space_figure)

What is the value of the hypothesis function at the decision boundary?

In [9]:
# PLOT DECISION BOUNDARY
intercept = model.intercept_
theta1, theta2 = model.coef_[0]

def decision_boundary(x1):
    """Return the x2 (height) on the decision boundary for the given x1 (weight).

    The boundary is where intercept + theta1*x1 + theta2*x2 == 0; this
    solves that equation for x2. `x1` may be a scalar or a NumPy array.
    """
    x2 = -(intercept + theta1*x1)/theta2
    return x2

x1 = np.linspace(50,150,number_of_observations)
x2 = decision_boundary(x1)

# 'lines' is the valid scatter mode for a line trace ('line' is not a mode).
decision_boundary_plot = Scatter(name='Decision Boundary',
                                 x=x1,
                                 y=x2,
                                 mode='lines')

# Build a new trace list instead of appending to feature_data in place, so
# re-running this cell does not stack duplicate boundary traces.
feature_data_with_boundary = Data(list(feature_data) + [decision_boundary_plot])
feature_space_figure_with_boundary = Figure(data=feature_data_with_boundary,
                                            layout=layout)

py.iplot(feature_space_figure_with_boundary)


In [10]:
# Evaluate the model's class-1 probability on a regular weight x height grid.
grid_points = number_of_observations*2
sample_weights = np.linspace(50, 150, grid_points)
sample_heights = np.linspace(0, 75, grid_points)

# Weight varies slowest, height fastest (same ordering as itertools.product).
samples = [(weight, height)
           for weight in sample_weights
           for height in sample_heights]

# Column 1 of predict_proba is the probability of class 1.
hypotheses = model.predict_proba(samples)[:,1]

# After reshaping, hypotheses[i, j] belongs to (sample_weights[i], sample_heights[j]).
hypotheses = hypotheses.reshape((len(sample_weights), len(sample_heights)))

In [11]:
# 3-D VIEW: HYPOTHESIS SURFACE WITH THE TRAINING OBSERVATIONS
# hypotheses[i, j] is the probability at (sample_weights[i], sample_heights[j]),
# whereas a plotly surface reads z[i][j] as the value at (x[j], y[i]).
# Transpose so the surface is drawn in the right orientation.
hypothesis_surface = Surface(z = hypotheses.T,
                             x = sample_weights,
                             y = sample_heights,
                             colorscale = [[0, 'rgb(31,119,180)'],
                                           [0.5, 'rgb(143, 123, 196)'],
                                           [1, 'rgb(255,127,97)']],
                             name='Hypothesis Function')

# Show the observations the model was fit on (X, y); z is the class label
# (0 = dog, 1 = horse). mode='markers' keeps them as separate points instead
# of joining consecutive observations with lines.
is_dog = (y == 0)
is_horse = (y == 1)

dog_scatter_3d = Scatter3d(x = X[is_dog, 0],
                           y = X[is_dog, 1],
                           z = y[is_dog],
                           mode = 'markers',
                           name = 'Dogs')

horse_scatter_3d = Scatter3d(x = X[is_horse, 0],
                             y = X[is_horse, 1],
                             z = y[is_horse],
                             mode = 'markers',
                             name = 'Horses')

data_3d_plot = Data([dog_scatter_3d,
                     horse_scatter_3d,
                     hypothesis_surface])

figure_3d = Figure(data=data_3d_plot)

py.iplot(figure_3d)



In [12]:
# PLOT TO SHOW TP/FP/TN/FN
# Sigmoid curve, the 0.5 threshold, and two synthetic classes: negatives are
# drawn at y=0 and positives at y=1.
examples = 25
x_sigmoid = np.linspace(-10, 10, examples)
y_sigmoid = 1./(1 + np.exp(-x_sigmoid))

# 'lines' is the valid scatter mode for a line trace ('line' is not a mode).
sigmoid_plot = Scatter(x = x_sigmoid,
                       y = y_sigmoid,
                       mode = 'lines')

# Horizontal line at h(x) = 0.5.
boundary_plot = Scatter(x = x_sigmoid,
                        y = [.5]*len(x_sigmoid),
                        mode = 'lines')

# Negatives are centred at -5 and positives at +5, both with standard
# deviation 5, so the two samples overlap around x = 0.
negatives = np.random.normal(loc=-5, scale=5, size=examples)
positives = np.random.normal(loc=5, scale=5, size=examples)
negative_plot = Scatter(x = negatives,
                        y = [0]*len(negatives),
                        mode='markers')
positive_plot = Scatter(x = positives,
                        y = [1]*len(positives),
                        mode='markers')
layout = Layout(xaxis = XAxis(range = [-10, 10],
                              title = 'x'),
                yaxis = YAxis(title = 'h(x) - Estimated Probability'))

# boundary_plot was previously built but never drawn; include it so the
# threshold is visible alongside the sigmoid and the samples.
data = Data([negative_plot, positive_plot, sigmoid_plot, boundary_plot])
figure = Figure(data=data, layout=layout)

py.iplot(figure)