In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats

# This is an exploration of the Palmer Station penguins dataset 
# https://allisonhorst.github.io/palmerpenguins/
# Horst AM, Hill AP, Gorman KB (2020). palmerpenguins: Palmer Archipelago (Antarctica) penguin data. R package version 0.1.0. https://allisonhorst.github.io/palmerpenguins/. doi: 10.5281/zenodo.3960218.
# https://en.wikipedia.org/wiki/Pygoscelis

# Three hundred brush-tailed penguins were briefly captured
# and subjected to humiliating beak, flipper, and weight measurements 
# so that data science students don't have to look at Ronald Fisher's 
# iris dataset.

In [None]:
# There is a mirror of this dataset in vega-datasets.
# see if I'm allowed to download it by curl...
# curl https://raw.githubusercontent.com/vega/vega-datasets/next/data/penguins.json
!curl -OL https://raw.githubusercontent.com/vega/vega-datasets/next/data/penguins.json 

In [None]:
# That line may not work for you; if not, try
import urllib.request

req = urllib.request.Request('https://raw.githubusercontent.com/vega/vega-datasets/next/data/penguins.json')
with urllib.request.urlopen(req) as response:
   data = response.read().decode("utf8")
data[0:300]

In [None]:
data_pd = pd.read_json(data)

In [None]:
# Check that the download was data and not a webpage with an error...
!head penguins.json

In [None]:
penguins = pd.read_json("penguins.json")

In [None]:
penguins = pd.read_json(data)

In [None]:
g = penguins.iloc[np.random.permutation(len(penguins))]
len(penguins) * .25


In [None]:
penguins_train = g[87:]
penguins_test  = g[:87]
penguins_train.to_csv( "penguins_train.csv")
penguins_test.to_csv( "penguins_test.csv")
penguins = penguins_train
# There is a mistake in this cell that makes it not do what it 
# was intended.  

In [None]:
# Read in the two sets once they are fixed an in a file:
penguins_train=pd.read_csv( "penguins_train.csv")
penguins_test = pd.read_csv( "penguins_test.csv")
penguins = penguins_train


In [None]:
# How many penguins?
penguins.Species.value_counts(), len(penguins)

In [None]:
penguins.head()

In [None]:
penguins.describe()

In [None]:
penguins.Sex.value_counts()

In [None]:
# Dictionary for painting the dots high-contrast colors
colors = { "Adelie" : "green", "Gentoo": "blue", "Chinstrap": "red"}

In [None]:
plt.scatter(penguins["Beak Depth (mm)"], penguins["Body Mass (g)"] , color = penguins.Species.map(colors)  )
# Had to look up how to make a legend for this kind of data
# https://matplotlib.org/stable/gallery/text_labels_and_annotations/custom_legends.html
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor=c[1], edgecolor='k', label=c[0]) for c in colors.items()]
plt.legend(handles=legend_elements)
plt.xlabel("Beak Depth(mm)"); plt.ylabel("Body mass (g)")

In [None]:
penguins.plot( "Beak Depth (mm)", "Body Mass (g)" ,
              kind="scatter", color = penguins.Species.map(colors)  )

In [None]:
# ONE-HOT ENCODING

penguins["SP1"] = penguins.Species == "Adelie"
penguins["SP2"] = penguins.Species == "Gentoo"
penguins["SP3"] = penguins.Species == "Chinstrap"
penguins_test["SP1"]= penguins.Species == "Adelie"
penguins_test["SP2"]= penguins.Species == "Gentoo"
penguins_test["SP3"]= penguins.Species == "Chinstrap"


In [None]:
penguins["SP1"].sum(), penguins["SP2"].sum(), penguins["SP3"].sum()

In [None]:
len(penguins)
115+ 95+ 47

In [None]:
penguins[["Flipper Length (mm)", "Beak Length (mm)"]].plot("Flipper Length (mm)", "Beak Length (mm)", kind="scatter", color = penguins.Species.map(colors))
plt.savefig("PENGUIN.png", dpi=300, bbox_inches="tight")

In [None]:
penguins[["Beak Depth (mm)", "Body Mass (g)"]].values.shape, penguins.SP2.values[:, np.newaxis].shape

In [None]:
# How do I get this in the right shape?
penguins.SP2.values[:, np.newaxis].shape

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
#reg = LinearRegression().fit(penguins[["Beak Depth (mm)", "Body Mass (g)"]].values, penguins.SP2.values)

ValueError: Input X contains NaN.

In [None]:
# Ok, ok, I'll clean up my dataframe.. sorry, scipy..

In [None]:
penguinclean = penguins[np.isfinite( penguins["Beak Depth (mm)"]) & 
                        np.isfinite( penguins["Body Mass (g)"]) &
                        np.isfinite( penguins["Flipper Length (mm)"]) & 
                        np.isfinite( penguins["Beak Length (mm)"])]

In [None]:
penguinclean.isnull().any()

In [None]:
penguins_train = penguins_train[np.isfinite( penguins_train["Beak Depth (mm)"]) & 
                        np.isfinite( penguins_train["Body Mass (g)"]) &
                        np.isfinite( penguins_train["Flipper Length (mm)"]) & 
                        np.isfinite( penguins_train["Beak Length (mm)"])]

In [None]:
penguins_test = penguins_test[np.isfinite( penguins_test["Beak Depth (mm)"]) & 
                              np.isfinite( penguins_test["Body Mass (g)"]) &
                              np.isfinite( penguins_test["Flipper Length (mm)"]) & 
                              np.isfinite( penguins_test["Beak Length (mm)"])]

In [None]:
penguins_train.isnull().any(), len(penguins_test)

In [None]:
penguins_test.isnull().any(), len(penguins_test)

In [None]:
# run linear regression... this with just two columns for X against an 
# indicator variable for species 2
reg = LinearRegression().fit(penguinclean[["Beak Depth (mm)", "Body Mass (g)"]].values, penguinclean.SP2.values)

In [None]:
dir(reg)

In [None]:
reg.coef_, reg.intercept_

In [None]:
# Let us see if I can build the linear classifier out of the 
# coefficients in reg... 
X = penguinclean[["Beak Depth (mm)", "Body Mass (g)"]].values
Y = penguinclean.SP2
YHAT = np.dot(X, reg.coef_ ) + reg.intercept_

In [None]:
# My linear-leastsquares-fit coefficients were 
#  (array([-0.13720996,  0.00033089]), 1.3226593522496826)
# That's b0, b1, and C...
# yhat = b0 * X[:,0] + b1* X[:,1] + C 
# high school algebra... the boundary is x1div = (0.5 - c) / b1 - b0 /b1 * x0

x0div = np.arange(14,24)
x1div = (0.67 - reg.intercept_)/ reg.coef_[1] - reg.coef_[0] / reg.coef_[1] * x0div

In [None]:
plt.scatter(X[:,0], X[:,1], color=penguinclean.Species.map(colors) )
plt.legend(handles=legend_elements)

plt.plot(x0div, x1div)

In [None]:
plt.scatter(penguins["Beak Depth (mm)"], penguins["Body Mass (g)"] , color = penguins.Species.map(colors)  )
plt.legend(handles=legend_elements)
plt.plot(x0div, x1div)

In [None]:
# "Shooting fish in a barrel."

# We took two features, performed linear regression, and 
# used the coefficients to build a linear classifier.

# Gentoo is easy to tell apart from the other two species of brush-tailed penguin.


In [None]:
# Other dimensions will be informative about the difference between
# the red and the green birds:
plt.scatter(penguins["Beak Length (mm)"], penguins["Flipper Length (mm)"] , color = penguins.Species.map(colors)  )
plt.legend(handles=legend_elements)
# I can glance at this graph and guess at the locations of the
# lines that would separate each of the three species from the
# rest, but I will need a little more theory to do three-way classificaiton. 

In [None]:
# Now I'm going to go after a harder problem; Adelie penguins with all four X
reg4_1 = LinearRegression().fit(penguins_train[["Beak Depth (mm)", "Body Mass (g)", "Beak Length (mm)", "Flipper Length (mm)"]].values, penguins_train.SP1.values)

In [None]:
reg4_1.coef_, reg4_1.intercept_

In [None]:
print(["Beak Depth (mm)", "Body Mass (g)", "Beak Length (mm)", "Flipper Length (mm)"])
reg4_1.coef_, reg4_1.intercept_

In [None]:
# Can I interpret these coefficients?  I can interpret their signs, of course..
# Three of the measurements are in mm, one is in g, and they have different
# relevant scales.


In [None]:
penguins_train.std()

In [None]:
# Let's set put the standard deviations of each column of X into an array:
feature_std = np.array([1.964146, 826.402823,  5.046194,  13.464290 ])
# If I arranged the columns correctly, this product 
print(["Beak Depth (mm)", "Body Mass (g)", "Beak Length (mm)", "Flipper Length (mm)"])

reg4_1.coef_ * feature_std



In [None]:
# tells us a little more about which fields had more weight.  
# Flipper length and Body mass aren't as weighted as two 
# beak measurements.

In [None]:
SP1PREDICT_train= np.dot( penguins_train[["Beak Depth (mm)", "Body Mass (g)", 'Beak Length (mm)', "Flipper Length (mm)"]].values,reg4_1.coef_) + reg4_1.intercept_

In [None]:
penguins_train["SP1PREDICT"] = SP1PREDICT_train > 0.5
penguins_train["SP1LINEAR"] = SP1PREDICT_train 

In [None]:
SP1PREDICT_test= np.dot( penguins_test[["Beak Depth (mm)", "Body Mass (g)", 'Beak Length (mm)', "Flipper Length (mm)"]].values,reg4_1.coef_) + reg4_1.intercept_

In [None]:
penguins_test["SP1PREDICT"] = SP1PREDICT_test > 0.5
penguins_test["SP1LINEAR"] = SP1PREDICT_test 

In [None]:
penguins_train.groupby(["SP1PREDICT", "SP1"]).SP1PREDICT.count()

In [None]:
len(penguins_train)

In [None]:
# On the training data, which is cheating,  
# Out of 256 penguins, the four-dimensional linear classifier got 252 right
# and 4 wrong on the "is-it-an-Adelie"

In [None]:
penguins_test.groupby(["SP1PREDICT", "SP1"]).SP1PREDICT.count()

In [None]:
penguins_test.head()

In [None]:
# Since the scores for a binary classifier are in one dimension, 
# I can histogram them

plt.hist(SP1PREDICT_train, bins=50)

In [None]:
plt.hist(SP1PREDICT_test, bins=50)

In [None]:
plt.hist(SP1PREDICT_test, bins=10)

In [None]:
# And they are nicely bimodal.
#  SP1PREDICT_train= np.dot( penguins_train[["Beak Depth (mm)", "Body Mass (g)", 'Beak Length (mm)', "Flipper Length (mm)"]].values,reg2.coef_) + reg2.intercept_

def penguinspredict(x):
  #  SP1PREDICTION= np.dot( x[["Beak Depth (mm)", "Body Mass (g)", 'Beak Length (mm)', "Flipper Length (mm)"]].values,reg4_1.coef_) + reg4_1.intercept_
    SP1PREDICTION= np.dot( x,reg4_1.coef_) + reg4_1.intercept_

    return(SP1PREDICTION > 0.5)


In [None]:
# TEST IT (that penguinspredict does something)
penguinspredict(penguins_train[["Beak Depth (mm)", "Body Mass (g)", 'Beak Length (mm)', "Flipper Length (mm)"]].values)

In [None]:
mean0 = penguins_train["Beak Depth (mm)"].mean()
mean1 = penguins_train["Body Mass (g)"].mean() 
mean2 = penguins_train['Beak Length (mm)'].mean()
mean3 = penguins_train["Flipper Length (mm)"].mean()


In [None]:
xgrid = np.arange(170, 230,2) # range for flipper length
ygrid = np.arange(30,59,1)  # range for beak length
xax, yax = np.meshgrid(xgrid, ygrid, indexing="ij")
z = np.zeros(xax.shape)
print(xax.shape, yax.shape, z.shape)
for i in range(len(xgrid)):
    for j in range(len(ygrid)):
        x = np.array([mean0, mean1, ygrid[j], xgrid[i]])
       # print(x)
        z[i,j]= penguinspredict(x)


In [None]:
colorsnum = { 0 : "green", 1: "blue", 2: "red"}
c = [colorsnum[i] for i in z.reshape(-1)]
plt.scatter(yax,xax, color=c, alpha=0.3)

plt.scatter(penguins["Beak Length (mm)"], penguins["Flipper Length (mm)"] , color = penguins.Species.map(colors)  )
plt.legend(handles=legend_elements)
plt.xlabel("Beak Length (mm)")
plt.ylabel( "Flipper Length (mm)")

In [None]:
# This is a contour-like plot of the boundary between "is-it-Adelie" and not Adelie.
# For the *one* dimensional classifier 

##   Let's try to do HW2 with sklearn.LogisticRegression...

In [135]:
penguins_train

Unnamed: 0.1,Unnamed: 0,Species,Island,Beak Length (mm),Beak Depth (mm),Flipper Length (mm),Body Mass (g),Sex,SP1,SP2,SP3,SP1PREDICT,SP1LINEAR
0,194,Chinstrap,Dream,50.9,19.1,196.0,3550.0,MALE,False,False,True,False,0.083466
1,295,Gentoo,Biscoe,48.6,16.0,230.0,5800.0,MALE,False,True,False,False,0.075181
2,263,Gentoo,Biscoe,49.6,15.0,216.0,4750.0,MALE,False,True,False,False,-0.160671
3,158,Chinstrap,Dream,46.1,18.2,178.0,3250.0,FEMALE,False,False,True,False,0.397042
4,292,Gentoo,Biscoe,48.2,15.6,221.0,5100.0,MALE,False,True,False,False,0.015667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
252,214,Chinstrap,Dream,45.7,17.0,195.0,3650.0,FEMALE,False,False,True,False,0.266289
253,20,Adelie,Biscoe,37.8,18.3,174.0,3400.0,FEMALE,True,False,False,True,1.031446
254,212,Chinstrap,Dream,51.9,19.5,206.0,3950.0,MALE,False,False,True,False,0.050500
255,61,Adelie,Biscoe,41.3,21.1,195.0,4400.0,MALE,True,False,False,True,1.075490


In [136]:
X4 = penguins_train.dropna()[["Beak Depth (mm)", "Body Mass (g)", 'Beak Length (mm)', "Flipper Length (mm)"]]
Y4 = penguins_train.dropna().Sex 

In [137]:
from sklearn.linear_model import LogisticRegression
fit4sex = LogisticRegression().fit(X4, Y4)

In [138]:
fit4sex.coef_, fit4sex.intercept_

(array([[-0.01635048,  0.00125618, -0.0244624 , -0.03436507],
        [-0.40947577, -0.00358774, -0.06954419,  0.13195433],
        [ 0.42582625,  0.00233156,  0.09400659, -0.09758926]]),
 array([-0.00056779,  0.00225905, -0.00169126]))

In [None]:
# WHAT IS THIS  ? ?

In [139]:
penguins_train.Sex.value_counts()

MALE      133
FEMALE    116
.           1
Name: Sex, dtype: int64

In [None]:
# <rant>  sklearn.linear_model.LogisticRegression
# does not have the same input type as LinearRegression
# or most of the rest of the sklearn ML models... 
# LogisticRegression takes a 1d vector of FACTORS for y</rant>


In [140]:
fit4sex.classes_

array(['.', 'FEMALE', 'MALE'], dtype=object)

In [141]:
penguins.iloc[np.where(penguins.isnull())]

Unnamed: 0,Sex,Beak Length (mm),Beak Depth (mm),Flipper Length (mm),Body Mass (g),Sex.1,Sex.2,Sex.3,Sex.4,Sex.5,Beak Length (mm).1,Beak Depth (mm).1,Flipper Length (mm).1,Body Mass (g).1,Sex.6
26,,47.3,13.8,216.0,4725.0,,,,,,47.3,13.8,216.0,4725.0,
67,,,,,,,,,,,,,,,
67,,,,,,,,,,,,,,,
67,,,,,,,,,,,,,,,
67,,,,,,,,,,,,,,,
67,,,,,,,,,,,,,,,
140,,37.8,17.3,180.0,3700.0,,,,,,37.8,17.3,180.0,3700.0,
164,,37.8,17.1,186.0,3300.0,,,,,,37.8,17.1,186.0,3300.0,
170,,46.2,14.4,214.0,4650.0,,,,,,46.2,14.4,214.0,4650.0,
190,,42.0,20.2,190.0,4250.0,,,,,,42.0,20.2,190.0,4250.0,


In [142]:
penguins.query("Sex == '.'")

Unnamed: 0.1,Unnamed: 0,Species,Island,Beak Length (mm),Beak Depth (mm),Flipper Length (mm),Body Mass (g),Sex,SP1,SP2,SP3
20,336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.,False,True,False


In [143]:
unknownpenguin = penguins.query("Sex == '.'")[["Beak Depth (mm)", "Body Mass (g)", 'Beak Length (mm)', "Flipper Length (mm)"]] 


In [144]:
fit4sex.predict(unknownpenguin)

array(['FEMALE'], dtype=object)

In [145]:
fit4sex.predict_proba(unknownpenguin), fit4sex.classes_

(array([[0.00856516, 0.63239252, 0.35904232]]),
 array(['.', 'FEMALE', 'MALE'], dtype=object))

In [146]:
predictsex = fit4sex.predict(X4TEST)

In [147]:
confusion_matrix(predictsex, penguins_test.dropna().Sex)

array([[35,  5],
       [14, 30]])

In [148]:
X4 = penguins_train.dropna()[["Beak Depth (mm)", "Body Mass (g)", 'Beak Length (mm)', "Flipper Length (mm)"]]
YSPECIES = penguins_train.dropna().Species
X4TEST = penguins_test.dropna()[["Beak Depth (mm)", "Body Mass (g)", 'Beak Length (mm)', "Flipper Length (mm)"]]
YSPECIESTEST = penguins_test.dropna().Species

In [149]:
YSPECIES.value_counts()

Adelie       116
Gentoo        90
Chinstrap     44
Name: Species, dtype: int64

In [150]:
fit4species = LogisticRegression().fit(X4, YSPECIES)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [151]:
fit4species = LogisticRegression(max_iter=500).fit(X4, YSPECIES)

In [152]:
fit4species.coef_, fit4species.intercept_, fit4species.classes_

(array([[ 1.59735843e+00,  7.50346433e-04, -9.28392716e-01,
          5.52903533e-02],
        [-3.03853668e-01, -4.77906015e-03,  9.83345061e-01,
         -9.41261560e-02],
        [-1.29350476e+00,  4.02871371e-03, -5.49523449e-02,
          3.88358027e-02]]),
 array([ 0.09054777, -0.0618506 , -0.02869717]),
 array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object))

In [153]:
# And the sklearn API has .predict() methods for almost everything:
predictions = fit4species.predict(X4TEST)
predictions

array(['Adelie', 'Adelie', 'Gentoo', 'Chinstrap', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Chinstrap', 'Chinstrap', 'Chinstrap',
       'Gentoo', 'Gentoo', 'Adelie', 'Adelie', 'Gentoo', 'Gentoo',
       'Gentoo', 'Chinstrap', 'Adelie', 'Gentoo', 'Chinstrap',
       'Chinstrap', 'Chinstrap', 'Adelie', 'Chinstrap', 'Chinstrap',
       'Gentoo', 'Chinstrap', 'Gentoo', 'Adelie', 'Chinstrap', 'Gentoo',
       'Gentoo', 'Chinstrap', 'Adelie', 'Gentoo', 'Chinstrap',
       'Chinstrap', 'Chinstrap', 'Adelie', 'Gentoo', 'Adelie',
       'Chinstrap', 'Gentoo', 'Adelie', 'Gentoo', 'Gentoo', 'Adelie',
       'Adelie', 'Adelie', 'Gentoo', 'Chinstrap', 'Chinstrap', 'Gentoo',
       'Adelie', 'Adelie', 'Gentoo', 'Gentoo', 'Chinstrap', 'Adelie',
       'Gentoo', 'Adelie', 'Gentoo', 'Gentoo', 'Adelie', 'Adelie',
       'Chinstrap', 'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Gentoo',
       'Adelie', 'Adelie', 'Chinstrap', 'Gentoo', 'Adelie', 'Chinstrap',
       'Chinstrap', 'Gentoo', 'Gentoo', 'Ge

In [154]:
from sklearn.metrics import confusion_matrix
# Confusion_matrix takes one-hot encoding... but 
# LogisticRegression.predict() produces effing labels.

In [None]:
Y_onehot = pd.get_dummies(YSPECIES)
Ytest_onehot = pd.get_dummies(YSPECIESTEST)
predictions_onehot = pd.get_dummies(predictions)

In [None]:
predictions_onehot

In [None]:
confusion_matrix(predictions_onehot, Ytest_onehot)

In [None]:
predictions_onehot.head()

In [155]:
fit4species = LogisticRegression(max_iter=500, 
                multi_class="multinomial").fit(X4, YSPECIES)

In [156]:
predictions = fit4species.predict(X4TEST)
predictions_onehot = pd.get_dummies(predictions)
predictions_onehot

Unnamed: 0,Adelie,Chinstrap,Gentoo
0,1,0,0
1,1,0,0
2,0,0,1
3,0,1,0
4,1,0,0
...,...,...,...
79,0,1,0
80,0,0,1
81,0,0,1
82,0,0,1


confusion_matrix(predictions_onehot, Ytest_onehot)

In [None]:
ValueError: multilabel-indicator is not supported

In [None]:
predictions_onehot.sum(axis=1).value_counts()

In [None]:
Ytest_onehot.sum(axis=1).value_counts()

In [None]:
# Somebody didn't rtfm earlier...
# https://stackoverflow.com/questions/46953967/multilabel-indicator-is-not-supported-for-confusion-matrix
# Input to class must be labels, not one-hot-encoding.


In [157]:
confusion_matrix(YSPECIESTEST, predictions)

array([[30,  0,  0],
       [ 0, 24,  0],
       [ 0,  0, 30]])

In [None]:
# Perfect..  Pros / cons of perfect classification ? ?


In [None]:
Ytest_onehot

In [158]:
# FYI if I want to go back from one-hot-encoding to labels:
Ytest_onehot.values.argmax(axis=1)


array([0, 0, 2, 1, 0, 0, 0, 0, 1, 1, 1, 2, 2, 0, 0, 2, 2, 2, 1, 0, 2, 1,
       1, 1, 0, 1, 1, 2, 1, 2, 0, 1, 2, 2, 1, 0, 2, 1, 1, 1, 0, 2, 0, 1,
       2, 0, 2, 2, 0, 0, 0, 2, 1, 1, 2, 0, 0, 2, 2, 1, 0, 2, 0, 2, 2, 0,
       0, 1, 0, 0, 2, 0, 2, 0, 0, 1, 2, 0, 1, 1, 2, 2, 2, 2])