In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats

# This is an exploration of the Palmer Station penguins dataset 
# https://allisonhorst.github.io/palmerpenguins/
# Horst AM, Hill AP, Gorman KB (2020). palmerpenguins: Palmer Archipelago (Antarctica) penguin data. R package version 0.1.0. https://allisonhorst.github.io/palmerpenguins/. doi: 10.5281/zenodo.3960218.
# https://en.wikipedia.org/wiki/Pygoscelis

# Three hundred brush-tailed penguins were briefly captured
# and subjected to humiliating beak, flipper, and weight measurements 
# so that data science students don't have to look at Ronald Fisher's 
# iris dataset.

In [None]:
# There is a mirror of this dataset in vega-datasets.
# See if I'm allowed to download it by curl...
# curl https://raw.githubusercontent.com/vega/vega-datasets/next/data/penguins.json
# -O saves the file under its remote name (penguins.json) in the current
# working directory; -L follows HTTP redirects.
!curl -OL https://raw.githubusercontent.com/vega/vega-datasets/next/data/penguins.json 

In [None]:
# Check that the download was data and not a webpage with an error...
# (prints the first lines of penguins.json from the current directory)
!head penguins.json

In [None]:
# Load the dataset into a DataFrame.
# The curl cell saves penguins.json into the current working directory, but
# this cell originally read only ../data/penguins.json, so a fresh
# Restart & Run All failed unless a copy already existed there.
# Keep preferring the ../data copy when present; otherwise use the download.
# If neither exists the first candidate is passed through so the read fails
# loudly on the original path.
penguins_candidates = [Path("../data/penguins.json"), Path("penguins.json")]
penguins_path = next(
    (candidate for candidate in penguins_candidates if candidate.exists()),
    penguins_candidates[0],
)
penguins = pd.read_json(penguins_path)

In [None]:
g = penguins.iloc[np.random.permutation(len(penguins))]
len(penguins) * .25  # take one quarter of the penguins as a holdout set


In [None]:
penguins_train = g[87:]
penguins_test  = g[:87]
penguins_train.to_csv( "penguins_train.csv")
penguins_test.to_csv( "penguins_test.csv")
penguins = penguins_train
# There is a mistake in this cell that makes it not do what it was intended.  

In [None]:
# Further, there is a suboptimality in assigning penguins to penguins_train this early.

In [None]:
# Display the shuffled frame to eyeball that the row order is no longer sorted.
g

In [None]:
# Plot the original row labels in their shuffled order; a well-mixed shuffle
# shows no trend.
plt.plot(g.index)

In [None]:
# Head count: birds per species, plus the total number of rows.
species_tally = penguins["Species"].value_counts()
species_tally, len(penguins)

In [None]:
# Colour lookups keyed by species name: saturated shades in `colors`,
# pale counterparts of the same hues in `colors2`.
colors = dict(Adelie="green", Gentoo="blue", Chinstrap="red")
colors2 = dict(Adelie="lightgreen", Gentoo="lightblue", Chinstrap="pink")

In [None]:
# Beak depth against body mass, one dot per penguin, coloured by species.
species_colors = penguins.Species.map(colors)
plt.scatter(penguins["Beak Depth (mm)"], penguins["Body Mass (g)"], color=species_colors)
# The scatter is drawn in a single call, so the legend is assembled by hand
# from one proxy patch per species, following
# https://matplotlib.org/stable/gallery/text_labels_and_annotations/custom_legends.html
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=shade, edgecolor='k', label=species)
    for species, shade in colors.items()
]
plt.legend(handles=legend_elements)

In [None]:
# Same beak-depth / body-mass scatter, this time through the pandas plotting API.
species_colors = penguins.Species.map(colors)
penguins.plot(
    x="Beak Depth (mm)",
    y="Body Mass (g)",
    kind="scatter",
    color=species_colors,
)

In [None]:
# Turn the categorical Species column into one boolean indicator column per
# species (usually called one-hot or dummy encoding), on both the training
# frame and the test frame.
indicator_columns = {"SP1": "Adelie", "SP2": "Gentoo", "SP3": "Chinstrap"}
for frame in (penguins, penguins_test):
    for column, species in indicator_columns.items():
        frame[column] = frame.Species == species


In [None]:
# Count of True values in each indicator column, in SP1, SP2, SP3 order.
tuple(penguins[column].sum() for column in ("SP1", "SP2", "SP3"))

In [None]:
# Number of rows in the working frame.
penguins.shape[0]

In [None]:
# Summary statistics. By default describe() reports numeric columns only,
# so boolean and text columns are not listed in the output.
penguins.describe()

In [None]:
# Hey, where are my SP1, SP2, and SP3 indicator columns?  And Sex isn't a number yet...

In [None]:
# Flipper length against beak length, coloured by species, then written to
# PENGUIN.png.
flipper_vs_beak = penguins[["Flipper Length (mm)", "Beak Length (mm)"]]
flipper_vs_beak.plot(
    x="Flipper Length (mm)",
    y="Beak Length (mm)",
    kind="scatter",
    color=penguins.Species.map(colors),
)
plt.savefig("PENGUIN.png", dpi=300, bbox_inches="tight")

In [None]:
# Compare the shape of the two-feature matrix with the Gentoo indicator
# reshaped into a single column.
features_2d = penguins[["Beak Depth (mm)", "Body Mass (g)"]].values
gentoo_column = penguins.SP2.values[:, np.newaxis]
features_2d.shape, gentoo_column.shape

In [None]:
# Adding a trailing axis turns the flat indicator array into an (n, 1) column.
penguins["SP2"].values[:, np.newaxis].shape

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# reg = LinearRegression().fit(penguins[["Beak Depth (mm)", "Body Mass (g)"]].values, penguins.SP2.values)

In [None]:
# Ok, ok, I'll clean up my dataframe.. sorry, scipy..

In [None]:
# Keep only the rows where all four body measurements are finite numbers.
measurement_columns = [
    "Beak Depth (mm)",
    "Body Mass (g)",
    "Flipper Length (mm)",
    "Beak Length (mm)",
]
has_all_measurements = np.isfinite(penguins[measurement_columns]).all(axis=1)
penguinclean = penguins[has_all_measurements]

In [None]:
# Per-column flag: does any null value remain after the finite filter?
penguinclean.isna().any()

In [None]:
# Drop every row that still has a null in any column (not just the four
# measurement columns filtered above).
penguinclean = penguinclean.dropna()

In [None]:
# Re-check for nulls and report how many rows survived the cleaning.
penguinclean.isna().any(), penguinclean.shape[0]

In [None]:
# Apply the same cleaning to the training split: require finite values in the
# four measurement columns, then drop rows with a null in any column.
measurement_columns = [
    "Beak Depth (mm)",
    "Body Mass (g)",
    "Flipper Length (mm)",
    "Beak Length (mm)",
]
train_has_measurements = np.isfinite(penguins_train[measurement_columns]).all(axis=1)
penguins_train = penguins_train[train_has_measurements].dropna()

In [None]:
# Clean the test split the same way as the training split: require finite
# values in the four measurement columns, then drop rows with a null in any
# column.
penguins_test = penguins_test[np.isfinite( penguins_test["Beak Depth (mm)"]) & 
                              np.isfinite( penguins_test["Body Mass (g)"]) &
                              np.isfinite( penguins_test["Flipper Length (mm)"]) & 
                              np.isfinite( penguins_test["Beak Length (mm)"])]
# The result was previously assigned to a misspelled name (`penguis_test`),
# so penguins_test itself kept its null rows.
penguins_test = penguins_test.dropna()

In [None]:
# Confirm no nulls remain in the training split and report its row count
# (the count previously shown here was the test split's, by copy-paste).
penguins_train.isnull().any(), len(penguins_train)

In [None]:
# Null check and row count for the test split.
penguins_test.isna().any(), penguins_test.shape[0]

In [None]:
# np.where on the null mask returns (row positions, column positions);
# passing that pair to .iloc shows the rows and columns that contain nulls.
penguins_test.iloc[np.where(penguins_test.isnull())]

In [None]:
# Index labels of the rows that contain at least one null.
penguins_test.iloc[np.where(penguins_test.isnull())].index

In [None]:
# Preview only: drop() returns a new frame and the result is not assigned,
# so penguins_test is unchanged by this cell.
penguins_test.drop(penguins_test.iloc[np.where(penguins_test.isnull())].index, axis=0)

In [None]:
# Remove every test row that holds a null: locate the null cells by position,
# look up the labels of their rows, and drop those labels.
null_rows, null_cols = np.where(penguins_test.isnull())
rows_with_nulls = penguins_test.iloc[null_rows, null_cols].index
penguins_test = penguins_test.drop(rows_with_nulls, axis=0)

In [None]:
# Final sanity check on the test split: null flags per column and row count.
penguins_test.isna().any(), penguins_test.shape[0]

In [None]:
# Least-squares fit of the Gentoo indicator on two features only:
# beak depth and body mass.
gentoo_features = penguinclean[["Beak Depth (mm)", "Body Mass (g)"]].values
gentoo_target = penguinclean.SP2.values
reg = LinearRegression().fit(gentoo_features, gentoo_target)

In [None]:
# List the attributes of the fitted estimator to find where the coefficients live.
dir(reg)

In [None]:
# Fitted weights (one per feature, in the column order used for the fit)
# and the intercept.
reg.coef_, reg.intercept_

In [None]:
# Rebuild the regression's prediction by hand from its fitted parameters:
# features times weights, plus the intercept.
X = penguinclean[["Beak Depth (mm)", "Body Mass (g)"]].to_numpy()
Y = penguinclean.SP2
YHAT = X @ reg.coef_ + reg.intercept_

In [None]:
# (rows, features) of the matrix used for the hand-built predictor.
X.shape

In [None]:
# Two side-by-side panels: the Gentoo linear predictor against each of the
# two input features, coloured by true species.
fig, (ax_depth, ax_mass) = plt.subplots(1, 2, figsize=(12, 6))
point_colors = penguinclean.Species.map(colors)

ax_depth.scatter(X[:, 0], YHAT, color=point_colors)
ax_depth.set_xlabel("Beak Depth (mm)")
ax_depth.set_ylabel("Gentoo linear predictor")

ax_mass.scatter(X[:, 1], YHAT, color=point_colors)
# The label previously had a stray closing parenthesis: "Body Mass (g))".
ax_mass.set_xlabel("Body Mass (g)")
ax_mass.set_ylabel("Gentoo linear predictor")
# The legend goes on the right-hand panel, as before.
ax_mass.legend(handles=legend_elements)


In [None]:
# One run of the least-squares fit gave
#  (array([-0.13720996,  0.00033089]), 1.3226593522496826)
# i.e. weights b0, b1 and intercept C, so that
#   yhat = b0 * X[:,0] + b1 * X[:,1] + C
# Setting yhat = 0.5 and solving for the second feature gives the boundary
#   x1 = (0.5 - C) / b1 - (b0 / b1) * x0

x0div = np.arange(14, 24)
boundary_offset = (0.5 - reg.intercept_) / reg.coef_[1]
boundary_ratio = reg.coef_[0] / reg.coef_[1]
x1div = boundary_offset - boundary_ratio * x0div

In [None]:
# Cleaned data in feature space with the yhat = 0.5 boundary drawn on top.
clean_colors = penguinclean.Species.map(colors)
plt.scatter(X[:, 0], X[:, 1], color=clean_colors)
plt.legend(handles=legend_elements)

plt.plot(x0div, x1div)

In [None]:
# The same boundary over the uncleaned working frame.
beak_depth = penguins["Beak Depth (mm)"]
body_mass = penguins["Body Mass (g)"]
plt.scatter(beak_depth, body_mass, color=penguins.Species.map(colors))
plt.legend(handles=legend_elements)
plt.plot(x0div, x1div)

In [None]:
# "Shooting fish in a barrel."

# We took two features, performed linear regression, and 
# used the coefficients to build a linear classifier.

# Gentoo is easy to tell apart from the other two species of brush-tailed penguin.


In [None]:
# Other dimensions are informative about the difference between
# the red and the green birds:
beak_length = penguins["Beak Length (mm)"]
flipper_length = penguins["Flipper Length (mm)"]
plt.scatter(beak_length, flipper_length, color=penguins.Species.map(colors))
plt.legend(handles=legend_elements)
# I can glance at this graph and guess at the locations of the
# lines that would separate each of the three species from the
# rest, but I will need a little more theory to do three-way classification.

In [None]:
# A harder problem: regress the Adelie indicator on all four measurements,
# using the training split only.
adelie_features = penguins_train[
    ["Beak Depth (mm)", "Body Mass (g)", "Beak Length (mm)", "Flipper Length (mm)"]
].values
adelie_target = penguins_train.SP1.values
reg4_1 = LinearRegression().fit(adelie_features, adelie_target)

In [None]:
# Fitted weights (one per feature, in the column order passed to fit)
# and the intercept of the four-feature Adelie model.
reg4_1.coef_, reg4_1.intercept_

In [None]:
# Print the feature names first so the weights below can be read against them.
feature_names = ["Beak Depth (mm)", "Body Mass (g)", "Beak Length (mm)", "Flipper Length (mm)"]
print(feature_names)
reg4_1.coef_, reg4_1.intercept_

In [None]:
# Can I interpret these coefficients?  I can interpret their signs, of course..
# Three of the measurements are in mm, one is in g, and they have different
# relevant scales.


In [None]:
# Per-column standard deviations of the training split.
# numeric_only=True restricts the reduction to numeric and boolean columns;
# without it, pandas 2.0 and later raise on the text columns
# (e.g. Species, Sex) instead of silently skipping them.
penguins_train.std(numeric_only=True)

In [None]:
# Put the standard deviation of each column of X into an array.
# The values are computed from the training split rather than typed in by
# hand, because the split is random and hand-copied numbers go stale on the
# next run. The column order matches the order used to fit reg4_1, so each
# coefficient is multiplied by the spread of its own feature.
feature_columns = ["Beak Depth (mm)", "Body Mass (g)", "Beak Length (mm)", "Flipper Length (mm)"]
feature_std = penguins_train[feature_columns].std().to_numpy()
print(feature_columns)

# Coefficient times standard deviation puts the four weights on a common
# scale; this product
reg4_1.coef_ * feature_std


In [None]:
# tells us a little more about which fields had more weight.  
# Flipper length and Body mass aren't as weighted as two 
# beak measurements.

In [None]:
# Linear Adelie score for every training row: features times weights, plus
# the intercept.
train_features = penguins_train[
    ["Beak Depth (mm)", "Body Mass (g)", 'Beak Length (mm)', "Flipper Length (mm)"]
].to_numpy()
SP1PREDICT_train = train_features @ reg4_1.coef_ + reg4_1.intercept_

In [None]:
# Store the thresholded decision (score above 0.5) and the raw score on the
# training frame.
train_outputs = {
    "SP1PREDICT": SP1PREDICT_train > 0.5,
    "SP1LINEAR": SP1PREDICT_train,
}
for column, values in train_outputs.items():
    penguins_train[column] = values

In [None]:
# Linear Adelie score for every test row, using the weights fitted on the
# training split.
test_features = penguins_test[
    ["Beak Depth (mm)", "Body Mass (g)", 'Beak Length (mm)', "Flipper Length (mm)"]
].to_numpy()
SP1PREDICT_test = test_features @ reg4_1.coef_ + reg4_1.intercept_

In [None]:
# Store the thresholded decision (score above 0.5) and the raw score on the
# test frame.
test_outputs = {
    "SP1PREDICT": SP1PREDICT_test > 0.5,
    "SP1LINEAR": SP1PREDICT_test,
}
for column, values in test_outputs.items():
    penguins_test[column] = values

In [None]:
# Confusion counts on the training split: predicted label against true label.
train_groups = penguins_train.groupby(["SP1PREDICT", "SP1"])
train_groups["SP1PREDICT"].count()

In [None]:
# Number of training rows behind the counts above.
penguins_train.shape[0]

In [None]:
# On the training data, which is cheating,  
# Out of 256 penguins, the four-dimensional linear classifier got 252 right
# and 4 wrong on the "is-it-an-Adelie"

In [None]:
# Confusion counts on the held-out split: predicted label against true label.
test_groups = penguins_test.groupby(["SP1PREDICT", "SP1"])
test_groups["SP1PREDICT"].count()

In [None]:
# Peek at the first test rows, including the indicator and prediction columns
# added above.
penguins_test.head()

In [None]:
# A binary classifier's scores live on a single axis,
# so their distribution can be shown as a histogram.

training_scores = penguins_train["SP1LINEAR"]
plt.hist(training_scores, bins=50)

In [None]:
# And they are nicely bimodal.
# But this is emphatically NOT THE RIGHT MODEL.


In [None]:
# Split the working frame by recorded sex, then overlay the two groups:
# females in the pale palette, males in the saturated one.
penguinsM = penguins[penguins["Sex"] == "MALE"]
penguinsF = penguins[penguins["Sex"] == "FEMALE"]

for subset, palette in ((penguinsF, colors2), (penguinsM, colors)):
    plt.scatter(
        subset["Flipper Length (mm)"],
        subset["Beak Length (mm)"],
        color=subset.Species.map(palette),
    )
plt.xlabel("Flipper Length (mm)")
plt.ylabel("Beak Length (mm)")
# NOTE(review): this writes to the same PENGUIN.png as the earlier
# flipper/beak figure, replacing it. Confirm that is intended.
plt.savefig("PENGUIN.png", dpi=300, bbox_inches="tight")