In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets as datasets
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the two bundled scikit-learn example datasets used throughout the notebook.
# NOTE(review): load_boston is not available in newer scikit-learn releases;
# confirm the installed version still ships it before re-running.
bcancer = datasets.load_breast_cancer()
house = datasets.load_boston()

In [3]:
# Housing data: the target variable is price, which is continuous.
# A continuous Y calls for regression, which predicts the mean of Y
# given X (E[Y | X]); written as a formula: Y ~ B*X
Xh = pd.DataFrame(house.data, columns=house.feature_names)
Yh = house.target

# Breast cancer data: the target is categorical (has breast cancer or not).
# A categorical Y calls for classification, which predicts the probability
# of a category (P(Y == 1) given X).
# For logistic regression specifically: logit(P(Y == 1)) ~ B*X
Xb = pd.DataFrame(bcancer.data, columns=bcancer.feature_names)
Yb = bcancer.target

In [4]:
# Show the predictor names of each dataset.
# The parenthesised single-argument form of print behaves the same on
# Python 2 and Python 3, whereas the bare print statement fails on Python 3.
print(Xh.columns)
print(Xb.columns)

Index([u'CRIM', u'ZN', u'INDUS', u'CHAS', u'NOX', u'RM', u'AGE', u'DIS',
       u'RAD', u'TAX', u'PTRATIO', u'B', u'LSTAT'],
      dtype='object')
Index([u'mean radius', u'mean texture', u'mean perimeter', u'mean area',
       u'mean smoothness', u'mean compactness', u'mean concavity',
       u'mean concave points', u'mean symmetry', u'mean fractal dimension',
       u'radius error', u'texture error', u'perimeter error', u'area error',
       u'smoothness error', u'compactness error', u'concavity error',
       u'concave points error', u'symmetry error', u'fractal dimension error',
       u'worst radius', u'worst texture', u'worst perimeter', u'worst area',
       u'worst smoothness', u'worst compactness', u'worst concavity',
       u'worst concave points', u'worst symmetry', u'worst fractal dimension'],
      dtype='object')


In [5]:
# List the distinct values present in the breast cancer target array.
np.unique(Yb)

array([0, 1])

In [6]:
# Peek at the first ten housing target values.
Yh[:10]

array([ 24. ,  21.6,  34.7,  33.4,  36.2,  28.7,  22.9,  27.1,  16.5,  18.9])

# Linear Regression

In [None]:
# Check the predictor dtypes in case there are categorical variables among them.
Xh.dtypes

# If you have a lot of categorical variables, use patsy.

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
# The predictors live in a pandas DataFrame: keep the column names for
# later and pull out the underlying matrix to hand to the model.
Xh_columns = Xh.columns
Xh_mat = Xh.values

# Build the (unfitted) estimator from the LinearRegression "blueprint" ...
linear_reg = LinearRegression()

# ... then fit it on the predictor matrix and the target.
house_linreg = linear_reg.fit(Xh_mat, Yh)

In [10]:
# Predictions: run the fitted model on the same matrix it was trained on.
Yh_predictions = house_linreg.predict(Xh_mat)

# Compare the first ten true targets with the first ten predictions.
# print(...) with a single argument works on both Python 2 and Python 3.
print(Yh[0:10])
print(Yh_predictions[0:10])

[ 24.   21.6  34.7  33.4  36.2  28.7  22.9  27.1  16.5  18.9]
[ 30.00821269  25.0298606   30.5702317   28.60814055  27.94288232
  25.25940048  23.00433994  19.5347558   11.51696539  18.91981483]


In [12]:
# Is our model good? Let's check the R2 for the model.
# Since we are just using our original data to predict, put it in the score function.
#
# What is R2?
# It is the proportion of variance explained in our target variable
# COMPARED TO A MODEL THAT JUST USES THE MEAN OF Y (baseline model),
# i.e. how much better the model is than guessing the mean of Y for every row.
# NOTE(review): this is scored on the same data the model was fit on, so it
# says nothing about performance on unseen data.
house_linreg.score(Xh_mat, Yh)

0.74060774286494269

In [13]:
# What are the coefficients of the fitted model?
# print(...) with a single argument works on both Python 2 and Python 3.
print(house_linreg.coef_)

[ -1.07170557e-01   4.63952195e-02   2.08602395e-02   2.68856140e+00
  -1.77957587e+01   3.80475246e+00   7.51061703e-04  -1.47575880e+00
   3.05655038e-01  -1.23293463e-02  -9.53463555e-01   9.39251272e-03
  -5.25466633e-01]


In [15]:
# A nicer view of the coefficients: pair each one with the feature name
# we saved earlier in Xh_columns.
house_coefs = pd.DataFrame(dict(feature=Xh_columns, coef=house_linreg.coef_))

# Remember that for linear regression the formula is Y ~ b1*x1 + ... + bn*xn
# for our x1 through xn columns of predictors.
# To estimate Y for a row of predictor variables, we multiply each predictor
# value by its respective beta coefficient and add the products together.
house_coefs

Unnamed: 0,coef,feature
0,-0.107171,CRIM
1,0.046395,ZN
2,0.02086,INDUS
3,2.688561,CHAS
4,-17.795759,NOX
5,3.804752,RM
6,0.000751,AGE
7,-1.475759,DIS
8,0.305655,RAD
9,-0.012329,TAX


### LASSO

Why would we use the Lasso?

Lasso, depending on the **regularization strength `alpha`**, will eliminate variables (by shrinking their coefficients to exactly zero), starting with those that are least important for predicting Y.

In this case we probably don't need it for the data (it will likely make prediction worse if we remove variables), but this is just a demonstration.

In [16]:
from sklearn.linear_model import Lasso

In [19]:
# Lasso is constructed the same way as the linear regression, with an
# extra `alpha` argument that sets how strongly coefficient size is penalised.

# Model 1: "weak" regularization. The penalty on coefficient sizes is very
# small, so the coefficients can add up to something large without the model
# really caring (it should behave much like vanilla Linear Regression).
house_lasso_weak = Lasso(alpha=0.01)

# Model 2: strong regularization.
house_lasso_strong = Lasso(alpha=100.0)

In [21]:
# Fit both Lasso models (weak first, then strong) on the same housing data.
h_weak, h_strong = (
    lasso_model.fit(Xh_mat, Yh)
    for lasso_model in (house_lasso_weak, house_lasso_strong)
)

In [None]:
# Make tables of coefficients to see what Lasso did.
# The original line used `-` instead of `=` and passed a malformed set literal,
# so it raised a NameError instead of building a table.
weak_coefs = pd.DataFrame({'feature': Xh_columns, 'coef': h_weak.coef_})
strong_coefs = pd.DataFrame({'feature': Xh_columns, 'coef': h_strong.coef_})

# Side-by-side view: one row per feature, one coefficient column per model.
weak_coefs.merge(strong_coefs, on='feature', suffixes=('_weak', '_strong'))