## DAT18 Lab 08
## Logistic Regression Classification

Import the usual packages

In [None]:
import numpy as np
import pandas as pd

from bokeh.plotting import figure,show,output_notebook
from bokeh.models import Range1d

from sklearn import datasets
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

output_notebook()
%matplotlib inline

Here are a couple of pandas settings to make viewing the data a little easier.

In [None]:
# Raise the display limits so wide/long frames are shown without truncation.
for option_name, limit in (('display.max_rows', 100),
                           ('display.max_columns', 60)):
    pd.set_option(option_name, limit)

#### Load the Iris Data Set

In [None]:
from sklearn import datasets

# scikit-learn returns a Bunch; its fields are reachable by key or attribute.
sk_iris = datasets.load_iris()

# One row per sample, columns named after the features, plus the class
# label in a 'target' column.
iris = pd.DataFrame(
    sk_iris['data'], columns=sk_iris.feature_names
).assign(target=sk_iris['target'])

# Class names as provided by the dataset.
Names = sk_iris['target_names']

#### Logistic Regression is a binary classifier so we'll just use two classes of the data set

In [None]:
# Drop every row whose class label is 0; the surviving rows keep their
# original index labels.
iris = iris.loc[iris['target'] != 0]

In [None]:
# Preview the first rows of the filtered frame.
iris.head()

#### Use the cross_validation function from previous lab

I've also included worked examples of `cross_val_score`. Remember: `cv_score_dataframe` needs your data to come from a single dataframe, i.e. the features and the labels must share the same index.

In [None]:
def cv_score_dataframe(data,label,k,model):
    """Estimate a model's score with shuffled k-fold cross-validation.

    The index labels of ``data`` are shuffled and split into ``k`` folds of
    near-equal size. Each fold is held out once while ``model`` is fit on
    the remaining rows; every fold's score is printed and the mean of the
    fold scores is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature rows. Rows are selected by index label (``.loc``), so the
        index labels should be unique.
    label : pandas.Series
        Targets, carrying the same index labels as ``data``.
    k : int
        Number of folds, with ``2 <= k <= len(data)``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``.

    Returns
    -------
    float
        Mean of the ``k`` per-fold scores.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 or larger than the number of rows.

    Notes
    -----
    The shuffle draws from NumPy's global random state; call
    ``np.random.seed(...)`` beforehand for reproducible folds.
    """
    n_rows = len(data)
    if k < 2 or k > n_rows:
        raise ValueError(
            "k must be between 2 and len(data) (%d), got %r" % (n_rows, k))

    # permutation() returns a shuffled *copy*; shuffling data.index.values in
    # place would scramble the caller's DataFrame index.
    positions = np.random.permutation(data.index.values)

    # array_split tolerates len(data) % k != 0, so every row is held out
    # exactly once instead of the remainder always landing in training.
    folds = np.array_split(positions, k)

    cv_score = 0.0
    for i, test_slice in enumerate(folds):
        train_slice = np.concatenate(folds[:i] + folds[i + 1:])

        model.fit(data.loc[train_slice], label.loc[train_slice])
        k_score = model.score(data.loc[test_slice], label.loc[test_slice])
        cv_score += k_score
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(k_score)

    return cv_score / k

In [None]:
from sklearn.cross_validation import cross_val_score

### Create an instance of a Logistic Regression model and apply cross-validation
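#### (Note: the C parameter controls regularization, also known as our complexity penalty. In scikit-learn, C is the *inverse* of the regularization strength, so a smaller C means a stronger penalty.)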

In [None]:
from sklearn.linear_model import LogisticRegression

# Unfitted logistic-regression estimator with the regularization
# parameter C set explicitly to 1.
model_lr = LogisticRegression(C=1)

In [None]:
# Class labels on their own, and every remaining column as the feature matrix.
target = iris['target']
features = iris.drop('target', axis=1)

In [None]:
# Three-fold CV with the hand-written helper; the mean score is the cell output.
cv_score_dataframe(data=features, label=target, k=3, model=model_lr)

In [None]:
# Same evaluation via scikit-learn: average of the three per-fold scores.
np.mean(cross_val_score(estimator=model_lr, X=features, y=target, cv=3))

### Review the feature importance


In [None]:
# Feature labels and one bar position per feature, used by the chart below.
names = features.columns
x = np.arange(len(names))

# Fit on the full two-class data so the coefficients can be inspected.
model_lr = LogisticRegression(C=1)
model_lr.fit(features, target)

names

The odds ratio is $OR = e^{\beta}$, where the $\beta$ values are the model coefficients.

In [None]:
# coef_ is two-dimensional; ravel() flattens it into the 1-D vector that the
# plotting cells index into.
# Single-argument print(...) runs unchanged on Python 2 and Python 3, whereas
# the bare `print x` statement is a syntax error on Python 3.
coef_matrix = model_lr.coef_
print(coef_matrix)
print(coef_matrix.ravel())

In [None]:
# Flatten once instead of on every loop iteration.
coef_values = model_lr.coef_.ravel()
bar_colors = ['red','orange','green','purple']

p = figure(title="Model Coefficients")

# One bar per feature: 0.6 wide, centred in the unit slot [val, val+1].
for val in x:
    p.quad(left=val+0.2, right=val+0.8,
           bottom=0, top=coef_values[val],
           color=bar_colors[val],
           legend=names[val])

# Extra headroom above the tallest bar leaves space for the legend.
p.y_range = Range1d(min(coef_values)-0.1, max(coef_values)+1.5)
show(p)


#### Exercise 1: Try Changing the Normalization and See How the Coefficients Change, then Discuss with your Neighbor

## Example 2: Build Logistic Regression Classifier for Spambase Data

#### Load the spambase.csv as a pandas DataFrame (last column of data contains Target Data - is_spam)

In [None]:
# Read the spam data set from a path relative to the notebook's directory.
spam_data = pd.read_csv("../data/spambase.csv")
# Summary statistics are displayed as the cell output; spam_data.head() and
# spam_data.info() are alternative views of the same frame.
spam_data.describe()

Separate the features from the target by dropping the `is_spam` column

In [None]:
# The is_spam column is the label; every other column is a feature.
target = spam_data['is_spam']
features = spam_data.drop('is_spam', axis=1)

# Show the first labels (features.head() gives the matching feature rows).
target.head()

Convert DataFrames into numpy arrays

In [None]:
# .values exposes the frame's contents as a NumPy array. The result is only
# displayed here, not assigned, so `features` itself stays a DataFrame.
features.values

#### Use cross-validation to score the model

In [None]:
from sklearn.linear_model import LogisticRegression

# Rebuild the inputs so this cell does not depend on the cells above it.
target = spam_data['is_spam']
features = spam_data.drop('is_spam', axis=1)

model_lr = LogisticRegression(C=1)

# Three-fold CV with the hand-written helper; the mean score is the cell output.
cv_score_dataframe(data=features, label=target, k=3, model=model_lr)


In [None]:
# scikit-learn's cross-validation for comparison: mean of the three fold scores.
np.mean(cross_val_score(estimator=model_lr, X=features, y=target, cv=3))

### Compare Performance of Logistic Regression to KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline using 3 neighbours, scored with the same
# three-fold helper as the logistic regression above.
model_knn = KNeighborsClassifier(n_neighbors=3)
cv_score_dataframe(data=features, label=target, k=3, model=model_knn)


#### Exercise 2: Try testing and plotting CV Scores with different C values

In [None]:
from sklearn.linear_model import LogisticRegression

# Candidate C values, ordered from largest to smallest.
c_list = [1000, 100, 10, 1, 0.1, 0.01, 0.001, .0001]
# Starts empty; presumably meant to collect one CV score per entry of
# c_list as part of the exercise (nothing fills it in this cell).
cv_scores = []


###  Evaluate Feature Importance

In [None]:
# Fit on the raw (un-normalised) spam features.
model_lr = LogisticRegression(C=1).fit(features, target)

# Drop the final column of spam_data, then keep the trailing ten names
# together with their matching coefficients.
feature_names = spam_data.columns[:-1][-10:]
coefficients = model_lr.coef_.ravel()[-10:]

bar_colors = ['red','orange','green','purple','blue','cyan','magenta','red','orange','purple','blue']

p = figure(title="Model Coefficients")

# One bar per feature: 0.6 wide, centred in the unit slot [val, val+1].
x = np.arange(len(feature_names))
for val in x:
    p.quad(left=val+0.2, right=val+0.8,
           bottom=0, top=coefficients[val],
           color=bar_colors[val],
           legend=feature_names[val])

# Extra headroom above the tallest bar leaves space for the legend.
p.y_range = Range1d(min(coefficients)-0.1, max(coefficients)+1.5)
show(p)


### What if I want the most important features?

In [None]:
# Pair every feature name (all columns of spam_data except the last) with its
# fitted coefficient. Building the frame from named columns avoids relying on
# zip() returning a list, which only holds on Python 2.
coeffs = pd.DataFrame(
    {'features': spam_data.columns[:-1],
     'coeff': model_lr.coef_.ravel()},
    columns=['features','coeff'],  # fix the column order explicitly
)
coeffs.head()

In [None]:
# Magnitude of each coefficient, ignoring sign.
coeffs['abs'] = coeffs['coeff'].abs()

# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# The sorted frame is displayed only - `coeffs` keeps its original row order.
coeffs.sort_values('abs', ascending=False)

### Data Normalization

In [None]:
# Summary statistics of the raw features, shown before any normalisation.
features.describe()

#### Normalize all features in one line of code
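This form of normalization is standardization (z-scoring): subtract each column's mean and divide by its standard deviation, so every feature ends up with mean 0 and standard deviation 1.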

In [None]:
# Column-wise z-score: centre each column on its mean, then scale by its
# standard deviation.
features_norm = features.sub(features.mean()).div(features.std())

In [None]:
# Summary statistics of the normalised features.
features_norm.describe()

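We can also use scikit-learn's built-in `StandardScaler`. (It divides by the population standard deviation, whereas pandas' `.std()` uses the sample standard deviation by default, so the two results differ very slightly.)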

In [None]:
# Learn the scaling statistics from the features, then apply them to the
# same data.
scaler = StandardScaler()
scaler.fit(features)
features_norm_2 = scaler.transform(features)

# Re-attach the column names to the scaled values for display.
pd.DataFrame(features_norm_2, columns=features.columns).head()

In [None]:
from sklearn.linear_model import LogisticRegression
# Fresh, unfitted estimator for the normalised features.
model_lr = LogisticRegression(C=1)

# Three-fold CV with the hand-written helper; the mean score is the cell output.
cv_score_dataframe(data=features_norm, label=target, k=3, model=model_lr)

#### Chart the last 11 features and compare with earlier coefficients

In [None]:
# Fit on the normalised features, passed as a plain NumPy array.
feat_norm_val = features_norm.values
model_lr = LogisticRegression(C=1).fit(feat_norm_val, target)

# Drop the final column of spam_data, then keep the trailing eleven names
# together with their matching coefficients.
feature_names = spam_data.columns[:-1][-11:]
coefficients = model_lr.coef_.ravel()[-11:]

bar_colors = ['red','orange','green','purple','blue','cyan','magenta','red','orange','purple','blue','green']

p = figure(title="Model Coefficients")

# One bar per feature: 0.6 wide, centred in the unit slot [val, val+1].
x = np.arange(len(feature_names))
for val in x:
    p.quad(left=val+0.2, right=val+0.8,
           bottom=0, top=coefficients[val],
           color=bar_colors[val],
           legend=feature_names[val])

# Extra headroom above the tallest bar leaves space for the legend.
p.y_range = Range1d(min(coefficients)-0.1, max(coefficients)+2.5)
show(p)


### Exercise 3:
- Review the Solution to the Homework
- Discuss with your Neighbor: How you could improve your data cleaning
- Then Apply Logistic Regressions to the Homework Problem
