# SLU16 - Data Sufficiency and Selection


In [None]:
import math
import pandas as pd
import numpy as np
import sklearn
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt
from hashlib import sha1 # just for grading purposes
import json # just for grading purposes

def _hash(obj):
    if type(obj) is not str:
        obj = json.dumps(obj)
    return sha1(obj.encode()).hexdigest()

%matplotlib inline

In [None]:
# load up a classification dataset

X = pd.read_csv('data/exercise_X.csv')
y = pd.read_csv('data/exercise_y.csv')['label']
# give X a quick look
X.head()

In [None]:
# looks like a balanced binary target
y.value_counts()

# Find the first obviously useless feature

Can you determine which of the features contains only unique values and therefore cannot have any predictive power?

In [None]:
# Use this cell to determine which of the features serves as a categorical
# feature and contains all uniques


In [None]:
# set the variable feature_all_unique to the name of the feature
# that contains all uniques
feature_all_unique = None

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
assert _hash(feature_all_unique) == '87ea5dfc8b8e384d848979496e706390b497e547'
### END TESTS

# Find the second obviously useless feature

This one doesn't contain all unique values, but with some Single Factor Analysis you should be able to determine which feature isn't worth
bothering with.

In [None]:
# use this cell to do some more SFA on other features to determine
# which of them is useless


In [None]:
# Use this cell to determine the other obviously useless feature
other_useless_feature = None

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
assert _hash(other_useless_feature) == '04c03b252faf210d252b1d80590911758427b048'
### END TESTS

In [None]:
# Now drop the two features that you determined to be useless and store the
# resulting DataFrame (not the dropped columns) in X_1

# X_1 = X.drop(...)

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
assert _hash(list(sorted(X_1.columns))) == '7c9a6ed68a038fdcf0722571cbc6a60ed958d19b'
### END TESTS

# Find the rest of the useless features

Single Factor Analysis isn't likely to do much in helping us to determine
which of the rest of the features are useless. We'll need to use some `feature_importances` in order to find the rest of these bad boys.

In [None]:
# Now let's import and train the classifier and get the feature importance using
# the X_1 DataFrame

# First import a tree based classifier

# from sklearn... import ...

# YOUR CODE HERE
raise NotImplementedError()

# Create your classifier, assign it to the clf variable and then
# train it on X_1 and y
clf = None

# once the classifier is trained, set the feature importances here
# make it a pandas series with the index being the column names
# so that we can visualize the results
feature_importances = None

# set the random_state=1 and max_depth=5 or the tests won't pass!
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
assert hasattr(clf, 'feature_importances_'), 'The classifier must be a tree based classifier'
assert clf.random_state == 1, 'random_state must be 1'
assert clf.max_depth == 5, 'max_depth must be 5'
assert np.isclose(feature_importances['feature_0'], 0.031820, atol=1e-5), 'feature 0 importance seems off'
assert np.isclose(feature_importances['feature_1'], 0.128733, atol=1e-5), 'feature 1 importance seems off'
assert np.isclose(feature_importances['feature_6'], 0.146977, atol=1e-5), 'feature 6 importance seems off'
### END TESTS

In [None]:
feature_importances.plot.barh();

In [None]:
# Now let's import and train a logistic regression on the X_1 DataFrame and
# use the magnitude of its coefficients as a second view of how much each
# feature matters

# First import a LogisticRegression

# from sklearn... import ...

# YOUR CODE HERE
raise NotImplementedError()

# Create your classifier, assign it to the clf variable and then
# train it on X_1 and y
clf = None

# once the classifier is trained, set the coefficients (clf.coef_) here
# NOTE: for a binary target, coef_ is 2-D with a single row, so take that row
# make it a pandas series with the index being the column names
# so that we can visualize the results
# BE SURE to take the absolute value of the coefs
abs_coefs = None

# set the solver='lbfgs' and random_state=1 or the tests won't pass!
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
assert isinstance(clf, LogisticRegression), 'The classifier must be a logistic regression'
assert clf.random_state == 1, 'random_state must be 1'
assert clf.solver == 'lbfgs', 'solver must be lbfgs'
assert np.isclose(abs_coefs['feature_0'], 0.031889, atol=1e-5), 'feature 0 coef seems off'
assert np.isclose(abs_coefs['feature_1'], 0.093313, atol=1e-5), 'feature 1 coef seems off'
assert np.isclose(abs_coefs['feature_6'], 0.405876, atol=1e-5), 'feature 6 coef seems off'
### END TESTS

In [None]:
abs_coefs.plot.barh();

In [None]:
# Now remove the 3 remaining useless features and store the resulting
# DataFrame (not the removed columns) in the variable X_2

# X_2 = X_1.drop(...)

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
assert _hash(list(sorted(X_2.columns))) == '0ba088ebdcf2b8598c95a2a89cf140b86ec1d6d5'
### END TESTS

# Correlations

Determine the correlations between each of the features and the target column.

In [None]:
X = pd.read_csv('data/exercise_X.csv')
y = pd.read_csv('data/exercise_y.csv')['label']

In [None]:
# In this cell, compute the absolute value of the correlations between each feature and the target and store it in
# a variable called abs_corrs

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
expected_features = {
    'feature_0', 
    'feature_1', 
    'feature_2', 
    'feature_3', 
    'feature_4',
    'feature_5',
    'feature_6',
    'feature_7',
}
assert set(abs_corrs.index) == expected_features, 'you should only have expected_features features'
assert np.isclose(abs_corrs['feature_0'], 0.027928, rtol=1e-04)
assert np.isclose(abs_corrs['feature_5'], 0.008327, rtol=1e-04)
assert np.isclose(abs_corrs['feature_7'], 0.285048, rtol=1e-04)
### END TESTS

# The learning curve

Okay now that we have gotten rid of all those useless features, let's focus on getting a sense for how much data we need in order to have
reasonable performance.

In [None]:
# Now compute the learning curve: the mean training score and the mean
# cross validation score at increasing training set sizes, in order to help
# us understand how increasing amounts of data affect the performance

# HINT: just use the snippet from the Learning Notebook
# NOTE: also define train_sizes in this cell; the plotting cell further down
# uses it as the index of the learning curve DataFrame
train_scores_mean = None
test_scores_mean = None

# instantiate a classifier that you will inspect the learning curve of
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=5, random_state=1)

# IMPORTANT: Be sure to train on X_2

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
# compare the summed mean scores with a 1% relative tolerance in order to
# compensate for implementation details

assert math.isclose(sum(train_scores_mean), 4.5, rel_tol=1e-2)
assert math.isclose(sum(test_scores_mean), 3.8, rel_tol=1e-2)
### END TESTS

In [None]:
# Put both mean-score curves into one DataFrame indexed by training set size
# so that they are drawn against the same x-axis.
learning_curve_df = pd.DataFrame(
    data={
        'Training Scores': train_scores_mean,
        'Test Set scores': test_scores_mean,
    },
    index=train_sizes,
)

# The trailing semicolon suppresses the textual repr of the returned Axes.
learning_curve_df.plot.line(title='Decision Tree Learning Curve');

In [None]:
# Now select the minimum training set size that this particular classifier
# seems to need before its learning curve stabilizes

min_train_set_size = None

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
assert _hash(min_train_set_size) == 'ba30fd97b4127db56e9f4d3d9c030d71646fd2e7'
### END TESTS