# BLU02 - Exercises Notebook

In [None]:
import hashlib # for grading

import os
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

## 1 Read the Programs data (graded)

In this first exercise, we aim to create a single dataframe, combining all programs from all seasons.

With a caveat though: **we only want to include seasons from 1900 onwards (1900 inclusive)**.

In [None]:
def make_programs():
    """Build a single dataframe with the programs of all seasons from 1900 onwards.

    Exercise stub: each ``raise NotImplementedError()`` below marks a step
    that is still to be written and must be replaced by the solution code.
    The finished function returns the ``programs`` dataframe.
    """
    files = os.listdir('data/programs/')
    # Create a list with the name of all files containing programs from
    # 1900 inclusive and onwards (just the filename, no complete path).
    # files_after_1900: List[str] = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # Create a list with one dataframe per file in files_after_1900
    # (read_season, defined below, reads a single file given its filename).
    # seasons: List[pd.DataFrame] = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # Use pd.concat to create a single dataframe.
    # programs: pd.DataFrame = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # Drop the column ProgramID.
    # programs = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # Set the index to be the column GUID, and sort the dataframe by the index
    # (use the DataFrame.sort_index() function).
    # Feel free to use method chaining if you want.
    # programs = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    return programs


def read_season(file):
    """Read a single season's programs file into a dataframe.

    ``file`` is a bare filename (no directory part); it is resolved against
    the ``data/programs`` directory, relative to the current working
    directory. Returns whatever ``pd.read_csv`` parses from that file.
    """
    return pd.read_csv(os.path.join('data', 'programs', file))


programs = make_programs()

In [None]:
# Grading check: the earliest season in the combined dataframe must be 1900-01.
assert programs['Season'].min() == '1900-01'

# Grading check: the SHA-256 digest of str(programs.shape) must match the
# expected digest.
shape = str(programs.shape)
expected_hash = '16278afb4c2032bcddc35b915f5439ef586333e2723c2ba6cfb9cc1b58eca0e1'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

Let's preview the `programs` dataframe.

In [None]:
programs.head()

## 2 Read the Concerts data (graded)

Read the concerts data.

Although we list all transformations step-by-step for the sake of clarity, we expect you to use method chaining.

In [None]:
def make_concerts(): 
    """Read the concerts data and parse its Date and Time columns.

    Exercise stub: each ``raise NotImplementedError()`` below marks a step
    that is still to be written and must be replaced by the solution code.
    The finished function returns the ``concerts`` dataframe.
    """
    # Read concerts data and drop the ProgramID and ConcertID columns.
    # concerts: pd.DataFrame = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # Remember to_datetime? We need it here to parse the columns Date and
    # Time. Use pd.to_datetime(...).dt.date for the Date and
    # pd.to_datetime(..., format='%I:%M%p').dt.time for the Time.
    # YOUR CODE HERE
    raise NotImplementedError()
    return concerts


# Build the concerts dataframe used by the following cells.
concerts = make_concerts()

In [None]:
# Grading check: the SHA-256 digest of str(concerts.shape) must match the
# expected digest.
shape = str(concerts.shape)
expected_hash = 'c030586e7370b1f2c34307d5de9b921d96efa28c933e44111b121ed819f339da'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

# Grading check on content: hash the string form of a sample drawn with a
# fixed random_state, so the same rows are compared on every run.
sample = str(concerts.sample(random_state=0))
expected_hash = '392a3db01753b02d85173c38cde95112fb5cdf06ca5a45d25f828238d56103be'
assert hashlib.sha256(sample.encode()).hexdigest() == expected_hash

In [None]:
concerts.head()

## 3 Combine Programs and Concerts data (graded)

Let's combine both dataframes into a single dataset, using an inner join.

In [None]:
# Combine programs and concerts with an inner join into a dataframe named nyp.
# Remember that you want to join on the index of one of the dataframes
# (make_programs sets GUID as the index of programs).
# nyp = ...
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading check: the SHA-256 digest of str(nyp.shape) must match the expected
# digest.
shape = str(nyp.shape)
expected_hash = 'a75738e37ac4ccf37a893a1009ba624efce9efaa7721d4319e9e078193fe8de6'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash 

# Grading check on content: hash the string form of a sample drawn with a
# fixed random_state, so the same rows are compared on every run.
sample = str(nyp.sample(random_state=0))
expected_hash = 'd47ed1ab14963bb6e594ebaf8d07fc89e78e83058dc78ced57a5bf5ca200efa7'
assert hashlib.sha256(sample.encode()).hexdigest() == expected_hash 

## 4 Read Works and Soloists data (graded)

We will read the two remaining pieces of data. 

Again, despite the step-by-step description, we encourage you to use method chaining.

In [None]:
def make_works():
    """Read the works data, remove the intervals and keep a subset of columns.

    Exercise stub: each ``raise NotImplementedError()`` below marks a step
    that is still to be written and must be replaced by the solution code.
    The finished function returns the ``works`` dataframe.
    """
    # Read the works data.
    # works: pd.DataFrame = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # Remove the Intervals (attention to the values in the Interval column).
    # works: pd.DataFrame = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # Select the columns GUID, ComposerName, WorkTitle, Movement and ConductorName.
    # works: pd.DataFrame = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    return works


def make_soloists():
    """Read the soloists data without its ProgramID, WorkID and MovementID columns.

    Exercise stub: the ``raise NotImplementedError()`` below must be replaced
    by the solution code. The finished function returns the ``soloists``
    dataframe.
    """
    # Read the soloists data and drop ProgramID, WorkID and MovementID.
    # soloists: pd.DataFrame = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    return soloists


# Build both dataframes used by the following cells.
works = make_works()
soloists = make_soloists()

In [None]:
# Grading check: the SHA-256 digest of str(works.shape) must match the
# expected digest.
shape = str(works.shape)
expected_hash = 'cad58aa6cd33cfa24c08a0f0f846877178ab31278f212c80b16b952d9416f883'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

# Same check for the soloists dataframe.
shape = str(soloists.shape)
expected_hash = 'a7b0d20a45ff1344e0398eebb162af9afb8805082b0dfdcb70e9a4b78f94dd13'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash 

## 5 Combine Works and Soloists (graded)

Like we did for Programs and Concerts, now we combine Works and Soloists.

In [None]:
# Combine the works and soloists dataframes, again using an inner type of join.
# works_and_soloists: pd.DataFrame = ...
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading check: the SHA-256 digest of str(works_and_soloists.shape) must
# match the expected digest.
shape = str(works_and_soloists.shape)
expected_hash = 'c0e73877aac4f3916267cb58f2f122ffef32c79039bde2ecb217fda123270d12'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

## 6 Combine everything (graded)

The final goal here is to create a single dataframe.

In [None]:
# Combine everything into a single dataframe named nyp_merged.
# NOTE(review): presumably this merges nyp with works_and_soloists -- the
# instructions do not spell out the inputs, confirm against the solution.
# nyp_merged = ...
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading check: the SHA-256 digest of str(nyp_merged.shape) must match the
# expected digest.
shape = str(nyp_merged.shape)
expected_hash = '3c25d9867a3c0134a6625087698dac6314f7c225f806e78dd259788bedcfb10b'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

## 7 Final transformations (graded)

Now, we perform the train-test split.

We also perform some final transformations on both datasets:
* Include some date features: Year, Month, Day and Weekday
* Drop Date, Season and GUID
* Change the column name Orchestra to OrchestraName, for consistency with other name columns
* Filter out composers that appear in fewer than 100 concerts (keep those with 100 or more).

In [None]:
def preprocess_data(df):
    """Apply the final transformations to ``df`` and return the result.

    Exercise stub: the ``raise NotImplementedError()`` below must be replaced
    by the solution code implementing the steps listed in the comments.
    """
    # You should follow these exact steps:
    #   1 - add_date_features, ideally using df.pipe
    #   2 - drop Date, Season and GUID
    #   3 - rename Orchestra to OrchestraName
    #   4 - filter out composers with less than 100 concerts (keep the ones with >= 100 rows)
    # YOUR CODE HERE
    raise NotImplementedError()
    return df

def add_date_features(df):
    """Add the date features (Year, Month, Day and Weekday) to ``df`` and return it.

    Exercise stub: the ``raise NotImplementedError()`` below must be replaced
    by the solution code.
    NOTE(review): the features are presumably derived from the Date column,
    which preprocess_data drops afterwards -- confirm against the solution.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    return df


# Preprocess the merged dataset, then split it into train and test sets with
# a fixed random_state so the split is reproducible.
nyp_ = preprocess_data(nyp_merged)
X_train, X_test = train_test_split(nyp_, random_state=0)

In [None]:
# Grading check: nyp_merged must still have the shape checked in exercise 6
# (same expected digest), i.e. preprocess_data must not have changed the
# shape of its input in place.
shape = str(nyp_merged.shape)
expected_hash = '3c25d9867a3c0134a6625087698dac6314f7c225f806e78dd259788bedcfb10b'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

# Grading check: shape of the preprocessed dataframe.
shape = str(nyp_.shape)
expected_hash = '31fa2b10222342d4743fa75b3a04c69945106f22fcf7473f5d1daeb84bca88b7'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

# Grading check: column names of the preprocessed dataframe, in order.
columns = str(nyp_.columns.values)
expected_hash = '7d131b98b4d7094443c094603c6db00aa20a79e49661acdefb33bf5fc1c071fa'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash 

And, finally, we would be ready to explore modeling.

For the next part, however, we will be using the famous [Boston House Prices Dataset](https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names).

## 8 Scaling features (graded)

About the Boston dataset:

> Each record in the database describes a Boston suburb or town. The data is from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970.

The features are all numerical (real, positive):
* **CRIM** - per capita crime rate by town
* **ZN** - proportion of residential land zoned for lots over 25,000 sq.ft.
* **INDUS** - proportion of non-retail business acres per town
* **CHAS** - Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
* **NOX** - nitric oxides concentration (parts per 10 million)
* **RM** - average number of rooms per dwelling
* **AGE** - proportion of owner-occupied units built prior to 1940
* **DIS** - weighted distances to five Boston employment centres
* **RAD** - index of accessibility to radial highways
* **TAX** - full-value property-tax rate per \$10,000
* **PTRATIO** - pupil-teacher ratio by town
* **B** - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
* **LSTAT** - % lower status of the population
* **MEDV** - Median value of owner-occupied homes in \$1000's.

We want to scale all features to the same range, using `sklearn.preprocessing.MinMaxScaler()`.

In [None]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell presumably needs an older scikit-learn -- confirm the
# version pinned for this notebook.
boston = load_boston()
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)
y = boston.target

# X_train and X_test are rebound here; from this point on they refer to the
# Boston split, not to the split made in exercise 7.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Initialize the MinMaxScaler to a [0, 5] range (feature_range=(0, 5)).
# YOUR CODE HERE
raise NotImplementedError()

# Fit on the training set and transform X_train. We expect X_train_
# to be a dataframe **just like** X_train (same columns), only scaled.
# X_train_: pd.DataFrame = ...
# YOUR CODE HERE
raise NotImplementedError()

# Transform the test set (do not fit again). X_test_ must also be a
# dataframe with the same columns as X_test.
# X_test_: pd.DataFrame = ...
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading checks for the scaled training set: shape, then column names.
shape = str(X_train_.shape)
expected_hash = '6f696c7e30c15aae3f0fa4807b596cf15d28cadaf33602d8d20368f7ac921f26'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(X_train_.columns.values)
expected_hash = 'c4e20218e7e33f0e771a608bb05ece0152f5a15fc6a0629b6c88cef7790fbfe1'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

# Grading checks for the scaled test set: shape, then column names (the
# expected column digest is the same as for the training set).
shape = str(X_test_.shape)
expected_hash = 'aa2b4e3c1e358b4b9f21c2c86bbf1187020582395419f1a02a949d7a6efac9e4'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(X_test_.columns.values)
expected_hash = 'c4e20218e7e33f0e771a608bb05ece0152f5a15fc6a0629b6c88cef7790fbfe1'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

## 9 Build a ColumnSelector transformer (graded)

There's a simple transformer that can be useful, from time to time, when modeling.

What we want is to build a transformer that returns the columns we select beforehand. 

This transformer could be used to determine what features go into modeling.

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns only the columns chosen at initialization.

    Exercise stub: each ``raise NotImplementedError()`` below marks a method
    that is still to be written and must be replaced by its implementation.
    """
    # Implement the __init__ method.
    # Our ColumnSelector must be able to receive a parameter columns.
    # The default value for columns must be set to 'all', so we can
    # initialize it without any explicit parameters.
    # YOUR CODE HERE
    raise NotImplementedError()
        
    # There's no need for a fit method in this case, it does nothing.
    # We should be able to call fit without any explicit parameters.
    # Meaning: we should be able to call ColumnSelector().fit().
    # The grading cell asserts the value returned by fit(), so it must
    # return something truthy (conventionally self).
    # YOUR CODE HERE
    raise NotImplementedError()

    # Transform should return all columns if the parameter columns we
    # passed upon initialization is equal to 'all'. If a column or a
    # list of columns are passed, only those should be returned.
    # YOUR CODE HERE
    raise NotImplementedError()
        

# NOTE(review): 'DIS' appears twice in this list, so it is selected twice.
# The expected hashes in the grading cell were presumably computed with this
# exact list -- do not "fix" it without regenerating them.
cols = ['CRIM', 'DIS', 'INDUS', 'RM', 'DIS', 'TAX', 'B']
selector = ColumnSelector(columns=cols)
X_train__ = selector.fit_transform(X_train_)
X_test__ = selector.transform(X_test_)

In [None]:
# Grading checks: ColumnSelector can be created without arguments, and fit()
# can be called without arguments and returns a truthy value.
assert(ColumnSelector())
assert(selector.fit())

# Grading checks for the selected training columns: shape, then column names.
shape = str(X_train__.shape)
expected_hash = '5d4f688e84beb21ec07f136c16a6cc11318d4f5de7b81bf0232e5282d9834123'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(X_train__.columns.values)
expected_hash = '901009bce1feeeccadd8cd499664598ff9319641e55dcda17a650c13c0626604'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

# Grading checks for the selected test columns: shape, then column names (the
# expected column digest is the same as for the training set).
shape = str(X_test__.shape)
expected_hash = '0aba1c19151f76aa2ecb00fd75be05c6f73860573972e967f3d1fe1c44ae2629'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(X_test__.columns.values)
expected_hash = '901009bce1feeeccadd8cd499664598ff9319641e55dcda17a650c13c0626604'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

## 10 Building the pipeline (graded)

Finally, we want to use the two transformers together and run a linear regression on top.

In [None]:
# Create a pipeline, named pipeline, including these steps in order:
#   1 - 'selector', ColumnSelector(columns=cols)
#   2 - 'min_max', MinMaxScaler() with same range as above
#   3 - 'model', LinearRegression
# pipeline = ...
# YOUR CODE HERE
raise NotImplementedError()


# Fit on the unscaled training data (the pipeline does the selection and
# scaling itself) and predict on the unscaled test data.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report the test-set errors.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('MSE: {}'.format(mse))
print('MAE: {}'.format(mae))

In [None]:
# Grading checks: the pipeline and each named step must be of exactly the
# expected type (type() == rejects subclasses), and the scaler must use the
# (0, 5) feature range.
assert type(pipeline) == Pipeline
assert type(pipeline.named_steps['selector']) == ColumnSelector
assert type(pipeline.named_steps['min_max']) == MinMaxScaler
assert pipeline.named_steps['min_max'].get_params()['feature_range'] == (0,5)
assert type(pipeline.named_steps['model']) == LinearRegression 

Exercises complete, congratulations! You are about to become a certified data wrangler.