# BLU02 - Exercises Notebook

In [None]:
import hashlib # for grading

import os
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

## 1 Read the Programs data (graded)

In this first exercise, we aim to create a single dataframe, combining all programs from all seasons.

With a caveat though: **we only want to include seasons from 1900 (inclusive) onwards**.

In [None]:




def make_programs():
    """Combine the season files from 1900 onwards into one programs dataframe.

    Lists ``data/programs/``, keeps the ``.csv`` files whose name starts with
    a four-digit year greater than or equal to 1900, reads each of them with
    ``read_season`` and concatenates the results. The ``ProgramID`` column is
    dropped, ``GUID`` becomes the index and the rows are sorted by that index.

    Returns:
        pd.DataFrame: every selected season stacked together, indexed by GUID.
    """
    files = os.listdir('data/programs/')

    # Names of the season files from 1900 inclusive and onwards (just the
    # filename, no complete path). The year prefix is compared numerically
    # and the extension is checked as a real suffix, so unrelated entries
    # (e.g. 'notes.csv' or '1900-01.csv.bak') are not picked up by accident.
    files_after_1900 = [
        name for name in files
        if name.endswith('.csv')
        and name[:4].isdecimal()
        and int(name[:4]) >= 1900
    ]

    # One dataframe per selected season file.
    seasons = [read_season(name) for name in files_after_1900]

    # Stack the seasons, drop ProgramID, then index by GUID and sort by it.
    return (
        pd.concat(seasons, axis=0, ignore_index=True)
        .drop(columns='ProgramID')
        .set_index('GUID')
        .sort_index()
    )


def read_season(file):
    """Read one season file from ``data/programs`` into a dataframe.

    Args:
        file: name of the CSV file (no directory part).

    Returns:
        pd.DataFrame: the parsed contents of the file.
    """
    season = pd.read_csv(os.path.join('data', 'programs', file))
    return season


# Build the combined programs dataframe used by the following cells.
programs = make_programs()

In [None]:
# The smallest Season value present must be '1900-01'.
assert programs['Season'].min() == '1900-01'

# Check the shape indirectly: hash its string form with SHA-256 and compare
# the hex digest with the stored one.
shape = str(programs.shape)
expected_hash = '16278afb4c2032bcddc35b915f5439ef586333e2723c2ba6cfb9cc1b58eca0e1'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

Let's preview the `programs` dataframe.

In [None]:
# Show the first rows of the programs dataframe.
programs.head()

## 2 Read the Concerts data (graded)

Read the concerts data.

Although we list all transformations step-by-step for the sake of clarity, we expect you to use method chaining.

In [None]:
# Exploratory preview: load the concerts table and drop the ProgramID and
# ConcertID columns. pd.read_csv already returns a DataFrame, so it is not
# wrapped in pd.DataFrame again.
concerts = (
    pd.read_csv('./data/concerts.csv')
    .drop(columns=['ProgramID', 'ConcertID'])
)
concerts

In [None]:
def make_concerts():
    """Load the concerts table with parsed Date and Time columns.

    Reads ``./data/concerts.csv``, drops the ``ProgramID`` and ``ConcertID``
    columns, converts ``Date`` to ``datetime.date`` values and ``Time``
    (12-hour clock such as ``8:15PM``, format ``%I:%M%p``) to
    ``datetime.time`` values.

    Returns:
        pd.DataFrame: the concerts data, remaining columns in file order.
    """
    return (
        # pd.read_csv already returns a DataFrame; no re-wrapping is needed.
        pd.read_csv('./data/concerts.csv')
        .drop(columns=['ProgramID', 'ConcertID'])
        # Overwriting existing columns with assign keeps their position.
        .assign(
            Date=lambda frame: pd.to_datetime(frame['Date']).dt.date,
            Time=lambda frame: pd.to_datetime(
                frame['Time'], format='%I:%M%p'
            ).dt.time,
        )
    )


# Build the concerts dataframe used by the following cells.
concerts = make_concerts()

In [None]:
# Check the shape of concerts through the SHA-256 digest of its string form.
shape = str(concerts.shape)
expected_hash = 'c030586e7370b1f2c34307d5de9b921d96efa28c933e44111b121ed819f339da'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

# Same kind of check on the printed form of a sample drawn with random_state=0.
sample = str(concerts.sample(random_state=0))
expected_hash = '392a3db01753b02d85173c38cde95112fb5cdf06ca5a45d25f828238d56103be'
assert hashlib.sha256(sample.encode()).hexdigest() == expected_hash

In [None]:
# Show the first rows of the concerts dataframe.
concerts.head()

## 3 Combine Programs and Concerts data (graded)

Let's combine both dataframes into a single dataset, using an inner join.

In [None]:
# Inner join of concerts with programs: the GUID column of concerts is
# matched against the index of programs (make_programs sets that index to
# GUID), so only concerts with a matching program are kept.
nyp = concerts.join(programs, on='GUID', how='inner')

In [None]:
# Check the shape of nyp through the SHA-256 digest of its string form.
shape = str(nyp.shape)
expected_hash = 'a75738e37ac4ccf37a893a1009ba624efce9efaa7721d4319e9e078193fe8de6'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash 

# Same kind of check on the printed form of a sample drawn with random_state=0.
sample = str(nyp.sample(random_state=0))
expected_hash = 'd47ed1ab14963bb6e594ebaf8d07fc89e78e83058dc78ced57a5bf5ca200efa7'
assert hashlib.sha256(sample.encode()).hexdigest() == expected_hash 

## 4 Read Works and Soloists data (graded)

We will read the two remaining pieces of data. 

Again, despite the step-by-step description, we encourage you to use method chaining.

In [None]:
def make_works():
    """Load the works table without interval rows.

    Reads ``./data/works.csv``, keeps only the rows whose ``Interval`` value
    is missing (rows with a value there are the intervals to remove) and
    selects the columns GUID, ComposerName, WorkTitle, Movement and
    ConductorName.

    Returns:
        pd.DataFrame: the filtered works, keeping the original row labels.
    """
    columns = ['GUID', 'ComposerName', 'WorkTitle', 'Movement', 'ConductorName']

    return (
        # pd.read_csv already returns a DataFrame; no re-wrapping is needed.
        pd.read_csv('./data/works.csv')
        # Filter the rows and pick the columns in one step; Interval is not
        # among the selected columns, so it disappears without a drop.
        .loc[lambda frame: frame['Interval'].isnull(), columns]
    )


def make_soloists():
    """Load the soloists table without its ID columns.

    Reads ``./data/soloists.csv`` and drops the ``ProgramID``, ``WorkID``
    and ``MovementID`` columns.

    Returns:
        pd.DataFrame: the soloists data with the remaining columns.
    """
    # pd.read_csv already returns a DataFrame; no re-wrapping is needed.
    return (
        pd.read_csv('./data/soloists.csv')
        .drop(columns=['ProgramID', 'WorkID', 'MovementID'])
    )


# Build the works and soloists dataframes used by the following cells.
works = make_works()
soloists = make_soloists()

In [None]:
# Check the shape of works through the SHA-256 digest of its string form.
shape = str(works.shape)
expected_hash = 'cad58aa6cd33cfa24c08a0f0f846877178ab31278f212c80b16b952d9416f883'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

# Same check for the shape of soloists.
shape = str(soloists.shape)
expected_hash = 'a7b0d20a45ff1344e0398eebb162af9afb8805082b0dfdcb70e9a4b78f94dd13'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash 

## 5 Combine Works and Soloists (graded)

Like we did for Programs and Concerts, now we combine Works and Soloists.

In [None]:
# Inner join of works and soloists. No key is given, so pandas joins on the
# columns the two dataframes have in common.
works_and_soloists = works.merge(soloists, how='inner')

In [None]:
# Check the shape of works_and_soloists through the SHA-256 digest of its
# string form.
shape = str(works_and_soloists.shape)
expected_hash = 'c0e73877aac4f3916267cb58f2f122ffef32c79039bde2ecb217fda123270d12'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

## 6 Combine everything (graded)

The final goal here is to create a single dataframe.

In [20]:
# Combine everything into a single dataframe: default (inner) merge of nyp
# with works_and_soloists on the columns they have in common.
nyp_merged = nyp.merge(works_and_soloists)

In [21]:
# Check the shape of nyp_merged through the SHA-256 digest of its string form.
shape = str(nyp_merged.shape)
expected_hash = '3c25d9867a3c0134a6625087698dac6314f7c225f806e78dd259788bedcfb10b'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

## 7 Final transformations (graded)

Now, we perform the train-test split.

We also perform some final transformations on both datasets:
* Include some date features: Year, Month, Day and Weekday
* Drop Date, Season and GUID
* Change the column name Orchestra to OrchestraName, for consistency with other name columns
* Filter out composers that appear in fewer than 100 concerts (i.e. keep only composers with at least 100 rows).

In [39]:



def preprocess_data(df):
    """Apply the final transformations to the merged dataset.

    Works on a copy of ``df`` and performs, in this order:
      1. add the date features through ``add_date_features``;
      2. drop the Date, Season and GUID columns;
      3. rename Orchestra to OrchestraName;
      4. keep only the rows of composers that have at least 100 rows.

    Args:
        df: dataframe holding Date, Season, GUID, Orchestra and ComposerName.

    Returns:
        pd.DataFrame: the transformed copy; ``df`` itself is left untouched.
    """
    with_dates = df.copy().pipe(add_date_features)
    trimmed = with_dates.drop(columns=['Date', 'Season', 'GUID'])
    renamed = trimmed.rename(columns={'Orchestra': 'OrchestraName'})

    # groupby(...).filter keeps every row of the groups passing the test.
    by_composer = renamed.groupby('ComposerName')
    return by_composer.filter(lambda group: group.shape[0] >= 100)

def add_date_features(df):
    """Add Year, Month, Day and Weekday columns derived from the Date column.

    The new columns are written into ``df`` itself and the same object is
    returned, so the function can be used with ``DataFrame.pipe``. Pass a
    copy if the caller's dataframe must stay unchanged.

    Args:
        df: dataframe with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns:
        pd.DataFrame: ``df`` with the four extra columns (Weekday: Monday=0).
    """
    # Parse the Date column once instead of once per derived feature.
    dates = pd.to_datetime(df['Date']).dt
    df['Year'] = dates.year
    df['Month'] = dates.month
    df['Day'] = dates.day
    df['Weekday'] = dates.weekday
    return df





# Preprocess the merged data, then split it into train and test parts with a
# fixed random_state so the split is reproducible.
nyp_ = preprocess_data(nyp_merged)
X_train, X_test = train_test_split(nyp_, random_state=0)

In [40]:
# nyp_merged is checked again with the same digest as before preprocessing
# (preprocess_data works on a copy).
shape = str(nyp_merged.shape)
expected_hash = '3c25d9867a3c0134a6625087698dac6314f7c225f806e78dd259788bedcfb10b'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

# Check the shape of the preprocessed data through its SHA-256 digest.
shape = str(nyp_.shape)
expected_hash = '31fa2b10222342d4743fa75b3a04c69945106f22fcf7473f5d1daeb84bca88b7'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

# Check the column names (and their order) the same way.
columns = str(nyp_.columns.values)
expected_hash = '7d131b98b4d7094443c094603c6db00aa20a79e49661acdefb33bf5fc1c071fa'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash 

And, finally, we would be ready to explore modeling.

For the next part, however, we will be using the famous [Boston House Prices Dataset](https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names).

## 8 Scaling features (graded)

About the Boston dataset:

> Each record in the database describes a Boston suburb or town. The data is from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970.

The features are all numerical (real, positive):
* **CRIM** - per capita crime rate by town
* **ZN** - proportion of residential land zoned for lots over 25,000 sq.ft.
* **INDUS** - proportion of non-retail business acres per town
* **CHAS** - Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
* **NOX** - nitric oxides concentration (parts per 10 million)
* **RM** - average number of rooms per dwelling
* **AGE** - proportion of owner-occupied units built prior to 1940
* **DIS** - weighted distances to five Boston employment centres
* **RAD** - index of accessibility to radial highways
* **TAX** - full-value property-tax rate per \$10,000
* **PTRATIO** - pupil-teacher ratio by town
* **B** - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
* **LSTAT** - % lower status of the population
* **MEDV** - Median value of owner-occupied homes in \$1000's.

We want to scale all features to the same range, using `sklearn.preprocessing.MinMaxScaler()`.

In [41]:
# Display the scaled training features (X_train_ is assigned in the scaling
# cell that follows).
X_train_

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.010397,1.100,0.989736,0.0,0.473251,1.958230,3.465499,3.095560,1.304348,1.364504,3.457447,4.902037,2.373050
1,0.005449,1.250,0.855938,0.0,0.699588,2.266718,2.281153,2.628580,1.521739,0.925573,3.776596,5.000000,1.062411
2,0.005528,2.000,1.090543,5.0,0.637860,3.550489,2.373841,1.649425,0.652174,0.639313,2.659574,4.903550,0.612766
3,0.489516,0.000,3.233138,0.0,3.168724,2.787890,4.938208,0.251774,5.000000,4.570611,4.042553,4.937970,2.182979
4,0.608514,0.000,3.233138,0.0,3.024691,3.085840,4.526262,0.294596,5.000000,4.570611,4.042553,0.267916,3.412766
5,0.005587,0.000,5.000000,0.0,2.304527,2.320368,4.938208,0.316783,0.652174,5.000000,3.989362,4.914393,2.317730
6,0.067500,0.000,3.504399,0.0,2.263374,2.216900,4.721936,0.571432,0.869565,2.061069,1.117021,3.681098,1.801418
7,0.663482,0.000,3.233138,0.0,3.425926,3.126078,3.789907,0.282954,5.000000,4.570611,4.042553,0.606813,2.980142
8,0.000382,4.500,0.139296,0.0,0.185185,3.533244,0.978373,3.434165,0.869565,0.372137,2.819149,4.987770,0.436879
9,0.018216,0.000,1.052053,0.0,1.255144,2.419046,3.017508,1.131132,1.521739,1.145038,2.553191,4.745953,1.297872


In [71]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn — confirm the pinned version.
boston = load_boston()
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)
y = boston.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Initialize the MinMaxScaler to a [0, 5] range.
scaler = MinMaxScaler(feature_range=(0, 5))

# Fit on the training set and transform X_train. X_train_ is a dataframe
# just like X_train (same index and columns), only scaled.
X_train_ = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

# Transform the test set with the scaler fitted on the training set. The
# scaler must NOT be refitted on X_test: that would scale the test data with
# its own min/max instead of the statistics learned from the training data.
X_test_ = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)


In [72]:
shape = str(X_train_.shape)
expected_hash = '6f696c7e30c15aae3f0fa4807b596cf15d28cadaf33602d8d20368f7ac921f26'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(X_train_.columns.values)
expected_hash = 'c4e20218e7e33f0e771a608bb05ece0152f5a15fc6a0629b6c88cef7790fbfe1'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

shape = str(X_test_.shape)
expected_hash = 'aa2b4e3c1e358b4b9f21c2c86bbf1187020582395419f1a02a949d7a6efac9e4'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(X_test_.columns.values)
expected_hash = 'c4e20218e7e33f0e771a608bb05ece0152f5a15fc6a0629b6c88cef7790fbfe1'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

## 9 Build a ColumnSelector transformer (graded)

There's a simple transformer that can be useful, from time to time, when modeling.

What we want is to build a transformer that returns the columns we select beforehand. 

This transformer could be used to determine what features go into modeling.

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns only the columns selected beforehand.

    Args:
        columns: ``'all'`` (the default) to return every column, a single
            column name, or a list of column names to keep.
    """

    def __init__(self, columns='all'):
        # Store the argument untouched, under the same name as the
        # parameter, as scikit-learn estimators are expected to do.
        self.columns = columns

    def fit(self, X=None, y=None):
        """Do nothing and return the transformer itself.

        Both arguments are optional so that ``fit()`` can be called without
        any explicit parameters.
        """
        return self

    def transform(self, X):
        """Return the selected columns of the dataframe ``X``.

        With ``columns='all'`` the input is returned as is. A single column
        name is returned as a one-column dataframe; a list of names is
        returned in the given order (repeated names are repeated).
        """
        if isinstance(self.columns, str):
            if self.columns == 'all':
                return X
            # Wrap a single name so the result stays two-dimensional.
            return X[[self.columns]]
        return X[list(self.columns)]
        

# Columns to keep. NOTE(review): 'DIS' appears twice in this list — confirm
# that the repetition is intended.
cols = ['CRIM', 'DIS', 'INDUS', 'RM', 'DIS', 'TAX', 'B']
selector = ColumnSelector(columns=cols)
# Select the columns from the scaled train and test dataframes.
X_train__ = selector.fit_transform(X_train_)
X_test__ = selector.transform(X_test_)

In [None]:
# ColumnSelector must be constructible without arguments, and fit() must be
# callable without arguments and return something truthy.
assert(ColumnSelector())
assert(selector.fit())

# Check the shape of the selected training data through its SHA-256 digest.
shape = str(X_train__.shape)
expected_hash = '5d4f688e84beb21ec07f136c16a6cc11318d4f5de7b81bf0232e5282d9834123'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

# Check the column names (and their order) the same way.
columns = str(X_train__.columns.values)
expected_hash = '901009bce1feeeccadd8cd499664598ff9319641e55dcda17a650c13c0626604'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

# Same two checks for the selected test data.
shape = str(X_test__.shape)
expected_hash = '0aba1c19151f76aa2ecb00fd75be05c6f73860573972e967f3d1fe1c44ae2629'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(X_test__.columns.values)
expected_hash = '901009bce1feeeccadd8cd499664598ff9319641e55dcda17a650c13c0626604'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

## 10 Building the pipeline (graded)

Finally, we want to use the two transformers together and run a linear regression on top.

In [None]:
# Create a pipeline including:
#   1 - 'selector', ColumnSelector(columns=cols)
#   2 - 'min_max', MinMaxScaler() with the same [0, 5] range as above
#   3 - 'model', LinearRegression
pipeline = Pipeline(steps=[
    ('selector', ColumnSelector(columns=cols)),
    ('min_max', MinMaxScaler(feature_range=(0, 5))),
    ('model', LinearRegression()),
])


# Fit on the unscaled training data: selection and scaling happen inside the
# pipeline, and the scaler is fitted on the training data only.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report the errors on the held-out test set.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('MSE: {}'.format(mse))
print('MAE: {}'.format(mae))

In [None]:
# The pipeline must be a Pipeline whose named steps 'selector', 'min_max' and
# 'model' have exactly these types, with the scaler set to the (0, 5) range.
assert type(pipeline) == Pipeline
assert type(pipeline.named_steps['selector']) == ColumnSelector
assert type(pipeline.named_steps['min_max']) == MinMaxScaler
assert pipeline.named_steps['min_max'].get_params()['feature_range'] == (0,5)
assert type(pipeline.named_steps['model']) == LinearRegression 

Exercises complete, congratulations! You are about to become a certified data wrangler.