In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
# Which versions are installed?
import sys
print("Python version")
print (sys.version)
print("\nPandas info")
print (pd.__version__)

Python version
3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]

Pandas info
0.25.3


# Unit Testing Basics
## The assert statement

In [3]:
# We use the assert statement to identify bugs in our programs
b = 3

assert b > 5, 'the variable b should have a value of at least 5 , but it is only %s' % b

AssertionError: the variable b should have a value of at least 5 , but it is only 3

In [4]:
# Here, we use the assert statement to validate that our data was successfully loaded

def get_data():
    df_1 = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", 
                   names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
                   'marital-status', 'occupation', 'relationship', 'race', 'sex', 
                   'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 
                   '>=50K'], skipinitialspace=True)
    df_2 = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", 
                   names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
                   'marital-status', 'occupation', 'relationship', 'race', 'sex', 
                   'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 
                   '>=50K'], skipinitialspace=True, skiprows=1)
    df_combined = df_1.append(df_2, ignore_index=True, sort=True)
    
    # Check this out -  
    assert df_combined.shape == (48842, 15), 'Expected data frame shape:%s; actual:%s' % ((48842, 16), df_combined.shape)

    X = df_combined.drop(columns=['>=50K'])
    y = df_combined['>=50K']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = get_data()

## Unit testing with pytest
[pytest](https://docs.pytest.org/en/latest/) is comprehensive Python testing tool that helps write better programs.

Here is Nana and Alice's presentation [Unit Testing for Data Science](https://my.metlife.com/:p:/r/sites/AnalyticsCommunity/_layouts/15/Doc.aspx?sourcedoc=%7B01975490-24A1-435D-A104-7E8EE5DC8E6B%7D&file=unit_test_framework_launch_v3.pptx&action=edit&mobileredirect=true).

In [5]:
class CharacterStripper(BaseEstimator, TransformerMixin):
    def __init__(self, character_to_strip='.'):
        self.character_to_strip = character_to_strip

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X
        X_transformed = X_transformed.str.strip(self.character_to_strip)
        return X_transformed       

In [6]:
def test_on_normal_input():
    data = pd.Series(['a.', 'b', 'c...', 'd'])
    char_stripper = CharacterStripper('.')
    expected = pd.Series(['a', 'b', 'c', 'd'])
    actual = char_stripper.fit_transform(data)
    pd.testing.assert_series_equal(expected, actual)
    
test_on_normal_input()

Before we can run the test through pytest, we need to:
1. Install it in our environment using Anaconda ([you can search online how to do that](https://www.google.com/search?q=anaconda+install+pytest&rlz=1C1GCEU_enUS837US837&oq=anaconda+install+pytest)).
2. Have your directories properly configured.

In [7]:
# '!' allows you to run terminal commends from Jupyter notebook
!pytest

platform win32 -- Python 3.7.4, pytest-5.3.5, py-1.8.1, pluggy-0.13.1
rootdir: C:\Users\tyifat\Workspace\python-for-dna\Season 2\lesson 12
collected 1 item

tests\test_custom_transformers.py .                                      [100%]



## Test-Driven Development (TDD)
TDD is a software development process in which tests are developed before the code that delivers the required functionality. A typical TDD cycle looks something like this:
1. Define requirements
2. Turn requirements into tests
3. Run all tests and see if the new test fails
4. Write/improve the code so that tests pass
5. Run tests
6. Refactor code (that is, cleanup, improve).
7. Repeat

In [27]:
class TestCountry2ContinentConverter():
    def test_on_normal_data_1(self):
        data = pd.DataFrame({'number':[1, 2, 3, 4, 5],
                            'country':['U.S.', 'China', 'India', 'U.S.', 'Mexico']})
        conversion_rules = {'U.S.':'America',
                           'Mexico':'America',
                           'China':'Asia',
                           'India':'Asia'}
        country_2_continent = Country2ContinentConverter(country_col='country', 
                                                     continent_col='continent', 
                                                     conversion_rules=conversion_rules)
        expected = pd.DataFrame({'number':[1, 2, 3, 4, 5],
                            'country':['U.S.', 'China', 'India', 'U.S.', 'Mexico'],
                             'continent':['America', 'Asia', 'Asia', 'America', 'America']})
        actual = country_2_continent.fit_transform(data)
        pd.testing.assert_frame_equal(expected, actual)

    def test_on_normal_data_2(self):
        pass
    
    def test_on_missing_value(self):
        data = pd.DataFrame({'number':[1, 2, 3, 4, 5],
                            'country':['U.S.', np.nan, 'India', 'U.S.', 'Mexico']})
        conversion_rules = {'U.S.':'America',
                           'Mexico':'America',
                           'China':'Asia',
                           'India':'Asia'}
        country_2_continent = Country2ContinentConverter(country_col='country', 
                                                     continent_col='continent', 
                                                     conversion_rules=conversion_rules)
        expected = pd.DataFrame({'number':[1, 2, 3, 4, 5],
                            'country':['U.S.', np.nan, 'India', 'U.S.', 'Mexico'],
                             'continent':['America', '', 'Asia', 'America', 'America']})
        actual = country_2_continent.fit_transform(data)
        pd.testing.assert_frame_equal(expected, actual)
    
    def test_on_unknown_value(self):
        data = pd.DataFrame({'number':[1, 2, 3, 4, 5],
                            'country':['U.S.', 'Argentina', 'India', 'U.S.', 'Mexico']})
        conversion_rules = {'U.S.':'America',
                           'Mexico':'America',
                           'China':'Asia',
                           'India':'Asia'}
        country_2_continent = Country2ContinentConverter(country_col='country', 
                                                     continent_col='continent', 
                                                     conversion_rules=conversion_rules)
        expected = pd.DataFrame({'number':[1, 2, 3, 4, 5],
                            'country':['U.S.', 'Argentina', 'India', 'U.S.', 'Mexico'],
                             'continent':['America', '', 'Asia', 'America', 'America']})
        actual = country_2_continent.fit_transform(data)
        pd.testing.assert_frame_equal(expected, actual)
    
    def test_on_missing_column(self):
        pass

In [21]:
# A class that changes category names according to specified rules
class Country2ContinentConverter(BaseEstimator, TransformerMixin):
    ''' This transformer revises categories according to a dictionary with rules '''
    def __init__(self, country_col='native-country', continent_col='continent', conversion_rules={}):
        self.country_col = country_col
        self.continent_col = continent_col
        self.conversion_rules = conversion_rules
        
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        X_transformed[self.continent_col] = ''
        for country in self.conversion_rules:
            X_transformed.loc[X[self.country_col]==country, self.continent_col] = self.conversion_rules[country]
        return X_transformed

In [22]:
test_country_2_continent = TestCountry2ContinentConverter()
test_country_2_continent.test_on_normal_data_1()

In [23]:
test_country_2_continent.test_on_missing_value()

In [24]:
test_country_2_continent.test_on_unknown_value()

In [32]:
test_country_2_continent.test_on_missing_column()

In [33]:
# We have to add the transformer and the test to our scripts before we can run them with pytest
!pytest

platform win32 -- Python 3.7.4, pytest-5.3.5, py-1.8.1, pluggy-0.13.1
rootdir: C:\Users\tyifat\Workspace\python-for-dna\Season 2\lesson 12
collected 1 item

tests\test_custom_transformers.py .                                      [100%]



# Completing our Data-Prep Pipeline
## More custom transformers

In [36]:
# A class that changes category names according to specified rules
class CategoryReviser(BaseEstimator, TransformerMixin):
    ''' This transformer revises categories according to a dictionary with rules '''
    def __init__(self, cat_change_rules={}):
        self.cat_change_rules = cat_change_rules

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        for feature in self.cat_change_rules:
            for cat in self.cat_change_rules[feature]:
                X_transformed.loc[X[feature]==cat, feature] = self.cat_change_rules[feature][cat]
        return X_transformed

In [37]:
cat_change_rules = {'marital-status':{'Married-AF-spouse':'Married', 'Married-civ-spouse':'Married'}, 
                    'workclass':{'Without-pay':'?', 'Never-worked':'?'},
                    'occupation':{'Armed-Forces':'Prof-specialty'}}

category_reviser = CategoryReviser(cat_change_rules=cat_change_rules)
X_train_prepared = category_reviser.fit_transform(X_train)
X_train_prepared['marital-status'].value_counts()

Married                  16901
Never-married            12010
Divorced                  4970
Separated                 1147
Widowed                   1132
Married-spouse-absent      471
Name: marital-status, dtype: int64

In [38]:
# The target encoder replaces a category with the positive rate for that category
class BasicTargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, features):
        self.features = features
        self.mapping = {}

    def fit(self, X, y=None):
        X_y = X.copy()
        X_y['target'] = y
        for feat in self.features:
            cat_positive_rates = {}
            for feat in self.features:
                self.mapping[feat] = {}
                positive_rates = X_y.groupby(feat)['target'].mean()
                for cat in X_y[feat].unique():
                    self.mapping[feat][cat] = positive_rates.loc[cat]
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        for feat in self.mapping:
            for cat in self.mapping[feat]:
                X_transformed.loc[X[feat]==cat, feat] = self.mapping[feat][cat]
        return X_transformed

In [46]:
X_train['native-country'].value_counts().tail(10)

Hong                          20
Thailand                      19
Scotland                      19
Yugoslavia                    18
Trinadad&Tobago               17
Outlying-US(Guam-USVI-etc)    17
Honduras                      16
Laos                          16
Hungary                       12
Holand-Netherlands             1
Name: native-country, dtype: int64

In [42]:
from sklearn.preprocessing import LabelEncoder
# from lesson11 import CharacterStripper
from sklearn.pipeline import Pipeline

# We cannot use LabelEncoder inside a pipeline
character_stripper = CharacterStripper(character_to_strip='.')
y_stripped = character_stripper.fit_transform(y_train)
label_encoder = LabelEncoder()
y_train_prepared = label_encoder.fit_transform(y_stripped)

target_encoder = BasicTargetEncoder(['native-country'])
X_prepared = target_encoder.fit_transform(X_train, y_train_prepared)
X_prepared['native-country'].value_counts().tail(10)

0.125000    32
0.103448    29
0.344828    29
0.461538    26
0.380952    21
0.250000    20
0.105263    19
0.210526    19
0.388889    18
0.000000     1
Name: native-country, dtype: int64

In [43]:
# Here, we regularize the target encodeing by taking the weighted average of the positive rate for the specific category
# and the positive rate for the entire dataset. The argument 'reg_weight' controls the weighting.
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, features, reg_weight=20):
        self.features = features
        self.reg_weight = reg_weight
        self.mapping = {}

    def fit(self, X, y=None):
        X_positive_rate = y.mean()
        X_y = X.copy()
        X_y['target'] = y
        for feat in self.features:
            cat_positive_rates = {}
            for feat in self.features:
                self.mapping[feat] = {}
                positive_rates = X_y.groupby(feat)['target'].mean()
                value_counts = X_y[feat].value_counts()
                for cat in X_y[feat].unique():
                    n = value_counts.loc[cat]
                    rate = positive_rates.loc[cat]
                    regularized_rate = (rate * n + X_positive_rate * self.reg_weight) / (n + self.reg_weight)
                    self.mapping[feat][cat] = regularized_rate
        return self
    
    def transform(self, X, y=None):
        X_transformed = X.copy()
        for feat in self.mapping:
            for cat in self.mapping[feat]:
                X_transformed.loc[X[feat]==cat, feat] = self.mapping[feat][cat]
        return X_transformed

In [45]:
target_encoder = TargetEncoder(['native-country'])
X_prepared = target_encoder.fit_transform(X_train, y_train_prepared)
X_prepared['native-country'].value_counts().tail(10)

0.365699    26
0.312735    21
0.245554    20
0.174927    19
0.226209    19
0.311109    18
0.184382    17
0.157355    17
0.275692    12
0.229626     1
Name: native-country, dtype: int64

## Modularizing the script as functions

In [47]:
def get_data():
    df_1 = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", 
                   names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
                   'marital-status', 'occupation', 'relationship', 'race', 'sex', 
                   'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 
                   '>=50K'], skipinitialspace=True)
    df_2 = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", 
                   names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
                   'marital-status', 'occupation', 'relationship', 'race', 'sex', 
                   'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 
                   '>=50K'], skipinitialspace=True, skiprows=1)
    df_combined = df_1.append(df_2, ignore_index=True, sort=True)
    X = df_combined.drop(columns=['>=50K'])
    y = df_combined['>=50K']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    return X_train, X_test, y_train, y_test

In [48]:
def preprocess_y(y):
    character_stripper = CharacterStripper(character_to_strip='.')
    y_stripped = character_stripper.fit_transform(y_train)
    label_encoder = LabelEncoder()
    y_train_prepared = label_encoder.fit_transform(y_stripped)
    return y_train_prepared

In [49]:
def build_X_pipeline():
    num_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',]
    cat_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'continent']

    cat_change_rules = {'marital-status':{'Married-AF-spouse':'Married', 'Married-civ-spouse':'Married'}, 
                        'workclass':{'Without-pay':'?', 'Never-worked':'?'},
                        'occupation':{'Armed-Forces':'Prof-specialty'}}
    country_2_contonent_rules = {'United-States':'N-America',
                         'Germany':'Europe',
                         'Mexico':'LatAm',
                         'Scotland':'Europe',
                         'Peru':'LatAm',
                         'Honduras':'LatAm',
                         'Ecuador':'LatAm',
                         'Poland':'Europe',
                         'China':'Asia',
                         'Nicaragua':'LatAm',
                         'India':'Asia',
                         'Philippines':'Asia',
                         'Iran':'Asia',
                         'Japan':'Asia',
                         'Vietnam':'Asia',
                         'Dominican-Republic':'LatAm',
                         'Ireland':'Europe',
                         'Laos':'Asia',
                         'Jamaica':'LatAm',
                         'England':'Europe',
                         'Hong':'Asia',
                         'Puerto-Rico':'LatAm',
                         'Cuba':'LatAm',
                         'Haiti':'LatAm',
                         'Guatemala':'LatAm',
                         'El-Salvador':'LatAm',
                         'Columbia':'LatAm',
                         'Italy':'Europe',
                         'Taiwan':'Asia',
                         'Canada':'N-America',
                         'Portugal':'Europe',
                         'Thailand':'Asia',
                         'Cambodia':'Asia',
                         'France':'Europe',
                         'Greece':'Europe',
                         'Trinadad&Tobago':'LatAm',
                         'Yugoslavia':'Europe',
                         'Hungary':'Europe',
                         'Holand-Netherlands':'Europe',
                        }

    # Create and parametatrize data transformers
    scaler = StandardScaler()
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
    category_reviser = CategoryReviser(cat_change_rules=cat_change_rules)
    country_2_continent = Country2ContinentConverter(country_col='native-country', 
                                                     continent_col='continent', 
                                                     conversion_rules=country_2_contonent_rules)
    target_encoder = TargetEncoder(['native-country'])

    column_transformer = ColumnTransformer([('Scaler', scaler, num_features), 
                                            ('One Hot Encoder', one_hot_encoder, cat_features)])

    # Define the data transformation pipeline
    X_pipeline = Pipeline([('Category Reviser', category_reviser), 
                           ('Country to Continent', country_2_continent), 
                           ('Target Encoder', target_encoder), 
                           ('ColumnTransformer', column_transformer)])
    return X_pipeline

## Putting it all together

In [52]:
X_train, X_test, y_train, y_test = get_data()
y_train_prepared = preprocess_y(y_train)
y_test_prepared = preprocess_y(y_test)
X_pipeline = build_X_pipeline()
X_train_prepared = X_pipeline.fit_transform(X_train, y_train_prepared)
X_test_prepared = X_pipeline.transform(X_test)

X_train_prepared.shape, y_train_prepared.shape, X_test_prepared.shape, y_test_prepared.shape

((36631, 52), (36631,), (12211, 52), (36631,))

# Homework
Pick one of the custom transformenrs and write a couple of tests for it.