# Data Prep

### Imports

In [45]:
import pandas as pd
import numpy as np
import acquire

import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.model_selection
import sklearn.impute
import sklearn.preprocessing

import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare

## Exercises

1. Iris Data

a. Use the function defined in acquire.py to load the iris data.

In [2]:
# Load the raw iris data through the project-local acquire module.
iris = acquire.get_iris_data()

In [3]:
# Preview the first rows to see which columns the loader returned.
iris.head()

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


In [4]:
# Hold out 30% of the rows as the test set; random_state pins the shuffle.
train, test = sklearn.model_selection.train_test_split(iris, test_size=.3, random_state=123)

b. Drop the species_id and measurement_id columns.

In [5]:
# Reassign instead of using inplace=True: train/test are frames handed back by
# train_test_split, and mutating them in place can raise pandas'
# SettingWithCopyWarning (hidden here by warnings.filterwarnings("ignore")).
id_columns = ["species_id", "measurement_id"]
train = train.drop(columns=id_columns)
test = test.drop(columns=id_columns)

c. Rename the species_name column to just species.

In [6]:
# Reassign rather than rename in place; train is a frame produced by
# train_test_split, so in-place mutation is avoided.
train = train.rename(columns={"species_name": "species"})

In [7]:
# Reassign rather than rename in place; test is a frame produced by
# train_test_split, so in-place mutation is avoided.
test = test.rename(columns={"species_name": "species"})

In [8]:
# Check that the id columns are gone and the species column has been renamed.
train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
114,5.8,2.8,5.1,2.4,virginica
136,6.3,3.4,5.6,2.4,virginica
53,5.5,2.3,4.0,1.3,versicolor
19,5.1,3.8,1.5,0.3,setosa
38,4.4,3.0,1.3,0.2,setosa


d. Encode the species name using a sklearn label encoder. Research the inverse_transform method of the label encoder. How might this be useful?

In [18]:
def encode_species(train, test):
    """One-hot encode the ``species`` column of a train/test pair.

    The encoder is fit on ``train`` only and then applied to both frames, so
    the indicator columns are determined by the categories seen in ``train``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain a ``species`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` where ``species`` has been replaced by one
        ``species_<category>`` column per category. New frames are returned;
        the inputs are left untouched.
    """
    encoder = sklearn.preprocessing.OneHotEncoder()
    encoder.fit(train[['species']])
    # nice columns for display
    cols = ['species_' + c for c in encoder.categories_[0]]

    def _with_indicators(df):
        # One code path for both frames keeps train and test in lock-step.
        # toarray() gives a plain ndarray; todense() would produce np.matrix,
        # which NumPy no longer recommends using.
        m = encoder.transform(df[['species']]).toarray()
        indicators = pd.DataFrame(m, columns=cols, index=df.index)
        return pd.concat([df, indicators], axis=1).drop(columns='species')

    return _with_indicators(train), _with_indicators(test)

In [20]:
# Replace the species column in both splits with its indicator columns.
train, test = encode_species(train, test)

In [21]:
# Inspect the encoded training frame.
train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_setosa,species_versicolor,species_virginica
114,5.8,2.8,5.1,2.4,0.0,0.0,1.0
136,6.3,3.4,5.6,2.4,0.0,0.0,1.0
53,5.5,2.3,4.0,1.3,0.0,1.0,0.0
19,5.1,3.8,1.5,0.3,1.0,0.0,0.0
38,4.4,3.0,1.3,0.2,1.0,0.0,0.0


e. Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [32]:
def drop_columns(df):
    """Return a new frame without the ``species_id`` and ``measurement_id`` columns."""
    id_columns = ["species_id", "measurement_id"]
    return df.drop(id_columns, axis="columns")

In [36]:
def rename_species_column_name(df):
    """Return a new frame whose ``species_name`` column is labelled ``species``."""
    new_labels = {"species_name": "species"}
    return df.rename(columns=new_labels)

In [40]:
def prep_iris(df):
    """Prepare the untransformed iris frame for modelling.

    Steps, in order: drop the id columns, rename ``species_name`` to
    ``species``, split 80/20 with ``random_state=123``, then one-hot encode
    ``species`` with an encoder fit on the training split.

    Returns the ``(train, test)`` pair.
    """
    tidy = rename_species_column_name(drop_columns(df))
    train_part, test_part = sklearn.model_selection.train_test_split(
        tidy, train_size=.8, random_state=123
    )
    train_encoded, test_encoded = encode_species(train_part, test_part)
    return train_encoded, test_encoded

In [46]:
# Reload the untransformed iris data so prep_iris starts from raw input.
df = acquire.get_iris_data()

In [42]:
# Run the notebook-defined pipeline end to end.
train, test = prep_iris(df)

In [48]:
# Same call through the prepare module.
# NOTE(review): presumably prepare.py holds a copy of prep_iris; confirm the two stay in sync.
train, test = prepare.prep_iris(df)

2. Titanic Data

a. Use the function you defined in acquire.py to load the titanic data set.

In [78]:
# Load the raw titanic data through the project-local acquire module.
df = acquire.get_titanic_data()

In [53]:
# (rows, columns) of the loaded frame.
df.shape

(891, 13)

In [55]:
# Look at three rows to see the available columns and where values are missing.
df.head(3)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1


b. Handle the missing values in the embark_town and embarked columns.

In [51]:
# Split before imputing so fill values are chosen from the training rows only.
train, test = sklearn.model_selection.train_test_split(df, train_size=.8, random_state=123)

In [56]:
# Count the embark_town values in the training split to find the most common one.
train.embark_town.value_counts()

Southampton    515
Cherbourg      128
Queenstown      67
Name: embark_town, dtype: int64

In [57]:
# Fill missing towns with the most frequent training value. assign() builds a
# new frame instead of writing into the frame returned by train_test_split,
# which can trigger pandas' SettingWithCopyWarning.
train = train.assign(embark_town=train["embark_town"].fillna('Southampton'))

In [58]:
# Apply the value chosen from the training split to the test split. assign()
# builds a new frame instead of writing into the frame returned by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
test = test.assign(embark_town=test["embark_town"].fillna('Southampton'))

In [66]:
def impute_embark_town(train, test, fill_value='Southampton'):
    """Fill missing ``embark_town`` values in both splits.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain an ``embark_town`` column.
    fill_value : str, default 'Southampton'
        Replacement for missing towns. The default keeps the previously
        hard-coded value.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` with no missing ``embark_town`` values. New frames
        are returned; the frames passed in are not modified.
    """
    # assign() returns a new frame, so the caller's objects (typically the
    # output of train_test_split) are never written to.
    train = train.assign(embark_town=train['embark_town'].fillna(fill_value))
    test = test.assign(embark_town=test['embark_town'].fillna(fill_value))
    return train, test

c. Remove the deck column.

In [59]:
def drop_deck_column(df):
    """Return a new frame with the ``deck`` column removed."""
    unwanted = ["deck"]
    return df.drop(unwanted, axis="columns")

d. Use a label encoder to transform the embarked column.

In [63]:
def encode_embark_town(train, test):
    """One-hot encode the ``embark_town`` column of a train/test pair.

    The encoder is fit on ``train`` only and then applied to both frames, so
    the indicator columns are determined by the categories seen in ``train``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain an ``embark_town`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` where ``embark_town`` has been replaced by one
        ``embark_town_<category>`` column per category. New frames are
        returned; the inputs are left untouched.
    """
    encoder = sklearn.preprocessing.OneHotEncoder()
    encoder.fit(train[['embark_town']])
    # nice columns for display
    cols = ['embark_town_' + c for c in encoder.categories_[0]]

    def _with_indicators(df):
        # One code path for both frames keeps train and test in lock-step.
        # toarray() gives a plain ndarray; todense() would produce np.matrix,
        # which NumPy no longer recommends using.
        m = encoder.transform(df[['embark_town']]).toarray()
        indicators = pd.DataFrame(m, columns=cols, index=df.index)
        return pd.concat([df, indicators], axis=1).drop(columns='embark_town')

    return _with_indicators(train), _with_indicators(test)

e. Scale the age and fare columns using a min max scaler. Why might this be beneficial? When might you not want to do this?

In [72]:
def scale_minmax_for_age_and_fare(train, test, column_list = ['age','fare']):
    """Append min-max scaled copies of the given columns to both splits.

    A single ``MinMaxScaler`` is fit on ``train[column_list]`` and reused for
    ``test``. For every name in ``column_list`` a ``<name>_scaled`` column is
    joined onto each frame; the original columns are kept.

    Returns the ``(train, test)`` pair.
    """
    minmax = sklearn.preprocessing.MinMaxScaler()
    scaled_names = [name + '_scaled' for name in column_list]

    def _attach(frame, values):
        # Reuse the frame's own index so the join lines rows up one-to-one.
        scaled = pd.DataFrame(values, columns=scaled_names, index=frame.index)
        return frame.join(scaled)

    # Fit on the training split only, then apply the learned range to test.
    train = _attach(train, minmax.fit_transform(train[column_list]))
    test = _attach(test, minmax.transform(test[column_list]))

    return train, test

f. Create a function named prep_titanic that accepts the untransformed titanic data, and returns the data with the transformations above applied.

In [73]:
def prep_titanic(df):
    """Prepare the untransformed titanic frame for modelling.

    Steps, in order: drop ``deck``, split 80/20 with ``random_state=123``,
    fill missing ``embark_town`` values, one-hot encode ``embark_town``, and
    append min-max scaled ``age`` and ``fare`` columns.

    Returns the ``(train, test)`` pair.
    """
    without_deck = drop_deck_column(df)
    train_part, test_part = sklearn.model_selection.train_test_split(
        without_deck, train_size=.8, random_state=123
    )
    # Both steps take and return a (train, test) pair, so run them in sequence.
    for step in (impute_embark_town, encode_embark_town):
        train_part, test_part = step(train_part, test_part)
    train_part, test_part = scale_minmax_for_age_and_fare(
        train_part, test_part, column_list = ['age','fare']
    )
    return train_part, test_part

In [74]:
# Reload the untransformed titanic data so prep_titanic starts from raw input.
df = acquire.get_titanic_data()

In [76]:
# Run the full titanic preparation pipeline.
train, test = prep_titanic(df)

In [77]:
# Confirm deck is gone and the indicator and *_scaled columns were added.
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,alone,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,age_scaled,fare_scaled
329,329,1,1,female,16.0,0,1,57.9792,C,First,0,1.0,0.0,0.0,0.195778,0.113168
749,749,0,3,male,31.0,0,0,7.75,Q,Third,1,0.0,1.0,0.0,0.384267,0.015127
203,203,0,3,male,45.5,0,0,7.225,C,Third,1,1.0,0.0,0.0,0.566474,0.014102
421,421,0,3,male,21.0,0,0,7.7333,Q,Third,1,0.0,1.0,0.0,0.258608,0.015094
97,97,1,1,male,23.0,0,1,63.3583,C,First,0,1.0,0.0,0.0,0.28374,0.123667


In [79]:
# Second reload of the raw titanic data; the three cells from here repeat the load/prepare/preview run above.
df = acquire.get_titanic_data()

In [80]:
# Re-run the pipeline on the reloaded frame.
train, test = prep_titanic(df)

In [81]:
# Preview the prepared training split from the repeated run.
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,alone,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,age_scaled,fare_scaled
329,329,1,1,female,16.0,0,1,57.9792,C,First,0,1.0,0.0,0.0,0.195778,0.113168
749,749,0,3,male,31.0,0,0,7.75,Q,Third,1,0.0,1.0,0.0,0.384267,0.015127
203,203,0,3,male,45.5,0,0,7.225,C,Third,1,1.0,0.0,0.0,0.566474,0.014102
421,421,0,3,male,21.0,0,0,7.7333,Q,Third,1,0.0,1.0,0.0,0.258608,0.015094
97,97,1,1,male,23.0,0,1,63.3583,C,First,0,1.0,0.0,0.0,0.28374,0.123667
