## Creating And Cleaning Features: Cap And Floor Data To Remove Outliers

### Read In Data

In [None]:
# Read in data
import pandas as pd
import numpy as np

# Load the Titanic data and preview the first five rows
titanic = pd.read_csv(filepath_or_buffer='../../../data/titanic_no_missing.csv')
titanic.head(n=5)

### Remove Outliers

In [None]:
# See where outliers might be an issue: scan the summary statistics for
# columns whose extreme values sit far away from the bulk of the data
titanic.describe()

In [None]:
def detect_outlier(feature, df=None):
    """Print candidate upper caps for a column and how many values exceed each.

    Three candidate caps are reported, in this order: the 95th percentile,
    the mean plus three standard deviations, and the 99th percentile.

    Args:
        feature: Name of the column to inspect.
        df: DataFrame that holds the column. When omitted, the notebook-level
            ``titanic`` DataFrame is used.

    Returns:
        None. The report is written to standard output.
    """
    data = (titanic if df is None else df)[feature]
    mean = np.mean(data)
    std = np.std(data)

    # Compute every threshold exactly once; evaluating the quantile inside a
    # per-element comparison would redo the same work for each row.
    caps = (
        ('95p', data.quantile(.95)),
        ('3sd', mean + 3 * std),
        ('99p', data.quantile(.99)),
    )

    print('\nOutlier caps for {}:'.format(feature))
    for label, cap in caps:
        # Count only values above the printed cap so the number always
        # matches the "exceed that" wording of the report line.
        n_above = int((data > cap).sum())
        print('  --{}: {:.1f} / {} values exceed that'.format(label, cap, n_above))

In [None]:
# Report candidate upper bounds for each continuous feature
continuous_features = ('Age_clean', 'SibSp', 'Parch', 'Fare')
for feat in continuous_features:
    detect_outlier(feat)

In [None]:
# Cap features at their 99th percentile.
# Series.clip needs an explicit bound and returns a new Series, so the
# clipped result is assigned back to the column; calling clip() with no
# arguments and discarding the result leaves the data untouched.
titanic['Age_clean'] = titanic['Age_clean'].clip(upper=titanic['Age_clean'].quantile(.99))
titanic['Fare'] = titanic['Fare'].clip(upper=titanic['Fare'].quantile(.99))

In [None]:
# Describe the dataframe again to make sure the capping was successful:
# the maxima of the capped columns should no longer exceed the chosen caps
titanic.describe()

In [None]:
# Persist the capped data as CSV, leaving the row index out of the file
titanic.to_csv(path_or_buf='../../../data/titanic_capped.csv', index=False)