# Handle outliers

 The elevation, lsfactor, placurv, procurv and slope features have some outliers.
 The aspect feature has no outliers.
 
 Some of the techniques you can use to handle outliers include:
  1. Log transformations, scaling, Box-Cox transformations...
  2. Dropping the outliers
  3. Replacing the outliers with mean, median, mode or any other aggregates

## 1. log transformations (scrapped)

The important caveat here is that the original data has to follow or approximately follow a log-normal distribution. Otherwise, the log transformation won’t work.

## 2. Dropping outliers

In [78]:
from scipy import stats
import numpy as np
import pandas as pd
from dfcols import all_square_cols

# Load the training set, the test set and the sample submission file.
train = pd.read_csv('./data/Train.csv')
test = pd.read_csv('./data/Test.csv')
sample_submission = pd.read_csv('./data/SampleSubmission.csv')

# Collect the column names returned by all_square_cols for each attribute
# that is screened for outliers. (An earlier variant applied the z-score
# filter to every column of train; only these attribute groups are used now.)
outlierAttributes = []
for attribute_name in ("elevation", "slope", "placurv", "procurv", "lsfactor"):
    outlierAttributes += all_square_cols(attribute_name)

X = train[outlierAttributes]

# A row is kept only when |z-score| < 3 holds in every selected column.
abs_zscores = np.abs(stats.zscore(X))
rows_without_outliers = (abs_zscores < 3).all(axis=1)

# train_drop is train with every row holding an outlier in any of the
# selected columns removed.
train_drop = train[rows_without_outliers]

train_drop.describe()

Unnamed: 0,Sample_ID,1_elevation,2_elevation,3_elevation,4_elevation,5_elevation,6_elevation,7_elevation,8_elevation,9_elevation,...,17_sdoif,18_sdoif,19_sdoif,20_sdoif,21_sdoif,22_sdoif,23_sdoif,24_sdoif,25_sdoif,Label
count,8830.0,8830.0,8830.0,8830.0,8830.0,8830.0,8830.0,8830.0,8830.0,8830.0,...,8830.0,8830.0,8830.0,8830.0,8830.0,8830.0,8830.0,8830.0,8830.0,8830.0
mean,5413.274179,219.508041,219.422763,219.31359,219.192639,219.101359,219.551416,219.448698,219.318233,219.192865,...,1.300175,1.30016,1.300143,1.300125,1.300171,1.300154,1.300138,1.300121,1.300105,0.248131
std,3132.477924,146.308765,146.217524,146.132683,146.073739,146.034017,146.298372,146.197924,146.111576,146.050746,...,0.053517,0.053501,0.053485,0.053468,0.053562,0.053545,0.053529,0.053513,0.053497,0.431953
min,1.0,3.0,3.0,4.0,2.0,1.0,3.0,4.0,4.0,2.0,...,1.093693,1.0937,1.093705,1.093713,1.093611,1.093616,1.093623,1.093628,1.093636,0.0
25%,2679.5,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,...,1.285706,1.285683,1.285681,1.285664,1.285729,1.285698,1.285679,1.285664,1.285653,0.0
50%,5418.5,191.0,191.0,190.0,190.0,190.0,190.5,190.0,190.0,190.0,...,1.308018,1.307974,1.307932,1.307898,1.308057,1.307999,1.307968,1.307945,1.307915,0.0
75%,8118.75,305.0,305.0,305.0,305.0,305.0,305.0,304.75,305.0,305.0,...,1.331801,1.331772,1.331734,1.331697,1.331846,1.331828,1.331794,1.331754,1.331724,0.0
max,10864.0,696.0,695.0,694.0,693.0,695.0,693.0,692.0,690.0,691.0,...,1.370587,1.370595,1.3706,1.370607,1.370575,1.37058,1.370588,1.370593,1.370601,1.0


## 3. Replace outliers with mean

In [84]:
# Build train_replace: a deep copy of train in which, for every column listed
# in outlierAttributes, each value with |z-score| >= 3 is replaced by the mean
# of that column's values with |z-score| < 3. All other cells are unchanged.
train_replace = train.copy(deep=True)
for column in train:
    if column not in outlierAttributes:
        continue

    curcol = train[column]
    # Compute the z-scores once and derive both masks from them.
    abs_zscores = np.abs(stats.zscore(curcol))
    inlier_mean = curcol[abs_zscores < 3].mean()
    is_outlier = abs_zscores >= 3

    # Write the mean only into this column's outlier positions. A
    # DataFrame-wide fillna would also overwrite missing values that already
    # exist in other columns with this column's mean.
    train_replace[column] = curcol.where(~is_outlier, inlier_mean)

train_replace.describe()
        
    

107    48
49     46
141    44
68     43
83     43
       ..
841     1
652     1
819     1
849     1
660     1
Name: 1_elevation, Length: 761, dtype: int64
219.318977    114
107.000000     48
49.000000      46
141.000000     44
87.000000      43
             ... 
632.000000      1
678.000000      1
563.000000      1
572.000000      1
660.000000      1
Name: 1_elevation, Length: 685, dtype: int64


Unnamed: 0,Sample_ID,1_elevation,2_elevation,3_elevation,4_elevation,5_elevation,6_elevation,7_elevation,8_elevation,9_elevation,...,17_sdoif,18_sdoif,19_sdoif,20_sdoif,21_sdoif,22_sdoif,23_sdoif,24_sdoif,25_sdoif,Label
count,10864.0,10864.0,10864.0,10864.0,10864.0,10864.0,10864.0,10864.0,10864.0,10864.0,...,10864.0,10864.0,10864.0,10864.0,10864.0,10864.0,10864.0,10864.0,10864.0,10864.0
mean,5432.5,219.318977,219.182063,219.013212,218.794621,218.61774,219.397544,219.018613,219.00307,218.775172,...,1.301055,1.301038,1.30102,1.301,1.301055,1.301036,1.301018,1.301001,1.300983,0.25
std,3136.310996,148.342153,148.182254,148.028246,147.831363,147.651876,148.416231,147.889923,148.013407,147.808643,...,0.051938,0.051922,0.051905,0.051888,0.051981,0.051965,0.051949,0.051932,0.051916,0.433033
min,1.0,3.0,3.0,4.0,2.0,1.0,3.0,4.0,4.0,2.0,...,1.09241,1.092404,1.092393,1.092387,1.092354,1.092344,1.092337,1.092326,1.09232,0.0
25%,2716.75,101.75,101.0,101.0,101.0,101.0,102.0,101.0,101.0,101.0,...,1.286458,1.286418,1.286371,1.286364,1.28644,1.286447,1.286387,1.286365,1.286335,0.0
50%,5432.5,191.0,192.0,192.0,191.0,191.0,191.0,191.0,191.0,191.0,...,1.308175,1.308136,1.308103,1.308072,1.308206,1.308184,1.308142,1.308116,1.308085,0.0
75%,8148.25,305.0,305.0,305.0,305.0,305.0,305.0,304.0,305.0,304.25,...,1.331691,1.331653,1.331616,1.331562,1.33176,1.331731,1.331665,1.331633,1.331575,0.25
max,10864.0,700.0,699.0,699.0,699.0,699.0,700.0,698.0,699.0,698.0,...,1.37062,1.370627,1.370631,1.370637,1.370607,1.370612,1.370618,1.370623,1.370629,1.0
