### Outlier Handling


In [2]:
import pandas as pd

titanic_df = pd.read_csv("Titanic_Data.csv")

titanic_df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# e.g. 


from scipy import stats

data = np.array([50, 60, 70, 80, 90])

# Manual calculation
z_manual = (data - data.mean()) / data.std(ddof=0)

# Using scipy
z_scipy = stats.zscore(data)

z_manual, z_scipy


(array([-1.41421356, -0.70710678,  0.        ,  0.70710678,  1.41421356]),
 array([-1.41421356, -0.70710678,  0.        ,  0.70710678,  1.41421356]))

In [7]:
import numpy as np

data = np.array([10, 12, 11, 13, 100])
z_scores = stats.zscore(data)
outliers = data[abs(z_scores) > 3]

outliers

array([], dtype=int64)

In [8]:
Q1 = np.percentile(data, 25)
Q3 = np.percentile(data, 75)
IQR = Q3 - Q1

outliers = data[(data < Q1 - 1.5*IQR) | (data > Q3 + 1.5*IQR)]

In [9]:
outliers


array([100])

In [10]:
IQR

np.float64(2.0)

In [24]:

# Using scipy
z_scipy = stats.zscore(titanic_df["Age"].dropna())

z_scipy

array([-0.53037664,  0.57183099, -0.25482473,  0.36516706,  0.36516706,
        1.67403863, -1.90813618, -0.18593675, -1.08148046, -1.77036023,
        1.94959054, -0.6681526 ,  0.64071897, -1.08148046,  1.74292661,
       -1.90813618,  0.08961515,  0.36516706,  0.29627909, -1.01259248,
       -0.11704878, -1.49480832,  0.57183099, -0.73704057,  0.70960695,
        2.50069435, -0.11704878,  0.8473829 , -0.59926462, -0.80592855,
       -1.08148046,  0.70960695, -0.18593675, -1.83924821, -0.73704057,
       -0.80592855, -1.5636963 , -0.59926462,  1.32959874, -0.0481608 ,
        2.43180638, -0.59926462, -0.08260479, -1.70147225, -1.28814439,
       -0.53037664,  0.57183099,  1.05404683, -1.77036023, -0.0481608 ,
       -0.73704057, -0.87481653, -0.25482473,  0.15850313, -0.9437045 ,
       -0.59926462, -0.25482473,  0.15850313, -0.32371271, -1.98873512,
        0.02072718, -0.53037664, -0.0481608 , -0.11704878, -0.87481653,
        0.22739111, -0.9437045 , -0.46148866, -0.39260069, -0.04

In [27]:

# Fill missing values first
titanic_df['Age_filled'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())

# Compute z-score
titanic_df['Age_z'] = stats.zscore(titanic_df['Age_filled'])

# Filter outliers
outliers = titanic_df[titanic_df['Age_z'] > 2.6]
outliers


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_filled,Age_z
33,34,0,2,"Wheadon, Mr. Edward H",male,66.0,0,0,C.A. 24579,10.5,,S,66.0,2.793511
54,55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C,65.0,2.716556
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C,71.0,3.178283
116,117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q,70.5,3.139805
280,281,0,3,"Duane, Mr. Frank",male,65.0,0,0,336439,7.75,,Q,65.0,2.716556
438,439,0,1,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0,C23 C25 C27,S,64.0,2.639602
456,457,0,1,"Millet, Mr. Francis Davis",male,65.0,0,0,13509,26.55,E38,S,65.0,2.716556
493,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,71.0,3.178283
545,546,0,1,"Nicholson, Mr. Arthur Ernest",male,64.0,0,0,693,26.0,,S,64.0,2.639602
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S,80.0,3.870872


In [18]:
 #Manual calculation
z_manual = abs(titanic_df.Age - titanic_df.Age.mean()) / titanic_df.Age.std()

In [17]:
z_manual

0     -0.530377
1      0.571831
2     -0.254825
3      0.365167
4      0.365167
         ...   
886   -0.185937
887   -0.737041
888         NaN
889   -0.254825
890    0.158503
Name: Age, Length: 891, dtype: float64

In [23]:
df_filter_3 = [z_manual > 2.6]
df_filter_3

[0      False
 1      False
 2      False
 3      False
 4      False
        ...  
 886    False
 887    False
 888    False
 889    False
 890    False
 Name: Age, Length: 891, dtype: bool]

In [28]:
titanic_df['Age_zero'] = titanic_df['Age'].fillna(0)


In [32]:
titanic_df['Age_zero']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     0.0
889    26.0
890    32.0
Name: Age_zero, Length: 891, dtype: float64

In [29]:

titanic_df['Age_z'] = stats.zscore(titanic_df['Age_zero'])


In [33]:
titanic_df['Age_z']

0     -0.102255
1      0.807038
2      0.125068
3      0.636546
4      0.636546
         ...   
886    0.181899
887   -0.272748
888   -1.352534
889    0.125068
890    0.466053
Name: Age_z, Length: 891, dtype: float64

In [30]:
titanic_df['Age_z'] = (
    titanic_df['Age_zero'] - titanic_df['Age_zero'].mean()
) / titanic_df['Age_zero'].std()

In [31]:
outliers = titanic_df[titanic_df['Age_z'].abs() > 2.6]
outliers

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_filled,Age_z,Age_zero
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C,71.0,2.682457,71.0
116,117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q,70.5,2.654041,70.5
493,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,71.0,2.682457,71.0
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S,80.0,3.193934,80.0
672,673,0,2,"Mitchell, Mr. Henry Michael",male,70.0,0,0,C.A. 24580,10.5,,S,70.0,2.625626,70.0
745,746,0,1,"Crosby, Capt. Edward Gifford",male,70.0,1,1,WE/P 5735,71.0,B22,S,70.0,2.625626,70.0
851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.775,,S,74.0,2.852949,74.0


In [34]:
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_filled,Age_z,Age_zero
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,22.000000,-0.102255,22.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.000000,0.807038,38.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,26.000000,0.125068,26.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,35.000000,0.636546,35.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,35.000000,0.636546,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,27.000000,0.181899,27.0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,19.000000,-0.272748,19.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,29.699118,-1.352534,0.0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,26.000000,0.125068,26.0


In [36]:
data = pd.DataFrame({
    "id": [1,2,5,7,9],
    "name": ["John", "Jane", "Jennifer", "johan", "Jeff"],
    "age": [25, -32, 19, 42, 55], 
    "gender": ["M", "M", "Z", "F", "F"],
    "income": [400, 700, 300, 'Nan', 600]
})

In [37]:
data

Unnamed: 0,id,name,age,gender,income
0,1,John,25,M,400
1,2,Jane,-32,M,700
2,5,Jennifer,19,Z,300
3,7,johan,42,F,Nan
4,9,Jeff,55,F,600


In [38]:
data = data.astype({"age": "int32"})

In [39]:
incorrect_age = data[data.age<0]
incorrect_age

Unnamed: 0,id,name,age,gender,income
1,2,Jane,-32,M,700


In [40]:
data.age = abs(data.age)

In [41]:
data

Unnamed: 0,id,name,age,gender,income
0,1,John,25,M,400
1,2,Jane,32,M,700
2,5,Jennifer,19,Z,300
3,7,johan,42,F,Nan
4,9,Jeff,55,F,600


In [43]:
data[~data.gender.isin(["M", "F"])]

Unnamed: 0,id,name,age,gender,income
2,5,Jennifer,19,Z,300
