# Detecting and Filtering Outliers

In [5]:
import numpy as np
import pandas as pd
data = pd.DataFrame(np.random.randn(100, 4))
data

Unnamed: 0,0,1,2,3
0,-0.639026,0.793278,-0.096410,-0.519512
1,-0.257352,-1.588435,-0.282570,0.865976
2,-0.308136,-1.517038,1.058845,-0.393698
3,-0.243102,-1.962621,0.892062,0.936161
4,-0.031283,0.149545,-0.612334,0.130423
...,...,...,...,...
95,0.247958,0.551461,0.330506,-0.333861
96,-1.106516,0.022160,0.324838,0.949319
97,-0.583687,0.517993,-0.990763,1.513584
98,0.908798,-1.630399,-1.164205,0.935398


In [4]:
data.describe()

Unnamed: 0,0,1,2,3
count,100.0,100.0,100.0,100.0
mean,-0.091947,0.175485,-0.028241,-0.014724
std,1.024858,1.062999,0.9997,1.018027
min,-2.244627,-1.854069,-2.640328,-2.437395
25%,-0.684475,-0.468432,-0.687271,-0.589965
50%,-0.088924,0.014056,-0.170136,-0.173384
75%,0.56533,0.91963,0.595902,0.554028
max,2.991758,3.687194,2.835071,2.449715


In [6]:
col = data[2]
col[np.abs(col) > 3]

Series([], Name: 2, dtype: float64)

In [7]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3


In [8]:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,100.0,100.0,100.0,100.0
mean,-0.007158,-0.252635,-0.048591,0.160344
std,0.960202,0.974843,1.045668,0.987494
min,-2.059287,-2.600146,-2.367857,-2.740989
25%,-0.598592,-0.907365,-0.798034,-0.428215
50%,-0.054856,-0.31442,-0.02656,0.187622
75%,0.511019,0.402654,0.625264,0.81977
max,2.746979,2.399716,2.356615,2.980439


In [9]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,-1.0
1,-1.0,-1.0,-1.0,1.0
2,-1.0,-1.0,1.0,-1.0
3,-1.0,-1.0,1.0,1.0
4,-1.0,1.0,-1.0,1.0


# Permutation and Random Sampling

In [61]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df


Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


Unnamed: 0,0,1,2,3
2,8,9,10,11
4,16,17,18,19
1,4,5,6,7
3,12,13,14,15
0,0,1,2,3


In [68]:
sampler = np.random.permutation(5)
sampler

array([1, 3, 4, 0, 2])

In [65]:
df

Unnamed: 0,0,1,2,3
2,8,9,10,11
4,16,17,18,19
1,4,5,6,7
3,12,13,14,15
0,0,1,2,3


In [92]:
df.take(sampler)
df.take(np.random.permutation(5))   #take every time diffent order
df.take([1, 3, 4, 0, 2])


Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3
2,8,9,10,11


In [126]:
df.sample(n=4)

Unnamed: 0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
1,4,5,6,7
0,0,1,2,3


In [131]:
choices = pd.Series([5, 7, -1, 6, 4])
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [132]:

draws = choices.sample(n=10, replace=True)
draws

3    6
2   -1
4    4
1    7
2   -1
1    7
0    5
0    5
0    5
4    4
dtype: int64

# Computing Indicator/Dummy Variables

In [139]:
df=pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [146]:
pd.get_dummies(df['key'])


Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [149]:
dummies = pd.get_dummies(df['key'], prefix='key',prefix_sep='-')
dummies

Unnamed: 0,key-a,key-b,key-c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [152]:
df_with_dummy = df[['data1']].join(dummies)

#df_with_dummy = dummies.join(df[['data1']])

df_with_dummy

Unnamed: 0,data1,key-a,key-b,key-c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


# String Manipulation

In [165]:
val = 'a,b, guido'
val.split(',')


['a', 'b', ' guido']

In [158]:
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [161]:
first, second, third = pieces
first + '::' + second + '::' + third


'a::b::guido'

In [163]:
print(pieces)
'::'.join(pieces)

['a', 'b', 'guido']


'a::b::guido'

In [210]:
print('guido' in val)
print(val.index(','))
print(val.find(':'))
val.count(',')
val.replace(',',':')

True
1
-1


'a:b: guido'

# Regular Expressions

In [188]:
import re
text = "foo bar\t baz \tqux"
print(re.split('\+',text))
re.split('\s+',text)

['foo bar\t baz \tqux']


['foo', 'bar', 'baz', 'qux']

In [190]:
regex = re.compile('\s+')
print(regex)

regex.split(text)

re.compile('\\s+')


['foo', 'bar', 'baz', 'qux']

In [192]:
print(text)
regex.findall(text)

foo bar	 baz 	qux


[' ', '\t ', ' \t']

In [211]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
print(text)
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'


Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com



In [195]:
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)
regex

re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}', re.IGNORECASE|re.UNICODE)

In [197]:
print(text)
regex.findall(text)

Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com



['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [199]:
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [200]:
text[m.start():m.end()]

'dave@google.com'

In [201]:
print(regex.match(text))

None


In [202]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [203]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

In [204]:
m = regex.match('wesm@bright.net')
m.groups()

('wesm', 'bright', 'net')

In [207]:
print(text)
print(regex)
regex.findall(text)


Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com

re.compile('([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})', re.IGNORECASE)


[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [208]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [264]:
text = """Dave 42101-4235687-3
Steve 42101-7469834-3
Rob 42101-0233566-3
Ryan 421a01-657s9423-3
"""
print(text)
pattern = r'([0-9]{5})-([0-9]{7})-([0-9]{1})'
#pattern =re.compile('([0-9]+)@([0-9]+)', re.IGNORECASE)

regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)

Dave 42101-4235687-3
Steve 42101-7469834-3
Rob 42101-0233566-3
Ryan 421a01-657s9423-3



[('42101', '4235687', '3'),
 ('42101', '7469834', '3'),
 ('42101', '0233566', '3')]