In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels as sm
%matplotlib inline

In [5]:
# Reading data
# pandas readers: read_csv, read_excel, read_html, read_sql, etc.
# Print the raw contents of the file with plain Python rather than the
# Windows-only `!type` shell command, so the cell runs on any platform and
# no trailing comment text is handed to the shell.
with open('examples/ex1.txt') as raw_file:
    print(raw_file.read(), end='')

a,b,c,d


In [9]:
# Forward slashes work on every platform and avoid the invalid '\e' escape
# sequence that the backslash-separated path literal contained.
df = pd.read_csv('examples/ex1.txt')
# Separate fields with a regular expression; the raw string keeps '\s'
# from being parsed as an (invalid) string escape.
result = pd.read_csv('examples/ex1.txt', sep=r'\s+')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [1]:
# built-in csv module
import csv

# The context manager closes the file as soon as iteration is done (the
# original left the handle open). newline='' is what the csv module
# documents for file objects passed to csv.reader.
with open('examples/ex1.txt', newline='') as f:
    reader = csv.reader(f)
    for line in reader:
        print(line)

['a', 'b', 'c', 'd', 'message']
['1', '2', '3', '4', 'hello']
['5', '6', '7', '8', 'world']
['9', '10', '11', '12', 'foo']


In [None]:
# read from Excel
# ExcelFile is used as a context manager so the workbook handle is closed
# once the sheet has been read; the sheet is selected by keyword.
with pd.ExcelFile('examples/ex1.xlsx') as xlsx:
    sheet1 = pd.read_excel(xlsx, sheet_name='Sheet1')
sheet1

In [11]:
# Write data to text files
# Forward-slash paths avoid the invalid '\e' and '\o' escape sequences of
# the backslash literals and also work outside Windows.
df = pd.read_csv('examples/ex1.txt')
df.to_csv('examples/out.txt')

In [3]:
# Data Cleaning and Preparation
# Missing Data
from numpy import nan as NA

data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()  # returns a new Series without the missing entries
# Equivalent filter written as a boolean mask of the non-missing positions.
is_present = data.notnull()
data[is_present]

0    1.0
2    3.5
4    7.0
dtype: float64

In [17]:
# drop NA in DataFrame
rows = [[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]]
data = pd.DataFrame(rows)
cleaned = data.dropna()  # default how='any': a single NA is enough to drop the row
data.dropna(how='all')  # only rows made up entirely of NA are dropped
data.dropna(axis=1, how='all')  # same rule applied to columns

# Fresh frame with an extra all-NA column labelled 4.
data = pd.DataFrame(rows)
data[4] = NA
data.dropna(thresh=3)  # a row survives only with at least 3 non-NA values

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,


In [18]:
# Filling in missing data
data.fillna(0)  # new frame with every NA replaced by 0
data.fillna({1: 0.5, 2: 0})  # per-column fill values, keyed by column label
# Rebind the name instead of using inplace=True (which returns None and made
# the original `_ = ...` assignment meaningless).
data = data.fillna(0)
# DataFrame.ffill replaces the deprecated fillna(method='ffill'). `data` has
# no NA left at this point, so the frame comes back unchanged.
data.ffill(limit=2)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,6.5,3.0,0.0


In [22]:
# Removing duplicates
k1_values = ['one', 'two'] * 3 + ['two']
k2_values = [1, 1, 2, 3, 3, 4, 4]
data = pd.DataFrame({'k1': k1_values, 'k2': k2_values})
data.duplicated()  # boolean Series: True where a row repeats an earlier one
data.drop_duplicates()  # first occurrence of each row is kept
# Compare only the listed columns and keep the last occurrence instead of the first.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')


Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
6,two,4


In [26]:
# Transforming
meat_to_animal = {'bacon': 'pig', 'pulled pork' : 'pig'}
data = pd.DataFrame({'food':['Bacon', 'pulled pork'], 'ounces':[4, 3]})

# Variant 1: lower-case the column first, then map it through the dict.
lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)


def _animal_for(food_name):
    """Look up the animal for a food name, ignoring case."""
    return meat_to_animal[food_name.lower()]


# Variant 2: one element-wise function doing both steps (overwrites the column
# with the same values).
data['animal'] = data['food'].map(_animal_for)
data

Unnamed: 0,food,ounces,animal
0,Bacon,4,pig
1,pulled pork,3,pig


In [33]:
# Replacing values
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data.replace(-999, np.nan)
data.replace([-999, -1000], [np.nan, 0])
data.replace({-999: np.nan, -1000: 0})

# Renaming indexes
data = pd.DataFrame(np.arange(12).reshape((3, 4)), 
                    index=['Ohio', 'Colorado', 'New York'], 
                   columns=['one', 'two', 'three', 'four'])
transform = lambda x: x[:4].upper()
data.index = data.index.map(transform) # modify the DataFrame in place
data.rename(index=str.title, columns=str.upper) # create a new transformed copy
data.rename(index={'OHIO': 'INDIANA'}, inplace=True) # in-place rename
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [38]:
# Discretization & Binning
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
# right=False makes every bin closed on the left and open on the right,
# i.e. [18, 25), [25, 35), [35, 60), [60, 100).
cats = pd.cut(ages, bins, labels=group_names, right=False)
cats
cats.codes  # integer bin code for each age
cats.categories  # the bin labels

# pd.cut / pd.qcut need 1-D input, so dedicated sample arrays are created here
# (the original passed a leftover 2-D DataFrame and a misspelled name).
# A local Generator keeps the samples reproducible without reseeding NumPy's
# global random state.
rng = np.random.default_rng(12345)
uniform_values = rng.random(20)
equal_width = pd.cut(uniform_values, 4, precision=2)  # 4 equal-length bins over the data range
normal_values = rng.standard_normal(1000)
quartiles = pd.qcut(normal_values, 4)  # quantile-based bins: roughly equal counts per bin
pd.Series(quartiles).value_counts()

# Series.value_counts replaces the deprecated top-level pd.value_counts.
pd.Series(cats).value_counts()

YoungAdult    4
Youth         4
MiddleAged    3
Senior        1
dtype: int64

In [39]:
# Outliers
data = pd.DataFrame(np.random.randn(1000, 4))
# Select every row that has at least one value above 3 or below -3.
# `axis` is passed by keyword: current pandas no longer accepts it
# positionally in DataFrame.any.
outliers = data[(data.abs() > 3).any(axis=1)]
outliers

Unnamed: 0,0,1,2,3
55,-0.275046,-0.674094,-1.232819,3.0128
94,3.363394,-0.968372,-0.490589,-0.398242
242,-0.90461,-0.366035,-1.135133,3.161791
253,0.217461,-0.659799,-3.257438,0.93324
443,-3.031122,2.395083,-0.398755,0.840462
448,-3.155631,3.137394,0.663443,0.349558
453,-0.581055,0.441846,-0.793988,-3.123828
911,0.386158,-1.254104,-0.673958,-3.557271
918,1.103383,3.236055,-0.415796,-0.858347


In [44]:
# Random Sampling
df = pd.DataFrame(np.arange(20).reshape(5, 4))
sampler = np.random.permutation(5)  # random ordering of the row positions 0..4
df.take(sampler)  # rows reordered according to sampler
df.sample(n=3)  # random subset of 3 rows, without replacement
df.sample(n=3, replace=True)  # random subset of 3 rows, with replacement (rows may repeat)

Unnamed: 0,0,1,2,3
1,4,5,6,7
1,4,5,6,7
3,12,13,14,15


In [51]:
# Indicator & Dummy Variables
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
key_column = df['key']
pd.get_dummies(key_column)  # one indicator column per distinct key

# get_dummies with cut
np.random.seed(12345)  # fixed seed so the sampled values are reproducible
values = np.random.rand(10)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
binned = pd.cut(values, bins)  # interval category for each value
pd.get_dummies(binned)  # one indicator column per interval

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [53]:
# String Manipulation
# split and strip
val = 'a,b,   guido'
pieces = [x.strip() for x in val.split(',')]
pieces

first, second, third = pieces
'::'.join([first, second, third])

# find returns the position of the first occurrence, or -1 when the
# substring is absent (':' does not occur in val).
missing_pos = val.find(':')
# rfind returns the position of the LAST occurrence; the original repeated
# find(':') here, which contradicted its own comment.
last_comma_pos = val.rfind(',')
val.count(',')  # number of occurrences
val.replace(',', '::')  # new string with every ',' substituted


['a', 'b', 'guido']

In [55]:
# Regular Expressions
import re

text = "foo     bar\t baz  \tqux"
# Raw string: in a normal literal '\s' is an invalid escape sequence that
# Python warns about; r'\s+' hands the same pattern to re without the warning.
tokens = re.split(r'\s+', text)
tokens

['foo', 'bar', 'baz', 'qux']

In [60]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, flags=re.IGNORECASE)

# findall: list of every non-overlapping match in the text.
all_addresses = regex.findall(text)
print(all_addresses)

# search: match object for the first occurrence anywhere in the text.
m = regex.search(text)
print(m)

# match: succeeds only when the pattern matches at the very start of the
# text, otherwise None.
at_start = regex.match(text)
print(at_start)

# sub: copy of the text with every match replaced.
redacted = regex.sub('REDACTED', text)
print(redacted)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
<re.Match object; span=(5, 20), match='dave@google.com'>
None
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [64]:
# Wrapping parts of the pattern in parentheses creates capture groups;
# findall then yields one tuple of group values per match.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.IGNORECASE)
grouped_matches = regex.findall(text)
grouped_matches

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [66]:
# Vectorized String Functions
email_by_name = {
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
}
data = pd.Series(email_by_name)
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
# The .str accessor applies the regex element-wise and leaves NA entries as NaN.
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [69]:
matches = data.str.findall(pattern, flags=re.IGNORECASE)
# Element access on list-valued entries works with either .str.get(i) or
# .str[i]: take the first match tuple, then its second group.
first_match = matches.str.get(0)
first_match.str[1]

Dave     google
Steve     gmail
Rob       gmail
Wes         NaN
dtype: object

In [70]:
# extract returns the captured groups as a DataFrame, one column per group.
email_parts = data.str.extract(pattern, flags=re.IGNORECASE)
email_parts

Unnamed: 0,0,1,2
Dave,dave,google,com
Steve,steve,gmail,com
Rob,rob,gmail,com
Wes,,,
