In [None]:
import sys
import numpy as np
import pandas as pd
np.random.seed(seed=12345)  # fix the random-number sequence so runs are reproducible

### DataFrame

In [None]:
# Column-oriented input: each key becomes a column name and each list holds
# that column's values, one per row.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}

# Build the frame from the dict; the row index defaults to 0..4.
frame = pd.DataFrame(data)
frame

In [None]:
# Same dict, but with an explicit column order and string row labels.
# 'debt' is not a key of `data`, so pandas creates it as an all-NaN column.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'],
)

frame2

In [None]:
# Fill the 'debt' column with the floats 0.0 .. 4.0, one per row.
frame2['debt'] = np.arange(5, dtype=float)
frame2

### Read/Write

In [None]:
!cat ex1.csv  #a leading ! runs the rest of the line as an OS shell command

In [None]:
# Load the comma-separated file into a DataFrame.
# An equivalent spelling is pd.read_table('ex1.csv', sep=',').
df = pd.read_csv('ex1.csv', sep=',')
df

In [None]:
# Write the frame as CSV text to standard output instead of a file.
df.to_csv(path_or_buf=sys.stdout)

In [None]:
df.to_csv('out.csv', index=False,header=False)
!cat out.csv

### Basic operations 

In [None]:
# Boolean Series: True for the rows whose 'year' equals 2001.
frame2['year'].eq(2001)

In [None]:
# Keep only the rows selected by the boolean mask (year == 2001).
frame2.loc[frame2['year'] == 2001]

In [None]:
# Return a copy without the rows labelled 'two' and 'four'; frame2 is not modified.
frame2.drop(index=['two', 'four'])

In [None]:
# Return a copy without the 'pop' column; frame2 is not modified.
frame2.drop(columns=['pop'])

In [None]:
# Keep the result this time: frame3 is frame2 minus rows 'two' and 'four'.
frame3 = frame2.drop(index=['two', 'four'])
frame3

In [None]:
# Conform frame3 to a new row index. Labels that frame3 does not have
# ('b' and 'd') produce rows filled with NaN.
frame4 = frame3.reindex(index=['one', 'b', 'three', 'd', 'five'])
frame4

### Mathematical operations, alignment

In [None]:
# 3x4 frame holding the floats 0..11, with columns a-d.
df1 = pd.DataFrame(
    np.arange(12, dtype=float).reshape(3, 4),
    columns=['a', 'b', 'c', 'd'],
)
df1

In [None]:
# 4x5 frame holding the floats 0..19, with columns a-e
# (one more row and one more column than df1).
df2 = pd.DataFrame(
    np.arange(20, dtype=float).reshape(4, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
)
df2

In [None]:
# Element-wise sum aligned on row and column labels; positions present in
# only one of the two frames come out as NaN.
df1.add(df2)

In [None]:
# Same aligned sum, but a value missing from one frame is treated as 0,
# so the result has no NaN where at least one frame has data.
df1.add(other=df2, fill_value=0)

In [None]:
# Give df1 the column set of df2; columns df1 lacks are created and filled with 0.
df1.reindex(df2.columns, axis='columns', fill_value=0)

### Merge, join, concat

In [None]:
# Left table for the merge examples: 'key' values repeat, 'data1' is 0..6.
df1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df1

In [None]:
# Right table for the merge examples: one row per key, 'data2' is 0..2.
df2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data2': range(3),
})
df2

In [None]:
# Inner join on 'key': only keys present in both tables are kept.
df1.merge(df2, on='key', how='inner')

In [None]:
# Outer join: keys from either table are kept, with NaN where the other
# table has no match. With no `on`, pandas joins on the shared column names.
df1.merge(df2, how='outer')

### Data preprocessing

In [None]:
# Small frame with repeated (k1, k2) pairs to demonstrate duplicate detection.
data = pd.DataFrame({
    'k1': ['one'] * 3 + ['two'] * 4,
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

# Only the last expression of a cell is rendered, so show the rows and their
# duplicate flags together. True marks a row identical to an earlier row.
# assign() returns a new frame; `data` itself is left unchanged.
data.assign(is_duplicate=data.duplicated())

In [None]:
# Return a copy without repeated rows, keeping the first occurrence of each.
data.drop_duplicates(keep='first')

In [None]:
# Single-column frame in which -999 and -1000 serve as the values to be
# replaced in the following cells.
data = pd.DataFrame(
    [1.0, -999.0, 2.0, -999.0, -1000.0, 3.0]
)
data

In [None]:
# Return a copy in which every -999 is turned into NaN.
data.replace(to_replace=-999, value=np.nan)

In [None]:
# Several replacements at once: -999 becomes NaN and -1000 becomes 0.
data.replace([-999, -1000], [np.nan, 0])

In [None]:
# NA is a short alias for NumPy's float NaN, used here to mark missing entries.
from numpy import nan as NA

# Four rows: complete, partly missing, entirely missing, partly missing.
data = pd.DataFrame([
    [1.0, 6.5, 3.0],
    [1.0, NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.0],
])
data

In [None]:
# Return a copy without the rows that contain at least one NaN.
data.dropna(how='any')

In [None]:
# Return a copy without the rows in which every value is NaN.
data.dropna(axis='index', how='all')

In [None]:
# 7x3 frame of standard-normal draws, with missing values injected below.
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
# .loc slices by label and includes the end label:
# rows 0-3 of column 1 become NaN ...
df.loc[0:3, 1] = NA
# ... and rows 0-1 of column 2 become NaN.
df.loc[0:1, 2] = NA
df

In [None]:
# Return a copy in which every NaN is replaced by 0.
df.fillna(value=0)

In [None]:
# Per-column fill values: NaN in column 1 becomes 0, NaN in column 2 becomes 0.5.
df.fillna(value={1: 0, 2: 0.5})

In [None]:
# Per-column means; pandas skips NaN entries when averaging by default.
column_means = df.mean()
# Only the last expression of a cell is rendered, so show the means
# explicitly (display is provided by the IPython/Jupyter kernel).
display(column_means)
# Return a copy in which each NaN is replaced by the mean of its own column.
df.fillna(column_means)

### Removing outliers

In [None]:
# Re-seed so this section yields the same numbers however often it is re-run.
np.random.seed(seed=12345)
# 1000 rows x 4 columns of standard-normal draws.
data = pd.DataFrame(np.random.standard_normal(size=(1000, 4)))
# Summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

In [None]:
# The column labelled 3, i.e. the fourth column.
col = data[3]
# Entries of that column whose absolute value is greater than 3.
col[col.abs() > 3]

In [None]:
# Rows in which at least one value exceeds 3 in absolute value.
# axis=1 reduces across the columns, giving one boolean per row.
data[(np.abs(data) > 3).any(axis=1)]

In [None]:
# Cap values at +/-3 in place: np.sign(data) is the frame of signs,
# so every outlier is replaced by 3 carrying its original sign.
data[data.abs() > 3] = 3 * np.sign(data)
# The summary should now show min/max no further out than -3/3.
data.describe()

### Dummy values

In [None]:
# Frame with a categorical 'class' column (values a/b/c) and a numeric
# 'data1' column holding 0..5.
df = pd.DataFrame({
    'class': ['b', 'b', 'a', 'c', 'a', 'b'],
    'data1': range(6),
})
df

In [None]:
# One indicator column per distinct value of 'class'.
dummies = pd.get_dummies(data=df['class'])
dummies

In [None]:
# Keep 'data1' as a one-column frame and attach the indicator columns,
# matched on the row index.
df_with_dummy = df.loc[:, ['data1']].join(dummies)
df_with_dummy

In [None]:
# Load the Titanic passenger table from the comma-separated file.
data = pd.read_csv('titanic.csv', sep=',')
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Rows whose 'Age' is missing. Series.isna() works for any column dtype,
# whereas np.isnan raises TypeError on non-float (e.g. object) columns.
data2 = data[data['Age'].isna()]
# Show the first three such rows.
data2.head(3)