# DataFrame

In [6]:
import pandas as pd
import numpy as np
from pandas import DataFrame
import math

In [15]:
# Create the two example DataFrames used throughout this notebook.
# `df` mixes int, float and string columns and contains missing values (None).
df = DataFrame({
    'int_col': [1, 2, 6, 8, -1],
    'float_col': [0.1, 0.2, 0.2, 10.1, None],
    'str_col': ['a', 'b', None, 'c', 'a'],
})

# Seed NumPy's global generator so that `frame` (and every output derived
# from it) is identical each time the notebook is re-run from the top.
np.random.seed(0)
frame = DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
frame

Unnamed: 0,b,d,e
Utah,-0.13165,0.715206,0.225515
Ohio,0.746391,0.604475,-1.291059
Texas,1.326981,0.966488,-2.718146
Oregon,-0.339254,0.26335,0.243651


## 1. apply / map / applymap

**apply** works on a row / column basis of a DataFrame, 
**applymap** works element-wise on a DataFrame, 
**map** works element-wise on a Series.

#### apply 

In [10]:
# axis=0: the lambda receives each column as a Series -> one range (max - min) per column
frame.apply(lambda x: x.max()-x.min(),axis=0)

b    2.500785
d    2.273373
e    1.804596
dtype: float64

In [11]:
# axis=1: the lambda receives each row as a Series -> one range (max - min) per row
frame.apply(lambda x: x.max()-x.min(),axis=1)

Utah      1.235145
Ohio      3.254675
Texas     2.088934
Oregon    1.470470
dtype: float64

#### applymap

In [12]:
# Format every individual element of the DataFrame as a string with two decimals
frame.applymap(lambda x: '%.2f' % x)

Unnamed: 0,b,d,e
Utah,0.15,-0.36,-1.09
Ohio,-2.35,0.9,-0.82
Texas,-0.64,-1.37,0.72
Oregon,-1.09,0.38,-0.41


#### map

In [13]:
# Same two-decimal formatting, applied element by element to a single column (a Series)
frame['e'].map(lambda x: '%.2f' % x)

Utah      -1.09
Ohio      -0.82
Texas      0.72
Oregon    -0.41
Name: e, dtype: object

## 2. Missing values

In [23]:
# --- Check types ---
df.dtypes            # dtype of each column
df.info()            # printed summary: index, columns, non-null counts, memory usage
type(df.iloc[0, 0])  # type of a single element
type(123)            # int
type(123) is int     # True

# --- Check a scalar for a missing value ---
t = np.nan           # NaN
math.isnan(t)        # True

val = None
val is None          # True  (identity test is the idiomatic comparison with None)
not (val is None)    # False
val is not None      # False

# --- Check a DataFrame for missing values ---
df.isnull()                # boolean mask, True where a value is missing
df.isnull().any()          # per column: is any value missing?
df.isnull().any().any()    # is any value missing anywhere in the frame?
df.isnull().sum()          # number of missing values in each column
df.isnull().sum().sum()    # total number of missing values

# Rows that contain at least one missing value
df[df.isnull().any(axis=1)]

# Drop rows with missing values (use axis=1 to drop columns instead)
df.dropna(axis=0)

# Fill missing values with the column mean; the result is not assigned,
# so `df` itself is left unchanged here
mean = df['float_col'].mean()
df['float_col'].fillna(mean)

# Count missing values
df.isnull().sum()          # per column
df.isnull().sum(axis=1)    # per row

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
float_col    4 non-null float64
int_col      5 non-null int64
str_col      4 non-null object
dtypes: float64(1), int64(1), object(1)
memory usage: 192.0+ bytes


0    0
1    0
2    1
3    0
4    1
dtype: int64

In [None]:
# convert column type
def myconvertint(x):
    """Convert a single float to an int, passing missing values through as NaN.

    Parameters
    ----------
    x : float or None
        A scalar value, e.g. one element handed over by ``Series.map``.

    Returns
    -------
    int or float
        ``int(x)`` (truncated toward zero) for a real number, ``np.nan`` when
        ``x`` is ``None`` or NaN.
    """
    # math.isnan(None) raises TypeError, so test for None first.
    if x is None or math.isnan(x):
        return np.nan
    # int() works for both Python floats and NumPy scalars, whereas a plain
    # Python float has no .astype method.
    return int(x)
df['float_col'].map(myconvertint)

## 3. Create new columns from existing columns

In [None]:
# Derive new columns with vectorized operators, NumPy functions and the .str accessor
df = pd.DataFrame(
    data={
        "A": [1, 2],
        "B": [1.2, 1.3],
        "Z": ["a", "b"],
    }
)

# element-wise sum of two columns
df["C"] = df.A + df.B

# NumPy functions operate on the whole column at once
df["E"] = np.sqrt(df.A)

# string methods are exposed through .str
df["F"] = df["Z"].str.upper()

In [79]:
# multiple columns as a function of a single column
df = DataFrame({'int_col' : [1,2,6,8,-1], 'float_col' : [0.1, 0.2,0.2,10.1,None], 'str_col' : ['a','b',None,'c','a']})
def two_three(x):
    """Return the pair ``(x * 2, x * 3)`` for a single value ``x``."""
    doubled = x * 2
    tripled = x * 3
    return doubled, tripled
df['twice'],df['thrice']= zip(*df['int_col'].map(two_three))
zip(*df['int_col'].map(two_three))

[(2, 4, 12, 16, -2), (3, 6, 18, 24, -3)]

In [81]:
# single column as a function of multiple columns
def sum_two(row):
    """Add the ``'int_col'`` and ``'float_col'`` entries of one row."""
    int_part = row['int_col']
    float_part = row['float_col']
    return int_part + float_part

df['sum']=df.apply(sum_two,axis=1)
df.head()

Unnamed: 0,float_col,int_col,str_col,twice,thrice,sum
0,0.1,1,a,2,3,1.1
1,0.2,2,b,4,6,2.2
2,0.2,6,,12,18,6.2
3,10.1,8,c,16,24,18.1
4,,-1,a,-2,-3,


In [91]:
# multiple columns as a function of multiple columns
def int_float_squares(row):
    """Return ``(int_col squared, float_col squared)`` for one row."""
    squared_int = row['int_col'] ** 2
    squared_float = row['float_col'] ** 2
    return squared_int, squared_float
df['int_sq'],df['flt_sq']=zip(*df.apply(int_float_squares,axis=1))
df.head()

Unnamed: 0,float_col,int_col,str_col,twice,thrice,sum,flt_sq,int_sq
0,0.1,1,a,2,3,1.1,0.01,1
1,0.2,2,b,4,6,2.2,0.04,4
2,0.2,6,,12,18,6.2,0.04,36
3,10.1,8,c,16,24,18.1,102.01,64
4,,-1,a,-2,-3,,,1


In [96]:
df = pd.DataFrame(data={"A": [1, 2], "B": [1.2, 1.3], "Z": ["a", "b"]})

# Column labels (an Index object)
print(df.columns)
# Column labels as a plain list
print(list(df.columns.values))

# NumPy array of the values in the DataFrame
print(df.values)

# Row index
print(df.index)

# Convert the object array to a float array; the last column ("Z") holds
# strings, so it is sliced off first
print(df.values[:, :-1].astype(float))

# Rename a column. rename returns a new DataFrame, and labels that do not
# exist are silently ignored, so the key must name a real column of `df`.
df2 = df.rename(columns={'A': 'some_other_name'})

# Independent copy
df3 = df.copy()

# Summary statistics of the numeric columns
print(df.describe())

# Covariance -- only defined for numbers, so select the numeric columns explicitly
print(df[["A", "B"]].cov())

# Correlation
print(df[["A", "B"]].corr())

Index([u'A', u'B', u'Z'], dtype='object')
['A', 'B', 'Z']
[[1 1.2 'a']
 [2 1.3 'b']]
RangeIndex(start=0, stop=2, step=1)
[[ 1.   1.2]
 [ 2.   1.3]]
              A         B
count  2.000000  2.000000
mean   1.500000  1.250000
std    0.707107  0.070711
min    1.000000  1.200000
25%    1.250000  1.225000
50%    1.500000  1.250000
75%    1.750000  1.275000
max    2.000000  1.300000
      A      B
A  0.50  0.050
B  0.05  0.005
     A    B
A  1.0  1.0
B  1.0  1.0


# String

In [11]:
# The r prefix makes a raw string: the "\n" inside "\name" stays a backslash followed by "n"
print(r'C:\some\name')

C:\some\name


In [10]:
 3 * 'un' + 'ium'

'unununium'

In [13]:
# Inside a triple-quoted string a trailing backslash suppresses the newline:
# the one after the opening quotes avoids a leading blank line, while the one
# after "[OPTIONS]" joins the usage line with the "-h" line in the output.
print("""\
Usage: thingy [OPTIONS]\
     -h                        Display this usage message
     -H hostname               Hostname to connect to
""")

Usage: thingy [OPTIONS]     -h                        Display this usage message
     -H hostname               Hostname to connect to



In [14]:
word='Python'
word[-2]    # 'o'  - negative indices count from the end
word[0:2]   # 'Py' - characters from position 0 (included) to 2 (excluded)
word[-2:]   # 'on' - the last two characters; only this final expression is displayed

'on'

In [15]:
# Strings are immutable, so `word[0] = 'J'` would raise a TypeError; build a new string instead
'J' + word[1:]

'Jython'

In [16]:
len(word)

6

# List

In [29]:
squares = [1, 4, 9, 16, 25]
# Slicing returns a new list holding the last three elements
squares[-3:]


[9, 16, 25]

In [19]:
# + concatenates into a new list; `squares` itself is not modified
squares + [36, 49, 64, 81, 100]

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]

In [30]:
squares.append(216)  # append mutates the list in place, adding one item at the end
squares

[1, 4, 9, 16, 25, 216]

In [31]:
squares[2:4]=[]  # assigning an empty list to a slice removes those items (indices 2 and 3)
squares

[1, 4, 25, 216]

In [32]:
len(squares)

4

In [33]:
# Lists can be nested: x is a list whose two items are the lists a and n
a = ['a', 'b', 'c']
n = [1, 2, 3]
x = [a, n]
x

[['a', 'b', 'c'], [1, 2, 3]]

In [None]:
words = ['cat', 'window', 'defenestrate']
# Loop over a snapshot of the list so that inserting into `words`
# does not change what the loop iterates over.
for w in list(words):
    if len(w) > 6:
        words.insert(0, w)
list(words)