In [1]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

## Understanding apply
I will need a small example to figure this out. Here is a DataFrame with some `NaN`s inserted:

In [6]:
# Small example frame; np.nan marks the missing entries.
# (The np.NaN alias was removed in NumPy 2.0, so np.nan is used instead.)
df = pd.DataFrame({
    'AAA': [4, np.nan, 6, 7],
    'BBB': [10, 20, np.nan, 40],
    'CCC': [100, 50, -30, -50],
})
df

Unnamed: 0,AAA,BBB,CCC
0,4.0,10.0,100
1,,20.0,50
2,6.0,,-30
3,7.0,40.0,-50


And here is a function I want to apply

In [9]:
def testfunc(x):
    r1=x[0]**2
    r2=x[1]**2 
    print("ox,oy=( %s , %s), r1,r2=( %f.2 , %f.2)"%(x[0], x[1], r1, r2))
    
    return pd.Series(dict(x=r1, y=r2))


A couple of things are worth mentioning before I forget:
- When I call the `apply` method on a DataFrame, a new result is returned - the original DataFrame is not modified
- Also, the function I have created returns a pandas `Series` built from a dictionary with two values (`x` and `y`)

In [10]:
# axis=0 is the default: testfunc receives each *column* in turn,
# so x[0] and x[1] are the first two rows of that column
df.apply(testfunc, axis=0)

ox,oy=( 4.0 , nan), r1,r2=( 16.000000.2 , nan.2)
ox,oy=( 4.0 , nan), r1,r2=( 16.000000.2 , nan.2)
ox,oy=( 10.0 , 20.0), r1,r2=( 100.000000.2 , 400.000000.2)
ox,oy=( 100 , 50), r1,r2=( 10000.000000.2 , 2500.000000.2)


Unnamed: 0,AAA,BBB,CCC
x,16.0,100,10000
y,,400,2500


In [11]:
# To get new columns, testfunc has to receive each *row* instead (axis=1 / "columns");
# the keys of the returned Series then become the columns of the result
df.apply(testfunc, axis="columns")

ox,oy=( 4.0 , 10.0), r1,r2=( 16.000000.2 , 100.000000.2)
ox,oy=( 4.0 , 10.0), r1,r2=( 16.000000.2 , 100.000000.2)
ox,oy=( nan , 20.0), r1,r2=( nan.2 , 400.000000.2)
ox,oy=( 6.0 , nan), r1,r2=( 36.000000.2 , nan.2)
ox,oy=( 7.0 , 40.0), r1,r2=( 49.000000.2 , 1600.000000.2)


Unnamed: 0,x,y
0,16.0,100.0
1,,400.0
2,36.0,
3,49.0,1600.0


In [12]:
# Assign the row-wise result straight into two new columns of df
df[["x", "y"]] = df.apply(testfunc, axis="columns")
df.head()

ox,oy=( 4.0 , 10.0), r1,r2=( 16.000000.2 , 100.000000.2)
ox,oy=( 4.0 , 10.0), r1,r2=( 16.000000.2 , 100.000000.2)
ox,oy=( nan , 20.0), r1,r2=( nan.2 , 400.000000.2)
ox,oy=( 6.0 , nan), r1,r2=( 36.000000.2 , nan.2)
ox,oy=( 7.0 , 40.0), r1,r2=( 49.000000.2 , 1600.000000.2)


Unnamed: 0,AAA,BBB,CCC,x,y
0,4.0,10.0,100,16.0,100.0
1,,20.0,50,,400.0
2,6.0,,-30,36.0,
3,7.0,40.0,-50,49.0,1600.0


In [16]:
# That worked like a charm. Now let's extend the applied function and try everything again,
# starting from a freshly initialised df.
# (np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.)
df = pd.DataFrame({
    'AAA': [4, np.nan, 6, 7],
    'BBB': [10, 20, np.nan, 40],
    'CCC': [100, 50, -30, -50],
})

def testfunc(x):
    r1=x[0]**2
    r2=x[1]**2 
    print("ox,oy=( %s , %s), r1,r2=( %f.2 , %f.2)"%(x[0], x[1], r1, r2))
    if (pd.notnull(r1) & pd.notnull(r2)):
        return pd.Series(dict(x=r1, y=r2))
    else:
        return np.nan

In [17]:
# Insert the two new columns directly again (df was rebuilt in the previous cell,
# so x and y are created from scratch by this assignment)
df[["x", "y"]] = df.apply(testfunc, axis="columns")
df.head()

ox,oy=( 4.0 , 10.0), r1,r2=( 16.000000.2 , 100.000000.2)
ox,oy=( 4.0 , 10.0), r1,r2=( 16.000000.2 , 100.000000.2)
ox,oy=( nan , 20.0), r1,r2=( nan.2 , 400.000000.2)
ox,oy=( 6.0 , nan), r1,r2=( 36.000000.2 , nan.2)
ox,oy=( 7.0 , 40.0), r1,r2=( 49.000000.2 , 1600.000000.2)


Unnamed: 0,AAA,BBB,CCC,x,y
0,4.0,10.0,100,16.0,100.0
1,,20.0,50,,
2,6.0,,-30,,
3,7.0,40.0,-50,49.0,1600.0


In [14]:
# That also worked as expected; let's see if we can simply return nothing in some cases.
# (np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.)
df = pd.DataFrame({
    'AAA': [4, np.nan, 6, 7],
    'BBB': [10, 20, np.nan, 40],
    'CCC': [100, 50, -30, -50],
})
def testfunc(x):
    r1=x[0]**2
    r2=x[1]**2 
    print("ox,oy=( %s , %s), r1,r2=( %f.2 , %f.2)"%(x[0], x[1], r1, r2))
    if (pd.notnull(r1) & pd.notnull(r2)):
        return pd.Series(dict(x=r1, y=r2))
# Assign the row-wise result straight into two new columns of df
df[["x", "y"]] = df.apply(testfunc, axis="columns")
df.head()

ox,oy=( 4.0 , 10.0), r1,r2=( 16.000000.2 , 100.000000.2)
ox,oy=( 4.0 , 10.0), r1,r2=( 16.000000.2 , 100.000000.2)
ox,oy=( nan , 20.0), r1,r2=( nan.2 , 400.000000.2)
ox,oy=( 6.0 , nan), r1,r2=( 36.000000.2 , nan.2)
ox,oy=( 7.0 , 40.0), r1,r2=( 49.000000.2 , 1600.000000.2)


Unnamed: 0,AAA,BBB,CCC,x,y
0,4.0,10.0,100,16.0,100.0
1,,20.0,50,,
2,6.0,,-30,,
3,7.0,40.0,-50,49.0,1600.0


In [15]:
# Hmmm - same result as before, so the incomplete rows have to be cleaned up afterwards:
# drop every row that still contains a NaN (this returns a new frame, df itself is unchanged)
df.dropna(axis="index")

Unnamed: 0,AAA,BBB,CCC,x,y
0,4,10,100,16,100
3,7,40,-50,49,1600
