### <font color="brown">Pandas - DataFrame Continued</font>

In [1]:
import numpy as np
import pandas as pd
from pandas import Series
from pandas import DataFrame

---

#### <font color="brown">Function Application and Mapping</font>

##### <font color="brown">apply, for Series objects of DataFrame (columns or rows)</font>

In [2]:
# 4x3 frame of random normal samples with labelled rows and columns
row_labels = ["One", "Two", "Three", "Four"]
samples = np.random.randn(len(row_labels), 3)
df = pd.DataFrame(data=samples, index=row_labels, columns=["A", "B", "C"])
df

Unnamed: 0,A,B,C
One,1.504181,-0.026839,0.371866
Two,-0.207043,0.964151,0.239417
Three,1.019738,0.834195,-0.924367
Four,-0.535898,-0.409469,0.097684


In [3]:
dfabs = df.abs()
dfabs

Unnamed: 0,A,B,C
One,1.504181,0.026839,0.371866
Two,0.207043,0.964151,0.239417
Three,1.019738,0.834195,0.924367
Four,0.535898,0.409469,0.097684


**Function that rounds the Series (or single value) passed to it to 2 decimal places**

In [4]:
def roundfn(x):
    """Round ``x`` to 2 decimal places.

    ``x`` is a Series when this is passed to ``DataFrame.apply`` (one column
    or row at a time) and a single number when it is passed to
    ``Series.apply`` (one item at a time); ``round`` handles both.
    """
    return round(x, 2)

**Use apply method on dataframe with function as parameter, each column will be sent as argument to function**

In [5]:
dfabs  # original does not change

Unnamed: 0,A,B,C
One,1.504181,0.026839,0.371866
Two,0.207043,0.964151,0.239417
Three,1.019738,0.834195,0.924367
Four,0.535898,0.409469,0.097684


In [6]:
# or you can directly write the lambda as argument
dfabs.apply(lambda x: round(x,2))

Unnamed: 0,A,B,C
One,1.5,0.03,0.37
Two,0.21,0.96,0.24
Three,1.02,0.83,0.92
Four,0.54,0.41,0.1


In [7]:
# this makes it clear that apply is executed a column at a time (axis=0 is default)
dfabs.apply(lambda x: np.cumsum(x))

Unnamed: 0,A,B,C
One,1.504181,0.026839,0.371866
Two,1.711224,0.990989,0.611282
Three,2.730962,1.825185,1.535649
Four,3.26686,2.234654,1.633333


In [8]:
# you can define any function, not just lambdas
def roundsum(x):
    """Return the cumulative sum of ``x`` with every entry rounded to 2 decimals."""
    running_total = np.cumsum(x)
    return running_total.round(2)

In [9]:
dfabs.apply(roundsum)

Unnamed: 0,A,B,C
One,1.5,0.03,0.37
Two,1.71,0.99,0.61
Three,2.73,1.83,1.54
Four,3.27,2.23,1.63


In [10]:
dfabs

Unnamed: 0,A,B,C
One,1.504181,0.026839,0.371866
Two,0.207043,0.964151,0.239417
Three,1.019738,0.834195,0.924367
Four,0.535898,0.409469,0.097684


In [11]:
# a row at a time
dfabs.apply(roundsum,axis=1)

Unnamed: 0,A,B,C
One,1.5,1.53,1.9
Two,0.21,1.17,1.41
Three,1.02,1.85,2.78
Four,0.54,0.95,1.04


In [12]:
# of course, you can simply extract a column or row and send it 
dfabs['A'].apply(roundfn)

One      1.50
Two      0.21
Three    1.02
Four     0.54
Name: A, dtype: float64

In [13]:
dfabs.loc['Two'].apply(roundfn)

A    0.21
B    0.96
C    0.24
Name: Two, dtype: float64

In [14]:
# Series has its own round method, so it can be called directly instead of going through apply
dfabs.loc['Two'].round(2)

A    0.21
B    0.96
C    0.24
Name: Two, dtype: float64

In [15]:
dfabs

Unnamed: 0,A,B,C
One,1.504181,0.026839,0.371866
Two,0.207043,0.964151,0.239417
Three,1.019738,0.834195,0.924367
Four,0.535898,0.409469,0.097684


In [16]:
# when pandas already provides the operation as a method (here DataFrame.round), apply is unnecessary
dfabs.round(2)

Unnamed: 0,A,B,C
One,1.5,0.03,0.37
Two,0.21,0.96,0.24
Three,1.02,0.83,0.92
Four,0.54,0.41,0.1


---

##### <font color="brown">applymap, for one item of a DataFrame at a time</font>

In [17]:
dfabs.applymap(lambda x: round(x,2))  

Unnamed: 0,A,B,C
One,1.5,0.03,0.37
Two,0.21,0.96,0.24
Three,1.02,0.83,0.92
Four,0.54,0.41,0.1


In [18]:
dfabs  # original not changed

Unnamed: 0,A,B,C
One,1.504181,0.026839,0.371866
Two,0.207043,0.964151,0.239417
Three,1.019738,0.834195,0.924367
Four,0.535898,0.409469,0.097684


In [19]:
dfabs.applymap(lambda x: np.round(np.cumsum(x),2))

Unnamed: 0,A,B,C
One,[1.5],[0.03],[0.37]
Two,[0.21],[0.96],[0.24]
Three,[1.02],[0.83],[0.92]
Four,[0.54],[0.41],[0.1]


##### **applymap passes one item at a time; np.cumsum turns each scalar into a one-element ndarray, so the cumulative sum is just the (rounded) item itself, shown in brackets**

---

##### <font color="brown">map, for Series</font>

In [20]:
dfabs

Unnamed: 0,A,B,C
One,1.504181,0.026839,0.371866
Two,0.207043,0.964151,0.239417
Three,1.019738,0.834195,0.924367
Four,0.535898,0.409469,0.097684


In [21]:
dfabs['C'].map(lambda x: round(x,2))

One      0.37
Two      0.24
Three    0.92
Four     0.10
Name: C, dtype: float64

In [22]:
dfabs  # original not changed

Unnamed: 0,A,B,C
One,1.504181,0.026839,0.371866
Two,0.207043,0.964151,0.239417
Three,1.019738,0.834195,0.924367
Four,0.535898,0.409469,0.097684


In [23]:
dfabs.loc['Three'].map(lambda x: round(x,2))

A    1.02
B    0.83
C    0.92
Name: Three, dtype: float64

In [24]:
# try mapping two rows: selecting with a list of labels returns a DataFrame, not a Series
try:
    dfabs.loc[['Two','Three']].map(lambda x: round(x,2))
except AttributeError as err:
    # report the error instead of raising, so the cells below still run on Run All
    print(f'{type(err).__name__}: {err}')

AttributeError: 'DataFrame' object has no attribute 'map'

In [39]:
# won't work because the indexing gives a dataframe, so use apply
dfabs.loc[['Two','Three']].apply(lambda x: round(x,2))

Unnamed: 0,A,B,C
Two,0.21,0.96,0.24
Three,1.02,0.83,0.92


In [40]:
dfabs

Unnamed: 0,A,B,C
One,1.504181,0.026839,0.371866
Two,0.207043,0.964151,0.239417
Three,1.019738,0.834195,0.924367
Four,0.535898,0.409469,0.097684


---

---

#### <font color="brown">Iterating over rows and columns of DataFrame</font>

##### <font color="brown">Iterating over rows using iterrows</font>

In [41]:
dfround = dfabs.round(2)
dfround

Unnamed: 0,A,B,C
One,1.5,0.03,0.37
Two,0.21,0.96,0.24
Three,1.02,0.83,0.92
Four,0.54,0.41,0.1


In [42]:
# each item produced by iterrows() is printed as-is, followed by a blank line
for labelled_row in dfround.iterrows():
    print(labelled_row, '\n')

('One', A    1.50
B    0.03
C    0.37
Name: One, dtype: float64) 

('Two', A    0.21
B    0.96
C    0.24
Name: Two, dtype: float64) 

('Three', A    1.02
B    0.83
C    0.92
Name: Three, dtype: float64) 

('Four', A    0.54
B    0.41
C    0.10
Name: Four, dtype: float64) 



**Each row is returned as a tuple: the first item is the row's index label, the second item is a Series for the row.<br>
For each row Series, the index holds the column names, and the values are the values in that row**

In [43]:
# unpack each (label, row Series) pair and show the row's index and raw values
for row_label, ser in dfround.iterrows():
    print(ser.index)
    print(ser.values)
    print('\n')

Index(['A', 'B', 'C'], dtype='object')
[1.5  0.03 0.37]


Index(['A', 'B', 'C'], dtype='object')
[0.21 0.96 0.24]


Index(['A', 'B', 'C'], dtype='object')
[1.02 0.83 0.92]


Index(['A', 'B', 'C'], dtype='object')
[0.54 0.41 0.1 ]




In [44]:
# retrieving column values individually
for row_label, ser in dfround.iterrows():
    # Series.items() yields (column name, value) pairs for the row
    for col, val in ser.items():
        print(f'{col}: {val}')
    print('\n')

A: 1.5
B: 0.03
C: 0.37


A: 0.21
B: 0.96
C: 0.24


A: 1.02
B: 0.83
C: 0.92


A: 0.54
B: 0.41
C: 0.1




---

##### <font color="brown">Iterating over columns using items (iteritems was removed in pandas 2.0)</font>

In [45]:
dfround

Unnamed: 0,A,B,C
One,1.5,0.03,0.37
Two,0.21,0.96,0.24
Three,1.02,0.83,0.92
Four,0.54,0.41,0.1


In [46]:
# DataFrame.items() yields one (column name, column Series) pair per column;
# it replaces iteritems(), which was deprecated in pandas 1.5 and removed in 2.0
for col in dfround.items():
    print(col)
    print('\n')

('A', One      1.50
Two      0.21
Three    1.02
Four     0.54
Name: A, dtype: float64)


('B', One      0.03
Two      0.96
Three    0.83
Four     0.41
Name: B, dtype: float64)


('C', One      0.37
Two      0.24
Three    0.92
Four     0.10
Name: C, dtype: float64)




**Each column is returned as a tuple: first item is column name, second item is Series for column.<br>
For each Series, index is the row index, and values are values in that column**

---

---

#### <font color="brown">Grouping</font>

##### <font color="brown">Example 1: State populations for some years</font>

In [47]:
# state population figures by year, built column by column from a dict
popdat = dict(
    state=['Arizona'] * 3 + ['Virginia'] * 2,
    year=[2005, 2010, 2015, 2010, 2015],
    pop=[5.9, 6.6, 6.8, 7.9, 8.3],
)
popdf = pd.DataFrame(popdat)
popdf

Unnamed: 0,state,year,pop
0,Arizona,2005,5.9
1,Arizona,2010,6.6
2,Arizona,2015,6.8
3,Virginia,2010,7.9
4,Virginia,2015,8.3


**Q. What is the yearly population over all states?**

In [48]:
# first group by year
yrgrp = popdf.groupby('year')
yrgrp

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f89ebe57a90>

In [49]:
# then sum up within each group
# numeric_only=True restricts the sum to numeric columns (here just pop); without it,
# pandas >= 2.0 also aggregates the string column state by concatenating its values
df = yrgrp.sum(numeric_only=True)
df

Unnamed: 0_level_0,pop
year,Unnamed: 1_level_1
2005,5.9
2010,14.5
2015,15.1


In [50]:
df.reset_index()  # so we can have year as a column

Unnamed: 0,year,pop
0,2005,5.9
1,2010,14.5
2,2015,15.1


In [51]:
popdf['debt'] = Series([1.2,1.2,1.1,0.9,1.2])
popdf

Unnamed: 0,state,year,pop,debt
0,Arizona,2005,5.9,1.2
1,Arizona,2010,6.6,1.2
2,Arizona,2015,6.8,1.1
3,Virginia,2010,7.9,0.9
4,Virginia,2015,8.3,1.2


**Q. What is the yearly population and debt over all states?**

In [None]:
yrgrp = popdf.groupby('year')

In [None]:
# numeric_only=True sums just the numeric columns (pop and debt) and skips the
# string column state, which pandas >= 2.0 would otherwise concatenate
df = yrgrp.sum(numeric_only=True)
df

##### <font color="brown">Example 2: School graduates and majors for some years</font>

In [52]:
# give read_csv the path so pandas opens and closes the file itself (no dangling file handle)
grads = pd.read_csv('graduates.csv')
grads

Unnamed: 0,Student School,Graduating Year,Major
0,Rutgers,2012,CS
1,Penn State,2011,EE
2,Princeton,2013,Psychology
3,MIT,2010,Physics
4,Rutgers,2018,Math
5,Penn State,2019,Economics
6,MIT,2017,CS
7,Penn State,2015,Biology
8,Rutgers,2013,Philosophy
9,Princeton,2012,Economics


**Q. How many grads in 2012?**

In [26]:
gdf = grads.groupby('Graduating Year').count()
gdf

Unnamed: 0_level_0,Student School,Major
Graduating Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,1,1
2011,1,1
2012,3,3
2013,3,3
2015,3,3
2017,1,1
2018,2,2
2019,1,1


In [27]:
# row label and column name in a single .loc lookup (avoids chained indexing)
gdf.loc[2012, 'Major']

3

**Q. How many Econ grads in 2015?**

In [28]:
# 2-level grouping, first by major, then by graduating year within major
gdf2 = grads.groupby(['Major','Graduating Year']).count()
gdf2

Unnamed: 0_level_0,Unnamed: 1_level_0,Student School
Major,Graduating Year,Unnamed: 2_level_1
Biology,2015,1
CS,2012,2
CS,2017,1
CS,2018,1
EE,2011,1
Economics,2012,1
Economics,2015,2
Economics,2019,1
Math,2018,1
Philosophy,2013,1


In [29]:
gdf2.loc['Economics',2015]

Student School    2
Name: (Economics, 2015), dtype: int64

In [30]:
gdf2.loc['Economics']

Unnamed: 0_level_0,Student School
Graduating Year,Unnamed: 1_level_1
2012,1
2015,2
2019,1


In [31]:
# (Major, Graduating Year) row key and column name in a single .loc lookup (avoids chained indexing)
gdf2.loc[('Economics', 2015), 'Student School']

2

In [72]:
gdf2.reset_index()

Unnamed: 0,Major,Graduating Year,Student School
0,Biology,2015,1
1,CS,2012,2
2,CS,2017,1
3,CS,2018,1
4,EE,2011,1
5,Economics,2012,1
6,Economics,2015,2
7,Economics,2019,1
8,Math,2018,1
9,Philosophy,2013,1


In [74]:
gdf2_reset = gdf2.reset_index()

In [75]:
gdf2_reset[(gdf2_reset['Major'] == 'Economics') & (gdf2_reset['Graduating Year'] == 2015)]

Unnamed: 0,Major,Graduating Year,Student School
6,Economics,2015,2


In [79]:
# boolean mask for the Economics / 2015 row, then row filter and column pick in one .loc call
is_econ_2015 = (gdf2_reset['Major'] == 'Economics') & (gdf2_reset['Graduating Year'] == 2015)
ser = gdf2_reset.loc[is_econ_2015, 'Student School']
ser  # display the resulting Series (an assignment alone shows nothing)

6    2
Name: Student School, dtype: int64

In [85]:
ser.values[0]

2

---

---

#### <font color="brown">Value Counts</font>

**Q. What are the top majors by number of graduates?**

In [54]:
grads

Unnamed: 0,Student School,Graduating Year,Major
0,Rutgers,2012,CS
1,Penn State,2011,EE
2,Princeton,2013,Psychology
3,MIT,2010,Physics
4,Rutgers,2018,Math
5,Penn State,2019,Economics
6,MIT,2017,CS
7,Penn State,2015,Biology
8,Rutgers,2013,Philosophy
9,Princeton,2012,Economics


In [55]:
major_counts = grads['Major'].value_counts()  # returns a series with major as index
major_counts

Economics     4
CS            4
Psychology    2
Philosophy    1
Physics       1
EE            1
Math          1
Biology       1
Name: Major, dtype: int64

In [56]:
# keep only the series items whose count equals the maximum count
major_counts[major_counts == major_counts.max()]

Economics    4
CS           4
Name: Major, dtype: int64

In [57]:
major_counts[major_counts == major_counts.max()].index.tolist()  

['Economics', 'CS']

**Note above that an index can be converted to a list with the tolist method**

---

---

#### <font color="brown">Dropping rows or columns (variation of del operation for column)</font>

In [62]:
# 4x3 array of random floats wrapped in a frame with named rows and columns
nparr = np.random.random(size=(4, 3))
randdf = pd.DataFrame(
    data=nparr,
    columns=['first', 'second', 'third'],
    index=['four', 'one', 'three', 'two'],
)
randdf

Unnamed: 0,first,second,third
four,0.383509,0.766397,0.61735
one,0.317755,0.639141,0.414127
three,0.026308,0.129146,0.673448
two,0.755009,0.129073,0.121383


**Dropping rows**

In [63]:
randdf2 = randdf.drop(['four','three'])
randdf2

Unnamed: 0,first,second,third
one,0.317755,0.639141,0.414127
two,0.755009,0.129073,0.121383


In [64]:
randdf

Unnamed: 0,first,second,third
four,0.383509,0.766397,0.61735
one,0.317755,0.639141,0.414127
three,0.026308,0.129146,0.673448
two,0.755009,0.129073,0.121383


*Original is not changed*

In [66]:
rfcopy = randdf.copy()
del rfcopy['second']
rfcopy

Unnamed: 0,first,third
four,0.383509,0.61735
one,0.317755,0.414127
three,0.026308,0.673448
two,0.755009,0.121383


*But the del operation modifies the DataFrame in place, which is why it was applied to a copy (rfcopy) here*

In [67]:
# trying to drop column "third" without specifying the axis
try:
    randdf.drop(['third'])
except KeyError as err:
    # drop looks in the row index by default, so the column label is not found;
    # report the error instead of raising, so the cells below still run on Run All
    print(f'{type(err).__name__}: {err}')

KeyError: "['third'] not found in axis"

In [69]:
# need to use axis=1
randdf.drop(['third'],axis=1)

Unnamed: 0,first,second
four,0.383509,0.766397
one,0.317755,0.639141
three,0.026308,0.129146
two,0.755009,0.129073


In [70]:
randdf

Unnamed: 0,first,second,third
four,0.383509,0.766397,0.61735
one,0.317755,0.639141,0.414127
three,0.026308,0.129146,0.673448
two,0.755009,0.129073,0.121383


**Use inplace=True to modify original**

In [71]:
rcopy = randdf.copy()
rcopy.drop(['three','four'],inplace=True)
rcopy

Unnamed: 0,first,second,third
one,0.317755,0.639141,0.414127
two,0.755009,0.129073,0.121383
