### <font color="brown">Pandas - DataFrame Continued</font>

https://pandas.pydata.org/docs/user_guide/index.html<br>
(You can also get at this from Jupyter notebook through Help -> pandas Reference -> User Guide)

In [3]:
import numpy as np
import pandas as pd
from pandas import Series
from pandas import DataFrame

---

#### <font color="brown">Creating a DataFrame from a CSV file (typical usage)</font>

**Using the Pandas method read_csv**

In [160]:
# NOTE(review): `mpgfile` is not defined in any cell visible here — presumably the path to
# the mpg CSV, set in an earlier cell or session; it must exist before this cell can run.
mpgs = pd.read_csv(mpgfile)
# preview the first 10 rows
mpgs.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino
5,15.0,8.0,429.0,198.0,4341.0,10.0,70.0,1.0,ford galaxie 500
6,14.0,8.0,454.0,220.0,4354.0,9.0,70.0,1.0,chevrolet impala
7,14.0,8.0,440.0,215.0,4312.0,8.5,70.0,1.0,plymouth fury iii
8,14.0,8.0,455.0,225.0,4425.0,10.0,70.0,1.0,pontiac catalina
9,15.0,8.0,390.0,190.0,3850.0,8.5,70.0,1.0,amc ambassador dpl


**Note: NAs are read in as NaN which is basically a missing/null value.**

In [162]:
# per-column dtype and non-null count; a count below the 406 entries signals missing values
mpgs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     406 non-null    float64
 2   displacement  406 non-null    float64
 3   horsepower    400 non-null    float64
 4   weight        406 non-null    float64
 5   acceleration  406 non-null    float64
 6   model year    406 non-null    float64
 7   origin        406 non-null    float64
 8   car name      406 non-null    object 
dtypes: float64(8), object(1)
memory usage: 28.7+ KB


**In the info above, note that each numeric column now has an inferred datatype (float64) rather than object; only the text column, car name, remains object.<br>
Also note the number of non-null values per column. For instance, mpg has 8 missing values, and horsepower has 6 missing values.**

In [163]:
# boolean mask selects only the rows whose mpg value is missing
mpgs.loc[mpgs['mpg'].isna()]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
10,,4.0,133.0,115.0,3090.0,17.5,70.0,2.0,citroen ds-21 pallas
11,,8.0,350.0,165.0,4142.0,11.5,70.0,1.0,chevrolet chevelle concours (sw)
12,,8.0,351.0,153.0,4034.0,11.0,70.0,1.0,ford torino (sw)
13,,8.0,383.0,175.0,4166.0,10.5,70.0,1.0,plymouth satellite (sw)
14,,8.0,360.0,175.0,3850.0,11.0,70.0,1.0,amc rebel sst (sw)
17,,8.0,302.0,140.0,3353.0,8.0,70.0,1.0,ford mustang boss 302
39,,4.0,97.0,48.0,1978.0,20.0,71.0,2.0,volkswagen super beetle 117
367,,4.0,121.0,110.0,2800.0,15.4,81.0,2.0,saab 900s


In [164]:
# boolean mask selects only the rows whose horsepower value is missing
mpgs.loc[mpgs['horsepower'].isna()]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
38,25.0,4.0,98.0,,2046.0,19.0,71.0,1.0,ford pinto
133,21.0,6.0,200.0,,2875.0,17.0,74.0,1.0,ford maverick
337,40.9,4.0,85.0,,1835.0,17.3,80.0,2.0,renault lecar deluxe
343,23.6,4.0,140.0,,2905.0,14.3,80.0,1.0,ford mustang cobra
361,34.5,4.0,100.0,,2320.0,15.8,81.0,2.0,renault 18i
382,23.0,4.0,151.0,,3035.0,20.5,82.0,1.0,amc concord dl


##### <font color="brown">Summary stats for numeric columns in DataFrame</font>

In [166]:
# summary stats for numeric columns in dataset
# (count is the number of non-null values, so mpg and horsepower report fewer than 406)
mpgs.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
count,398.0,406.0,406.0,400.0,406.0,406.0,406.0,406.0
mean,23.514573,5.475369,194.779557,105.0825,2979.413793,15.519704,75.921182,1.568966
std,7.815984,1.71216,104.922458,38.768779,847.004328,2.803359,3.748737,0.797479
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,105.0,75.75,2226.5,13.7,73.0,1.0
50%,23.0,4.0,151.0,95.0,2822.5,15.5,76.0,1.0
75%,29.0,8.0,302.0,130.0,3618.25,17.175,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


---

#### <font color="brown">Numpy ufuncs work with DataFrames</font>

In [141]:
# 4x3 frame of random normal draws with labelled rows and columns
# (no seed is set, so the values differ from run to run)
df = DataFrame(
    data=np.random.randn(4, 3),
    index=["One", "Two", "Three", "Four"],
    columns=["A", "B", "C"],
)
df

Unnamed: 0,A,B,C
One,1.193575,-0.49112,-0.093747
Two,0.507445,0.772171,0.35001
Three,-0.563124,1.271805,-1.578139
Four,0.074375,0.389039,0.525876


In [142]:
# a NumPy ufunc applied to the whole frame: the result keeps the row/column labels
np.abs(df)  # won't change original

Unnamed: 0,A,B,C
One,1.193575,0.49112,0.093747
Two,0.507445,0.772171,0.35001
Three,0.563124,1.271805,1.578139
Four,0.074375,0.389039,0.525876


In [144]:
# df still holds the signed values after np.abs(df)
df

Unnamed: 0,A,B,C
One,1.193575,-0.49112,-0.093747
Two,0.507445,0.772171,0.35001
Three,-0.563124,1.271805,-1.578139
Four,0.074375,0.389039,0.525876


In [143]:
# alternatively, use the DataFrame method abs(); like np.abs(df) it returns a new frame
# and does not change the original df either
df.abs()

Unnamed: 0,A,B,C
One,1.193575,0.49112,0.093747
Two,0.507445,0.772171,0.35001
Three,0.563124,1.271805,1.578139
Four,0.074375,0.389039,0.525876


In [145]:
# df still holds the signed values after df.abs()
df

Unnamed: 0,A,B,C
One,1.193575,-0.49112,-0.093747
Two,0.507445,0.772171,0.35001
Three,-0.563124,1.271805,-1.578139
Four,0.074375,0.389039,0.525876


In [146]:
# keep the absolute values under their own name for the examples that follow
dfabs = df.abs()
dfabs

Unnamed: 0,A,B,C
One,1.193575,0.49112,0.093747
Two,0.507445,0.772171,0.35001
Three,0.563124,1.271805,1.578139
Four,0.074375,0.389039,0.525876


In [147]:
dfabs.mean(axis=0)  # one mean per column; axis=0 is also the default

A    0.584630
B    0.731034
C    0.636943
dtype: float64

In [148]:
dfabs.mean(axis='columns')  # row-wise: one mean per row, taken across its columns

One      0.592814
Two      0.543209
Three    1.137689
Four     0.329763
dtype: float64

In [149]:
dfabs.cumsum(axis='columns')  # running total from left to right within each row

Unnamed: 0,A,B,C
One,1.193575,1.684695,1.778443
Two,0.507445,1.279616,1.629626
Three,0.563124,1.834928,3.413067
Four,0.074375,0.463415,0.98929


In [150]:
dfabs.sum(axis=0)   # one total per column

A    2.338519
B    2.924135
C    2.547772
dtype: float64

##### **What if there are NaN values?**

In [151]:
# work on a copy so that dfabs keeps its values when a NaN is inserted below
dfabs2 = dfabs.copy()
dfabs2

Unnamed: 0,A,B,C
One,1.193575,0.49112,0.093747
Two,0.507445,0.772171,0.35001
Three,0.563124,1.271805,1.578139
Four,0.074375,0.389039,0.525876


In [152]:
# position [1, 1] is row 'Two', column 'B' — overwrite it with a missing value
dfabs2.iloc[1,1] = np.nan
dfabs2

Unnamed: 0,A,B,C
One,1.193575,0.49112,0.093747
Two,0.507445,,0.35001
Three,0.563124,1.271805,1.578139
Four,0.074375,0.389039,0.525876


In [153]:
dfabs2['B'].sum(skipna=True)   # skipna=True is the default: NaN values are left out of the total

2.1519644018140025

In [154]:
# with skipna=False a single NaN makes that column's mean NaN (see column B)
dfabs2.mean(skipna=False)  # but they can be included if needed

A    0.584630
B         NaN
C    0.636943
dtype: float64

In [155]:
# back to dfabs (no NaNs) for the argmax examples
dfabs

Unnamed: 0,A,B,C
One,1.193575,0.49112,0.093747
Two,0.507445,0.772171,0.35001
Three,0.563124,1.271805,1.578139
Four,0.074375,0.389039,0.525876


In [156]:
# DataFrame has no argmax method, so this call raises AttributeError. The error is the
# point of the cell, so catch it and print the message instead of letting it stop a
# top-to-bottom run of the notebook.
try:
    dfabs.argmax()
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: 'DataFrame' object has no attribute 'argmax'

In [157]:
# on a single column (a Series) argmax works: it gives the integer position of the largest value
dfabs['C'].argmax()

2

In [158]:
# a single row selected with .loc is also a Series, so argmax works here too
dfabs.loc['Three'].argmax()

2

---

#### <font color="brown">Working with NaNs</font>

#### dropna to remove rows/columns with NaNs

In [107]:
from numpy import nan as NA

# small 4x3 frame with missing values; row 2 is missing in every column
datf = DataFrame(data=[
    [1, 3.8, 2.1],
    [2, NA, NA],
    [NA, NA, NA],
    [NA, 4.8, 1.7],
])
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


**drop rows that have an NaN in any column**

In [108]:
# defaults spelled out: look at rows (axis=0) and drop a row if any of its values is NaN
datf.dropna(axis=0, how='any')

Unnamed: 0,0,1,2
0,1.0,3.8,2.1


In [109]:
# dropna() returned a new frame; datf itself still contains the NaN rows
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


**Original is not modified, use inplace=True to modify original**

In [112]:
# copy first so that datf stays available for the later examples
datf1 = datf.copy()
# inplace=True removes the rows from datf1 itself instead of returning a new frame
datf1.dropna(inplace=True)
datf1

Unnamed: 0,0,1,2
0,1.0,3.8,2.1


**To drop only those rows/columns in which every value is NaN**

In [114]:
# rows: drop a row only when all of its values are NaN
datf.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
3,,4.8,1.7


In [116]:
# to work on columns instead of rows, pass axis=1
# (no how= is given here, so any column containing a NaN is dropped — every column in datf)
datf.dropna(axis=1)

0
1
2
3


In [117]:
# columns: drop a column only when all of its values are NaN
datf.dropna(how='all',axis=1)  # none of the columns are entirely NAs

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


#### Filling missing spots (NaNs) with values

**Replace all NaNs with single value**

In [119]:
# every NaN in the frame is replaced by the same scalar
datf.fillna(value=0)

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,4.8,1.7


**Replace all NaNs in row or column using ffill/pad (forward fill)**

In [120]:
# column-wise forward fill: each NaN takes the last valid value above it in its column.
# DataFrame.ffill() is used because the `method` argument of fillna() is deprecated in
# recent pandas releases; the result is the same.
datf.ffill()

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,3.8,2.1
2,2.0,3.8,2.1
3,2.0,4.8,1.7


In [123]:
# the fill returned a new frame; datf itself still has its NaNs
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


In [124]:
# row-wise forward fill: each NaN takes the last valid value to its left in the same row;
# a row that starts with NaN has nothing to carry forward and keeps its leading NaNs.
# DataFrame.ffill() is used because the `method` argument of fillna() is deprecated in
# recent pandas releases; the result is the same.
datf.ffill(axis=1)

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,2.0,2.0
2,,,
3,,4.8,1.7


**<font color="brown">Can also fill backward using bfill/backfill - more later</font>**

**Replace all NaNs in multiple columns using dictionary**

In [128]:
# keys are column labels, values are the per-column replacement; column 0 is not listed
# and therefore keeps its NaNs
datf.fillna(value={1: 2.5, 2: 1.5})

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,2.5,1.5
2,,2.5,1.5
3,,4.8,1.7


In [131]:
# NOTE(review): the stored output shows datf already filled. This cell's execution count
# (131) is higher than that of the in-place fillna cell further down (129), so that cell
# ran first; on a top-to-bottom run datf would still contain NaNs in columns 1 and 2 here.
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,2.5,1.5
2,,2.5,1.5
3,,4.8,1.7


In [130]:
# or you can treat columns separately as Series and fillnas per column
# (the result is a Series for column 2 only; nothing is assigned back to datf here)
datf[2].fillna(1.5)

0    2.1
1    1.5
2    1.5
3    1.7
Name: 2, dtype: float64

In [129]:
# modify original: with inplace=True the fill is applied to datf itself
datf.fillna({1: 2.5, 2: 1.5},inplace=True)
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,2.5,1.5
2,,2.5,1.5
3,,4.8,1.7


In [136]:
# rebuild datf with its original NaNs, since the in-place fillna above overwrote them
# (NA is the `nan` alias imported from numpy earlier in the notebook)
datf = DataFrame([[1, 3.8, 2.1],
                  [2, NA, NA],
                  [NA, NA, NA],
                  [NA, 4.8, 1.7]])
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


**Treat column/row separately as Series and fillnas**

In [139]:
datfc = datf.copy()
# Fill column 2 and assign the result back. Calling fillna(inplace=True) on datfc[2]
# acts on an intermediate Series (chained assignment): pandas warns about that pattern
# and, with Copy-on-Write, datfc would not be updated at all.
datfc[2] = datfc[2].fillna(1.5)
datfc

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,1.5
2,,,1.5
3,,4.8,1.7


In [140]:
datfc = datf.copy()
# Fill row 2 and assign the result back. Calling fillna(inplace=True) on datfc.loc[2]
# acts on an intermediate Series (chained assignment): pandas warns about that pattern
# and, with Copy-on-Write, datfc would not be updated at all.
datfc.loc[2] = datfc.loc[2].fillna(-1)
datfc

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,-1.0,-1.0,-1.0
3,,4.8,1.7


---
#### <font color="brown">One way to deal with missing numeric data is to replace with mean</font>

In [14]:
# mean of the mpg column; this is the value used further down to fill the missing entries
mpgs['mpg'].mean()

23.514572864321615

In [16]:
# fill the missing values on a copy so that mpgs keeps the data as loaded
mpgs2 = mpgs.copy()

In [17]:
# the rows of the copy whose mpg value is missing, before filling
mpgs2.loc[mpgs2['mpg'].isna()]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
10,,4.0,133.0,115.0,3090.0,17.5,70.0,2.0,citroen ds-21 pallas
11,,8.0,350.0,165.0,4142.0,11.5,70.0,1.0,chevrolet chevelle concours (sw)
12,,8.0,351.0,153.0,4034.0,11.0,70.0,1.0,ford torino (sw)
13,,8.0,383.0,175.0,4166.0,10.5,70.0,1.0,plymouth satellite (sw)
14,,8.0,360.0,175.0,3850.0,11.0,70.0,1.0,amc rebel sst (sw)
17,,8.0,302.0,140.0,3353.0,8.0,70.0,1.0,ford mustang boss 302
39,,4.0,97.0,48.0,1978.0,20.0,71.0,2.0,volkswagen super beetle 117
367,,4.0,121.0,110.0,2800.0,15.4,81.0,2.0,saab 900s


##### **Use fillna method on relevant column (Series)**

In [18]:
# replace the missing mpg values with the column mean and assign the filled Series back
mpgs2['mpg'] = mpgs2['mpg'].fillna(mpgs2['mpg'].mean())

In [19]:
# rows 10 to 14 had missing mpg before the fill; the .loc slice here includes both end labels
mpgs2.loc[10:14]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
10,23.514573,4.0,133.0,115.0,3090.0,17.5,70.0,2.0,citroen ds-21 pallas
11,23.514573,8.0,350.0,165.0,4142.0,11.5,70.0,1.0,chevrolet chevelle concours (sw)
12,23.514573,8.0,351.0,153.0,4034.0,11.0,70.0,1.0,ford torino (sw)
13,23.514573,8.0,383.0,175.0,4166.0,10.5,70.0,1.0,plymouth satellite (sw)
14,23.514573,8.0,360.0,175.0,3850.0,11.0,70.0,1.0,amc rebel sst (sw)


In [20]:
# same approach for horsepower: fill the missing values with that column's mean
mpgs2['horsepower'] = mpgs2['horsepower'].fillna(mpgs2['horsepower'].mean())

In [21]:
# two of the rows that had missing horsepower; a list of labels selects exactly those rows
mpgs2.loc[[38,133]]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
38,25.0,4.0,98.0,105.0825,2046.0,19.0,71.0,1.0,ford pinto
133,21.0,6.0,200.0,105.0825,2875.0,17.0,74.0,1.0,ford maverick


In [22]:
# after both fills every column reports 406 non-null values
mpgs2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           406 non-null    float64
 1   cylinders     406 non-null    float64
 2   displacement  406 non-null    float64
 3   horsepower    406 non-null    float64
 4   weight        406 non-null    float64
 5   acceleration  406 non-null    float64
 6   model year    406 non-null    float64
 7   origin        406 non-null    float64
 8   car name      406 non-null    object 
dtypes: float64(8), object(1)
memory usage: 28.7+ KB


---

#### <font color="brown">General data frame manipulation</font>

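**Column returned on indexing can be a VIEW, so in-place modifications may affect the underlying dataframe column (with pandas Copy-on-Write, which is the default from pandas 3.0, the dataframe is left unchanged instead)**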

In [22]:
# 3x2 array of uniform random floats in [0, 1); no seed is set, so values vary per run
rand2d = np.random.random(size=(3, 2))
rand2d

array([[0.08869073, 0.45776295],
       [0.69099604, 0.64578043],
       [0.97474218, 0.78522925]])

In [23]:
# wrap the random array in a frame with labelled rows and columns
randdf = DataFrame(
    data=rand2d,
    columns=['first', 'second'],
    index=['one', 'two', 'three'],
)
randdf

Unnamed: 0,first,second
one,0.088691,0.457763
two,0.690996,0.64578
three,0.974742,0.785229


In [24]:
# select a single column by label; the result is a Series named 'second'
col2 = randdf['second']
col2

one      0.457763
two      0.645780
three    0.785229
Name: second, dtype: float64

In [25]:
# in-place add on the selected column; the stored output shows randdf['second'] changing too
# NOTE(review): that propagation depends on col2 sharing data with randdf. With pandas
# Copy-on-Write enabled, randdf would stay unchanged — confirm against the pandas version in use.
col2 += 0.05
print(col2,'\n')
print(randdf)

one      0.507763
two      0.695780
three    0.835229
Name: second, dtype: float64 

          first    second
one    0.088691  0.507763
two    0.690996  0.695780
three  0.974742  0.835229


In [26]:
# first undo the +0.05 so that randdf['second'] is back at its original values;
# the alternative (an explicit copy of the returned column series) is shown in the next cell
randdf['second'] -= 0.05
randdf

Unnamed: 0,first,second
one,0.088691,0.457763
two,0.690996,0.64578
three,0.974742,0.785229


In [27]:
# .copy() gives col2 its own data, so the in-place add below leaves randdf untouched
col2 = randdf['second'].copy()
col2 += 0.05
print(col2,'\n')
print(randdf)

one      0.507763
two      0.695780
three    0.835229
Name: second, dtype: float64 

          first    second
one    0.088691  0.457763
two    0.690996  0.645780
three  0.974742  0.785229


**Adding dataframes together**

In [28]:
# second frame: shares rows 'one'/'two' and columns 'first'/'second' with randdf,
# and adds a row 'four' and a column 'third'
rand2d = np.random.random(size=(3, 3))
randdf2 = DataFrame(
    data=rand2d,
    columns=['first', 'second', 'third'],
    index=['one', 'two', 'four'],
)
randdf2

Unnamed: 0,first,second,third
one,0.630694,0.997698,0.40234
two,0.623308,0.289494,0.767826
four,0.747858,0.927262,0.449514


In [29]:
# the other operand: rows one/two/three and columns first/second only
randdf

Unnamed: 0,first,second
one,0.088691,0.457763
two,0.690996,0.64578
three,0.974742,0.785229


In [32]:
# method form of randdf + randdf2: values are matched by row and column label, and a
# result cell is NaN when either frame has no value for that label pair
randdf.add(randdf2)

Unnamed: 0,first,second,third
four,,,
one,0.719385,1.455461,
three,,,
two,1.314304,0.935274,


**Note that the row labels of the result are the union of both indexes, arranged in lexicographic order**

---

#### <font color="brown">Reindexing a Series</font>

In [33]:
# integer Series with string labels that are deliberately not in sorted order
ser = Series(data=[1, 5, -2, 16], index=list('abxd'))
ser

a     1
b     5
x    -2
d    16
dtype: int64

In [35]:
# shuffle index positions, and introduce an additional index
# (the reindexed result is not assigned to anything, so `ser` is displayed unchanged)
ser.reindex(['x','a','b','c','d'])
ser

a     1
b     5
x    -2
d    16
dtype: int64

##### reindex does not change the original, so we need to assign to another series

In [36]:
# show the original, then keep the reindexed result in a new variable;
# the new label 'c' has no value in ser and comes out as NaN
print(ser,'\n')
ser1 = ser.reindex(['x','a','b','c','d'])
print(ser1)

a     1
b     5
x    -2
d    16
dtype: int64 

x    -2.0
a     1.0
b     5.0
c     NaN
d    16.0
dtype: float64


##### datatype of values changed to float because NaN was introduced, defaults to float

##### **Using fill values for missing positions**

In [38]:
# fill_value=0 is used for the new label 'c' instead of NaN, so the values stay int64
print(ser,'\n')
ser1 = ser.reindex(['x','a','b','c','d'], fill_value=0)
print(ser1)

a     1
b     5
x    -2
d    16
dtype: int64 

x    -2
a     1
b     5
c     0
d    16
dtype: int64


##### Another option, a method named ffill/pad, carries forward a value into missing positions (interpolation)

In [39]:
# quiz scores keyed by quiz label; q2, q6 and q7 are absent
scores = Series({'q1': 9, 'q3': 8, 'q4': 10, 'q5': 7, 'q8': 6})
scores

q1     9
q3     8
q4    10
q5     7
q8     6
dtype: int64

In [40]:
# using ffill: each label missing from scores takes the value of the nearest earlier label
scores2 = scores.reindex(
    index=['q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8'],
    method='ffill',
)
scores2

q1     9
q2     9
q3     8
q4    10
q5     7
q6     7
q7     7
q8     6
dtype: int64

*q1 is forward filled into q2, and q5 is forward filled into q6 and q7*

In [43]:
# using pad: gives the same result as method='ffill' in the previous cell
scores2 = scores.reindex(
    index=['q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8'],
    method='pad',
)
scores2

q1     9
q2     9
q3     8
q4    10
q5     7
q6     7
q7     7
q8     6
dtype: int64

##### Likewise, you can use bfill/backfill to carry a value backward into missing positions

In [44]:
# the original scores for reference: q2, q6 and q7 are absent
scores

q1     9
q3     8
q4    10
q5     7
q8     6
dtype: int64

In [45]:
# using bfill: each label missing from scores takes the value of the nearest later label
scores2 = scores.reindex(
    index=['q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8'],
    method='bfill',
)
scores2

q1     9
q2     8
q3     8
q4    10
q5     7
q6     6
q7     6
q8     6
dtype: int64

*q3 is back filled into q2, and q8 is back filled into q6 and q7*

---

#### <font color="brown">Reindexing a DataFrame</font>

In [46]:
# rows are one, two, four — there is no row 'three'
randdf2

Unnamed: 0,first,second,third
one,0.630694,0.997698,0.40234
two,0.623308,0.289494,0.767826
four,0.747858,0.927262,0.449514


In [47]:
# the reindexed frame is not assigned to anything, so randdf2 is displayed unchanged
randdf2.reindex(['one','two','three','four'])
randdf2

Unnamed: 0,first,second,third
one,0.630694,0.997698,0.40234
two,0.623308,0.289494,0.767826
four,0.747858,0.927262,0.449514


In [51]:
# again, doesn't change the dataframe, need to assign to another
# (the new row 'three' has no source data, so all of its values are NaN)
randdf3 = randdf2.reindex(['one','two','three','four'])
randdf3

Unnamed: 0,first,second,third
one,0.630694,0.997698,0.40234
two,0.623308,0.289494,0.767826
three,,,
four,0.747858,0.927262,0.449514


In [52]:
# but fill doesn't work: randdf2's index (one, two, four) is not in sorted order, and
# reindex with a fill method then raises ValueError. The error is the point of the cell,
# so catch it and print the message instead of letting it stop a top-to-bottom run.
try:
    randdf3 = randdf2.reindex(['one','two','three','four'],method='ffill')
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: index must be monotonic increasing or decreasing

**Error message says index must be monotonic increasing or decreasing, i.e. lexicographic or reverse lexicographic order**

In [55]:
# first put the existing rows into sorted label order, so that a fill method can be used next
randdf3 = randdf2.reindex(['four','one','two'])  # lexicographic order, monotonic increasing
randdf3

Unnamed: 0,first,second,third
four,0.747858,0.927262,0.449514
one,0.630694,0.997698,0.40234
two,0.623308,0.289494,0.767826


In [56]:
# 'three' sorts between 'one' and 'two', so ffill copies row 'one' into the new row
randdf4 = randdf3.reindex(['four','one','three','two'], method='ffill')
randdf4

Unnamed: 0,first,second,third
four,0.747858,0.927262,0.449514
one,0.630694,0.997698,0.40234
three,0.630694,0.997698,0.40234
two,0.623308,0.289494,0.767826


*row "one" is forward filled into row "three"*