# Missing Values

In [1]:
import pandas as pd

In [2]:
# Load the class grades file into a DataFrame. The pandas read_csv() function has a parameter called
# na_values that lets us specify which values should be treated as missing; it accepts a scalar, string,
# list, or dictionary. We do not pass na_values here, so only pandas' default missing-value markers
# (such as empty fields) are read as NaN.
df = pd.read_csv('class_grades.csv')
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,,63.15,48.89
3,7,,,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
7,7,72.85,86.85,60.0,,56.11
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61


In [3]:
# DataFrame.isnull() returns a frame of the same shape filled with booleans: True wherever a cell is
# missing and False otherwise. Calling it on the dataframe itself checks every cell of data at once.
mask = df.isnull()
mask.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,True,False,False
3,False,True,True,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,True,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


### dropna()

In [4]:
# dropna() removes every row that contains at least one missing value. how='any' is the default
# behaviour; it is spelled out here only to make that rule explicit.
df.dropna(how='any').head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61
10,7,80.44,90.2,75.0,91.48,39.72
12,8,97.16,103.71,72.5,93.52,63.33
13,7,91.28,83.53,81.25,99.81,92.22


### fillna()

In [8]:
# fillna() is one of the handy functions pandas provides for working with missing values, and it takes
# a number of parameters. The simplest use is to pass a single (scalar) value, which replaces all of the
# missing data with that one value. That isn't really applicable to this data set, but it is a pretty
# common use case.

# For example, to fill every missing value with 0:
fill = df.fillna(value=0)
fill.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,0.0,63.15,48.89
3,7,0.0,0.0,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
7,7,72.85,86.85,60.0,0.0,56.11
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61


In [6]:
# Next up is the method parameter. The two common fill values are ffill and bfill. ffill is for forward
# filling and it updates an na value for a particular cell with the value from the previous row. bfill is
# backward filling, which is the opposite of ffill. It fills the missing values with the next valid value.
# It's important to note that your data needs to be sorted in order for this to have the effect you might
# want. Data which comes from traditional database management systems usually has no order guarantee, just
# like this data.

### method parameters- ffill  and bfill

In [10]:
# Backward fill: each missing cell takes the next valid value that follows it in the same column.
# DataFrame.bfill() is used because fillna(method='bfill') is deprecated since pandas 2.1; the two
# calls produce the same result.
df.bfill()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.50
1,8,95.05,105.49,67.50,99.07,68.33
2,8,83.70,83.17,49.38,63.15,48.89
3,7,91.32,93.64,49.38,105.93,80.56
4,8,91.32,93.64,95.00,107.41,73.89
...,...,...,...,...,...,...
94,8,89.94,103.71,45.00,93.52,61.94
95,7,89.94,80.54,41.25,93.70,39.72
96,8,89.94,102.77,87.50,90.74,87.78
97,7,95.60,76.13,66.25,99.81,85.56


In [11]:
# Forward fill: each missing cell takes the last valid value that precedes it in the same column.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated since pandas 2.1; the two
# calls produce the same result.
df.ffill()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.50
1,8,95.05,105.49,67.50,99.07,68.33
2,8,83.70,83.17,67.50,63.15,48.89
3,7,83.70,83.17,49.38,105.93,80.56
4,8,91.32,93.64,95.00,107.41,73.89
...,...,...,...,...,...,...
94,8,87.52,103.71,45.00,93.52,61.94
95,7,87.52,80.54,41.25,93.70,39.72
96,8,89.94,102.77,87.50,90.74,87.78
97,7,95.60,76.13,66.25,99.81,85.56


In [14]:
# Make Prefix the index and order the rows by it, then preview the first 20 rows.
df = df.set_index('Prefix').sort_index()
df.head(20)

Unnamed: 0_level_0,Assignment,Tutorial,Midterm,TakeHome,Final
Prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,86.86,62.64,92.5,85.19,62.78
4,87.93,99.47,53.12,9999.0,61.11
4,92.01,102.52,38.75,86.11,49.17
5,57.14,34.09,64.38,51.48,52.5
5,88.09,63.39,74.38,93.7,50.83
6,,34.09,66.88,51.48,55.83
6,28.14,58.51,72.5,53.7,68.33
6,90.74,89.64,61.25,90.0,
6,95.05,70.24,52.5,52.41,47.78
6,95.6,61.4,64.38,99.81,42.78


In [19]:
# Move Prefix back into the columns, rebuild the index from the (Prefix, Assignment) pair, and sort
# the rows by that two-level index before previewing the first 20 rows.
df = (
    df.reset_index()
      .set_index(['Prefix', 'Assignment'])
      .sort_index()
)
df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Tutorial,Midterm,TakeHome,Final
Prefix,Assignment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,86.86,62.64,92.5,85.19,62.78
4,87.93,99.47,53.12,9999.0,61.11
4,92.01,102.52,38.75,86.11,49.17
5,57.14,34.09,64.38,51.48,52.5
5,88.09,63.39,74.38,93.7,50.83
6,28.14,58.51,72.5,53.7,68.33
6,90.74,89.64,61.25,90.0,
6,91.28,95.24,82.5,97.59,92.78
6,95.05,70.24,52.5,52.41,47.78
6,95.6,61.4,64.38,99.81,42.78


In [20]:
# Small demo frame for the replace() examples: two integer columns (A, B) and one string column (C).
df = pd.DataFrame(data=dict(A=[1, 1, 2, 3, 4],
                            B=[3, 6, 3, 8, 9],
                            C=list('abcde')))
df

Unnamed: 0,A,B,C
0,1,3,a
1,1,6,b
2,2,3,c
3,3,8,d
4,4,9,e


### replace()

In [21]:
# Value-to-value replacement: every 1 in the dataframe becomes 100.
df.replace(to_replace=1, value=100)

Unnamed: 0,A,B,C
0,100,3,a
1,100,6,b
2,2,3,c
3,3,8,d
4,4,9,e


In [22]:
# How about changing two values? Let's try the list approach. The two lists are matched by position,
# so here 1's become 100 and 3's become 300.
df.replace(to_replace=[1, 3], value=[100, 300])

Unnamed: 0,A,B,C
0,100,300,a
1,100,6,b
2,2,300,c
3,300,8,d
4,4,9,e


In [23]:
# Here's my solution, first matching any number of characters then ending in .html
# df.replace(to_replace=r".*\.html$", value="webpage", regex=True)

In [24]:
# # That works, but it's kind of gross. And it's slow, since we had to make a full copy of a column then go
# # through and update strings. There are a few other ways we can deal with this. Let me show you the most 
# # general one first, and that's called the apply() function. Let's drop the column we made first
# del(df["First"])

# # The apply() function on a dataframe will take some arbitrary function you have written and apply it to
# # either a Series (a single column) or DataFrame across all rows or columns. Let's write a function which
# # just splits a string into two pieces using a single row of data
# def splitname(row):
#     # The row is a single Series object which is a single row indexed by column values
#     # Let's extract the firstname and create a new entry in the series
#     row['First']=row['President'].split(" ")[0]
#     # Let's do the same with the last word in the string
#     row['Last']=row['President'].split(" ")[-1]
#     # Now we just return the row and the pandas .apply() will take care of merging them back into a DataFrame
#     return row

# # Now if we apply this to the dataframe indicating we want to apply it across columns
# df=df.apply(splitname, axis='columns')
# df.head()

In [26]:
# # Pretty questionable as to whether that is less gross, but it achieves the result and I find that I use the
# # apply() function regularly in my work. The pandas series has a couple of other nice convenience functions
# # though, and the next I would like to touch on is called .extract(). Let's drop our firstname and lastname.
# del(df['First'])
# del(df['Last'])

# # Extract takes a regular expression as input and specifically requires you to set capture groups that
# # correspond to the output columns you are interested in. 
# # if you were going to write a regular expression that returned groups and just had the
# # firstname and lastname in it, what would that look like?

# # Here's my solution, where we match three groups but only return two, the first and the last name
# pattern=r"(^[\w]*)(?:.* )([\w]*$)"

# # Now the extract function is built into the str attribute of the Series object, so we can call it
# # using Series.str.extract(pattern)
# df["President"].str.extract(pattern).head()