In [1]:
## Reading csv + Basic Attributes

import pandas as pd

#Load the grades file into a pandas dataframe.
#header=0 tells read_csv that the column names are on the first row of the file.
df = pd.read_csv(filepath_or_buffer="./data/Grades.csv", header=0)

#Called without an argument, head() shows the first five rows
df.head()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade
0,Jake,32.0,1,19.5,20.0,1,10.0,33.0,A
1,Joe,32.0,1,20.0,16.0,1,14.0,32.0,A
2,Susan,30.0,1,19.0,19.0,1,10.5,33.0,A-
3,Sol,31.0,1,22.0,13.0,1,13.0,34.0,A
4,Chris,30.0,1,19.0,17.0,1,12.5,33.5,A


We can convert the type of a column using the **astype()** method.

In [18]:
#Show the first rows of the Participation 1 column converted to float
df["Participation1"].head().astype("float64")

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: Participation1, dtype: float64

In [24]:
#dtypes lists the data type of each column
df.dtypes

Name               object
Previous_Part     float64
Participation1      int64
Mini_Exam1        float64
Mini_Exam2        float64
Participation2      int64
Mini_Exam3        float64
Final             float64
Grade              object
dtype: object

In [25]:
#Overwrite the Participation 1 column with its float version
df["Participation1"] = df["Participation1"].astype(float)
#df itself has changed: Participation1 is now float64 while Participation2 is still int64
df.dtypes

Name               object
Previous_Part     float64
Participation1    float64
Mini_Exam1        float64
Mini_Exam2        float64
Participation2      int64
Mini_Exam3        float64
Final             float64
Grade              object
dtype: object

Note that the two participation columns have different types because of our conversion.

The **unique()** method returns an array (think of it as a list) of the unique values in the column

In [21]:
#Collect the distinct values that appear in the Grade column
list_grades = df.loc[:, "Grade"].unique()

list_grades

array(['A', 'A-', 'B', 'B+', 'A+', 'B-', 'C+'], dtype=object)

In [22]:
#Slicing works as it does for a list: this picks positions 1 and 2
list_grades[slice(1, 3)]

array(['A-', 'B'], dtype=object)

The **value_counts()** method returns the counts of each unique value in the column as a series

In [23]:
#Count how many times each grade occurs, most frequent first
grade_breakdown = df["Grade"].value_counts(sort=True, ascending=False)
grade_breakdown

A     8
B     3
C+    2
B+    2
A-    2
A+    1
B-    1
Name: Grade, dtype: int64

In [26]:
#describe() returns summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary = df.describe()
summary

Unnamed: 0,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0
mean,29.5,0.947368,18.026316,17.052632,1.0,10.526316,31.815789
std,2.848001,0.229416,3.360199,2.753254,0.0,2.796196,4.913729
min,22.0,0.0,7.0,12.0,1.0,5.5,21.0
25%,28.5,1.0,17.5,15.5,1.0,9.0,31.0
50%,30.0,1.0,19.0,18.0,1.0,10.0,32.0
75%,31.5,1.0,20.0,19.0,1.0,12.75,34.0
max,33.0,1.0,22.0,21.0,1.0,17.0,42.0


Next, we look at how to create new columns

In [27]:
#Add a new column computed from an existing one: the Final score divided by 35
df["Final_Perc"] = df["Final"].div(35)
df.head()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Final_Perc
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,0.942857
1,Joe,32.0,1.0,20.0,16.0,1,14.0,32.0,A,0.914286
2,Susan,30.0,1.0,19.0,19.0,1,10.5,33.0,A-,0.942857
3,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,0.971429
4,Chris,30.0,1.0,19.0,17.0,1,12.5,33.5,A,0.957143


In [29]:
#Remove the column again with the drop method.
#axis=1 means the labels are column names; inplace=True modifies df directly.
df.drop(labels=["Final_Perc"], axis=1, inplace=True)
df.head()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A
1,Joe,32.0,1.0,20.0,16.0,1,14.0,32.0,A
2,Susan,30.0,1.0,19.0,19.0,1,10.5,33.0,A-
3,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A
4,Chris,30.0,1.0,19.0,17.0,1,12.5,33.5,A


The inplace argument works as follows:

 - inplace = True : The dataframe itself will have the given column(s) deleted.
 - inplace = False: Will return a dataframe with the column(s) deleted.
 
 The axis argument works as follows:
 
 - axis = 1 : delete columns given
 - axis = 0 : delete rows given.
 
 Let's look at an example where we delete rows

In [30]:
#Build a new dataframe without the rows whose index labels are 0 and 2
drop_rows = df.drop(labels=[0, 2], axis=0, inplace=False)
drop_rows.head()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade
1,Joe,32.0,1.0,20.0,16.0,1,14.0,32.0,A
3,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A
4,Chris,30.0,1.0,19.0,17.0,1,12.5,33.5,A
5,Tarik,31.0,1.0,19.0,19.0,1,8.0,24.0,B
6,Malik,31.5,1.0,20.0,21.0,1,9.0,36.0,A


Let's have a look at df

In [31]:
#The first rows of df, for comparison with drop_rows
df.head()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A
1,Joe,32.0,1.0,20.0,16.0,1,14.0,32.0,A
2,Susan,30.0,1.0,19.0,19.0,1,10.5,33.0,A-
3,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A
4,Chris,30.0,1.0,19.0,17.0,1,12.5,33.5,A


Note that df was not changed! This is what happens when you set inplace = False.

Let's see how we can sort a data frame.  The inplace argument has the same effect as in the drop method.

In [32]:
#Sort the data frame by the Final column, highest score first.
#With inplace=False the sorted dataframe is returned and df is not changed.
df.sort_values(by="Final", ascending=False, inplace=False).head()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade
8,Ronaldo,33.0,1.0,20.0,20.0,1,17.0,42.0,A+
9,Messi,30.5,1.0,17.0,18.0,1,9.0,37.0,A-
6,Malik,31.5,1.0,20.0,21.0,1,9.0,36.0,A
17,Vik,31.5,1.0,15.0,19.0,1,13.0,35.0,A
3,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A


Now let's sort by multiple columns. Specifying more than one column essentially specifies a tie-break: rows that are equal on the first column are ordered by the next one.



Now let's dive into the datetime column type with Parking data set, where each row corresponds to a different parking ticket given in NYC.

In [3]:
#Sort by Mini Exam 1, breaking ties with Previous Part and then with Final

result_sorted = df.sort_values(
    by=["Mini_Exam1", "Previous_Part", "Final"],
    ascending=True,
    inplace=False,
)
result_sorted.head()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade
11,Jimmy,27.5,0,7.0,13.0,1,5.5,31.0,B-
14,Chrinstine,29.0,1,13.0,15.5,1,9.0,31.0,B
17,Vik,31.5,1,15.0,19.0,1,13.0,35.0,A
15,Josh,23.5,1,17.0,12.0,1,8.5,23.0,C+
9,Messi,30.5,1,17.0,18.0,1,9.0,37.0,A-


Next, we will cover a collection of important miscellaneous concepts that include:

- Changing columns names
- Combining dataframes
- Understanding the index
- Missing Data
- Reading from Excel

In [6]:
import pandas as pd

#Load the grades file again; header=0 means the first row holds the column names
df = pd.read_csv(filepath_or_buffer="./data/Grades.csv", header=0)

df.head()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade
0,Jake,32.0,1,19.5,20.0,1,10.0,33.0,A
1,Joe,32.0,1,20.0,16.0,1,14.0,32.0,A
2,Susan,30.0,1,19.0,19.0,1,10.5,33.0,A-
3,Sol,31.0,1,22.0,13.0,1,13.0,34.0,A
4,Chris,30.0,1,19.0,17.0,1,12.5,33.5,A


We can change column names through the rename method

In [3]:
#Rename the two participation columns; inplace=True modifies df directly
df.rename(
    columns={
        "Participation1": "Participation_1",
        "Participation2": "Participation_2",
    },
    inplace=True,
)

df.head()

Unnamed: 0,Name,Previous_Part,Participation_1,Mini_Exam1,Mini_Exam2,Participation_2,Mini_Exam3,Final,Grade
0,Jake,32.0,1,19.5,20.0,1,10.0,33.0,A
1,Joe,32.0,1,20.0,16.0,1,14.0,32.0,A
2,Susan,30.0,1,19.0,19.0,1,10.5,33.0,A-
3,Sol,31.0,1,22.0,13.0,1,13.0,34.0,A
4,Chris,30.0,1,19.0,17.0,1,12.5,33.5,A


The format for the columns input is {"old_column_name":"new_column_name"}. Note that the rename method can also relabel the index: pass the mapping through the index argument instead of columns.

When setting the index, make sure you choose a column that will uniquely identify each row.

We can change the index back to row numbers using the reset_index() method.

In [7]:
#Reset the index to row numbers; drop=False keeps the old index as a new column
df.reset_index(inplace=True, drop=False)
df.head()

Unnamed: 0,index,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade
0,0,Jake,32.0,1,19.5,20.0,1,10.0,33.0,A
1,1,Joe,32.0,1,20.0,16.0,1,14.0,32.0,A
2,2,Susan,30.0,1,19.0,19.0,1,10.5,33.0,A-
3,3,Sol,31.0,1,22.0,13.0,1,13.0,34.0,A
4,4,Chris,30.0,1,19.0,17.0,1,12.5,33.5,A


Now the index is back to the default row numbers. Setting drop = False (the default) adds the old index as a new column in the dataframe instead of just deleting it.


## Handling Missing Data

Missing data is common in most data analysis applications.  You have a number of options for filtering out missing data.  You can do it by hand, or you can use the *dropna* method.

With dataframe objects, things get a little more complex.  You may want to drop rows or columns which are all NA or just those containing any NAs. *dropna* by default drops any row containing a missing value.

In [8]:
#This file has two pieces of missing data: an empty cell and a "not available" entry
df_missing = pd.read_csv(filepath_or_buffer="./data/Missing_Data.csv")
df_missing

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A,-1
1,Joe,,1,20.0,16,1,14.0,32.0,A,23
2,Sol,31.0,1,22.0,13,1,13.0,34.0,A,34
3,Chris,30.0,-1,19.0,not available,1,12.5,33.5,A,72


The isnull() method returns a series or dataframe of booleans corresponding to whether the particular entries are null or not.

In [9]:
#isnull method for a data frame: only the empty Previous_Part entry is flagged,
#the "not available" text in Mini_Exam2 is not
df_missing.isnull()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False


We can make sure they are all read in as NA values using the na_values input when we read in the file

In [10]:
#Every string listed in na_values is read in as NaN, so "not available" becomes NaN here
df_missing_NA = pd.read_csv(
    "./data/Missing_Data.csv",
    na_values=["NaN", "not available"],
)
df_missing_NA

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,,1,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,-1,19.0,,1,12.5,33.5,A,72


In [18]:
#Let's rerun the isnull() method now that "not available" is read in as NaN
df_missing_NA.isnull()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False


In [None]:
#Now on the entire dataframe: returns a dataframe of booleans, one per entry
df_missing_NA.isnull()

Let's say we now realize that the -1 in the Participation1 column is an NA value.  If we add -1 to the na_values input, we will also replace the -1 in the Temp column. Luckily, we can give a dictionary to the na_values input which specifies the NA values for each column.

In [13]:
#A dictionary gives each column its own NA markers, so the -1 in Temp is left alone
df_missing_NA2 = pd.read_csv(
    "./data/Missing_Data.csv",
    na_values={
        "Previous_Part": "NA",
        "Participation1": -1,
        "Mini_Exam2": "not available",
    },
)
df_missing_NA2

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,,1.0,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,,19.0,,1,12.5,33.5,A,72


Now let's see how we can change/replace these NA values.

In [14]:
#Drop every row that contains at least one NA
df_missing_NA2.dropna(axis=0, how="any")

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34


In [15]:
#With how="all" a row is dropped only if every entry in it is NA (no such row here)
df_missing_NA2.dropna(axis=0, how="all")

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,,1.0,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,,19.0,,1,12.5,33.5,A,72


In [16]:
#To drop columns instead of rows, pass axis=1; no column is entirely NA, so nothing changes
df_missing_NA2.dropna(how="all", axis="columns")

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,,1.0,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,,19.0,,1,12.5,33.5,A,72


Rather than filtering out missing data, you may want to fill in the "holes" in any number of ways. For most purposes, the *fillna* method with a constant replaces missing values with that value.

In [17]:
#Replace every missing value with the constant 0
df_missing_NA2.fillna(value=0)

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,0.0,1.0,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,0.0,19.0,0.0,1,12.5,33.5,A,72


In [24]:
#A dict passed to fillna sets the replacement value per column;
#columns not listed in the dict keep their missing values
df_missing_NA2.fillna(value={"Previous_Part": 5, "Mini_Exam2": 0.5})

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,5.0,1.0,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,,19.0,0.5,1,12.5,33.5,A,72


With *fillna* you can do lots of things with a little creativity.  For example, you might pass the mean or median value of a series.


In [18]:
#Replace each missing value with the mean of its column.
#numeric_only=True restricts mean() to the numeric columns: Name and Grade hold
#strings, and recent pandas versions raise a TypeError when asked to average them.
df_missing_NA2.fillna(df_missing_NA2.mean(numeric_only=True))

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,31.0,1.0,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,1.0,19.0,16.333333,1,12.5,33.5,A,72
