In [131]:
# always include aliased modules
# these are the most common ones that we'll use in our Big Data class
import math as m
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [132]:
# Load the raw CSV with pandas' default settings and preview the first rows,
# to see how the multi-line header in the file gets parsed.
df_simple = pd.read_csv(filepath_or_buffer="messy_data.csv")
df_simple.head(5)


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,favorite,Unnamed: 4
0,,,,fruit,
1,,first,last,or,favorite
2,title,name,name,vegetable,dessert
3,Ms,Tracey,Sconyers,blueberry,pie
4,Mr,John,Smith,carrot,cake


In [133]:
# Replacement column names: one clean label per column, with no blanks.
new_labels = [
    'title',
    'first_name',
    'last_name',
    'fav_fruit_or_veg',
    'fav_dessert',
]

# Re-read the same file, this time telling pandas how to treat the header lines.
df = pd.read_csv(
    "messy_data.csv",   # name of the csv file
    names=new_labels,   # use new_labels as the column names
    header=3,           # 0-based row 3 is the last header row, i.e. file rows 0..3 are all header lines
    index_col=False,    # do not use the first column as the index; build an integer index instead
)
df


Unnamed: 0,title,first_name,last_name,fav_fruit_or_veg,fav_dessert
0,Ms,Tracey,Sconyers,blueberry,pie
1,Mr,John,Smith,carrot,cake
2,,Uprep,teachers,banana,bread pudding
3,Mr,Ken,Jaffe,apple,cookie
4,Ms,Sarah,Peterson,carrot,cookie


In [134]:
# Try out the drop method.
#
# This returns a copy of df without the single row at position 2 (the third row).
# df itself is not modified, and the remaining rows keep their original index
# labels -- drop does not re-index the dataframe.
df.drop(index=df.index[2])

Unnamed: 0,title,first_name,last_name,fav_fruit_or_veg,fav_dessert
0,Ms,Tracey,Sconyers,blueberry,pie
1,Mr,John,Smith,carrot,cake
3,Mr,Ken,Jaffe,apple,cookie
4,Ms,Sarah,Peterson,carrot,cookie


In [135]:
# df is unchanged by the drop in the previous cell: the third row (index 2) is still in the data frame
df.head(5)

Unnamed: 0,title,first_name,last_name,fav_fruit_or_veg,fav_dessert
0,Ms,Tracey,Sconyers,blueberry,pie
1,Mr,John,Smith,carrot,cake
2,,Uprep,teachers,banana,bread pudding
3,Mr,Ken,Jaffe,apple,cookie
4,Ms,Sarah,Peterson,carrot,cookie


In [136]:
# Drop by value instead of by position: select the rows whose "last_name" is "Jaffe",
# take their index labels, and hand those labels to drop. The result is a copy.
df.drop(index=df.loc[df["last_name"] == "Jaffe"].index)

Unnamed: 0,title,first_name,last_name,fav_fruit_or_veg,fav_dessert
0,Ms,Tracey,Sconyers,blueberry,pie
1,Mr,John,Smith,carrot,cake
2,,Uprep,teachers,banana,bread pudding
4,Ms,Sarah,Peterson,carrot,cookie


In [137]:
# again df is unchanged: the "Jaffe" row is still in the data frame
df.head(5)

Unnamed: 0,title,first_name,last_name,fav_fruit_or_veg,fav_dessert
0,Ms,Tracey,Sconyers,blueberry,pie
1,Mr,John,Smith,carrot,cake
2,,Uprep,teachers,banana,bread pudding
3,Mr,Ken,Jaffe,apple,cookie
4,Ms,Sarah,Peterson,carrot,cookie


In [138]:
# Same value-based drop as before, but with inplace=True the row whose "last_name"
# is "Jaffe" is removed from df itself rather than from a returned copy.
df.drop(index=df.loc[df["last_name"] == "Jaffe"].index, inplace=True)

In [139]:
# Now the row is gone from df itself. The remaining rows keep their original index
# labels, so the index jumps from 2 to 4 -- it is not renumbered after the drop.
df

Unnamed: 0,title,first_name,last_name,fav_fruit_or_veg,fav_dessert
0,Ms,Tracey,Sconyers,blueberry,pie
1,Mr,John,Smith,carrot,cake
2,,Uprep,teachers,banana,bread pudding
4,Ms,Sarah,Peterson,carrot,cookie


In [140]:
# Finally, read the csv file into a dataframe with one particular column as the index.
# Ideally the index column holds unique values and has no missing data.
df_best = pd.read_csv(
    "messy_data.csv",        # name of the csv file
    names=new_labels,        # use new_labels as the column names
    header=3,                # 0-based row 3 is the last header row, i.e. file rows 0..3 are all header lines
    index_col='last_name',   # use the last_name column as the index
)
df_best



Unnamed: 0_level_0,title,first_name,fav_fruit_or_veg,fav_dessert
last_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sconyers,Ms,Tracey,blueberry,pie
Smith,Mr,John,carrot,cake
teachers,,Uprep,banana,bread pudding
Jaffe,Mr,Ken,apple,cookie
Peterson,Ms,Sarah,carrot,cookie


In [141]:
# Permanently delete an unwanted row using its last_name index label.
# Passing the label 'teachers' directly works only because last_name is the index of df_best.
# inplace=True removes the row from df_best itself instead of returning a copy.
# errors="ignore" keeps this cell re-runnable: once the row is gone, running the cell
# again would otherwise raise a KeyError because the label no longer exists.
# (Trade-off: a misspelled label is skipped silently as well, so check the output below.)
df_best.drop(index=['teachers'], inplace=True, errors="ignore")
df_best

Unnamed: 0_level_0,title,first_name,fav_fruit_or_veg,fav_dessert
last_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sconyers,Ms,Tracey,blueberry,pie
Smith,Mr,John,carrot,cake
Jaffe,Mr,Ken,apple,cookie
Peterson,Ms,Sarah,carrot,cookie
