In [1]:
import pandas as pd

In [2]:
# read the tab-separated orders file into a DataFrame
orders = pd.read_csv('orders.tsv', sep='\t')

In [3]:
# preview the first rows of the orders data to check how the file was parsed
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [4]:
# How do I select a pandas Series from a DataFrame?

# Dot notation is a shorthand with limitations:
# - it doesn't work if there are spaces in the Series name
# - it doesn't work if the Series has the same name as a DataFrame
#   method or attribute (like 'head' or 'shape')
orders.item_name

# Bracket notation will always work, so it is the safe default
orders['item_name']

0                Chips and Fresh Tomato Salsa
1                                        Izze
2                            Nantucket Nectar
3       Chips and Tomatillo-Green Chili Salsa
4                                Chicken Bowl
                        ...                  
4617                            Steak Burrito
4618                            Steak Burrito
4619                       Chicken Salad Bowl
4620                       Chicken Salad Bowl
4621                       Chicken Salad Bowl
Name: item_name, Length: 4622, dtype: object

In [5]:
# Why do some pandas commands end with parentheses?
# Methods end with parentheses (they are called), while attributes don't (they are just looked up)

In [6]:
# read the movie ratings dataset into a DataFrame (fetched from a URL)
movies = pd.read_csv('http://bit.ly/imdbratings')

In [7]:
# NOTE: a notebook cell only renders the value of its last expression, so of the
# four examples below only 'movies.dtypes' is displayed

# example method: show the first 5 rows
movies.head()

# example method: calculate summary statistics
movies.describe()

# example attribute: number of rows and columns
movies.shape

# example attribute: data type of each column
movies.dtypes

star_rating       float64
title              object
content_rating     object
genre              object
duration            int64
actors_list        object
dtype: object

In [8]:
# summary statistics; the result below covers 'star_rating' and 'duration'
movies.describe()

Unnamed: 0,star_rating,duration
count,979.0,979.0
mean,7.889785,120.979571
std,0.336069,26.21801
min,7.4,64.0
25%,7.6,102.0
50%,7.8,117.0
75%,8.1,134.0
max,9.3,242.0


In [9]:
# How do I rename columns in a pandas DataFrame?
# read a dataset of UFO reports into a DataFrame (fetched from a URL)
ufo = pd.read_csv('http://bit.ly/uforeports')


In [10]:
# examine the column names (two of them contain a space)
ufo.columns

Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')

In [11]:
# rename two of the columns with the 'rename' method: the mapping goes from
# old name to new name, and columns that are not listed keep their names
ufo = ufo.rename(
    columns={
        'Colors Reported': 'Colors_Reported',
        'Shape Reported': 'Shape_Reported',
    }
)
ufo.columns

Index(['City', 'Colors_Reported', 'Shape_Reported', 'State', 'Time'], dtype='object')

In [12]:
# replace all of the column names by overwriting the 'columns' attribute
# (the list supplies one new name per existing column, in order)
ufo_cols = ['city', 'colors reported', 'shape reported', 'state', 'time']
ufo.columns = ufo_cols
ufo.columns

Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')

In [13]:
# How do I remove columns from a pandas DataFrame?

In [None]:
# read a fresh copy of the UFO reports into a DataFrame, restoring the
# original column names that the renaming cells above replaced
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

In [None]:
# remove a single column (axis=1 refers to columns)
# 'drop' returns a new DataFrame and leaves the original untouched, so the
# result is assigned back to 'ufo' to make the removal persist
ufo = ufo.drop('Colors Reported', axis=1)


In [None]:
# inspect the first rows of 'ufo' to see which columns it currently holds
ufo.head()

In [None]:
# remove multiple rows at once by index label (axis=0 refers to rows);
# the frame without rows 0 and 1 is bound back to the same name
ufo = ufo.drop([0, 1], axis=0)
ufo.head()

In [None]:
# How do I sort a pandas DataFrame or a Series?

# read a dataset of top-rated IMDb movies into a DataFrame (fetched from a URL)
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()

In [None]:
# sort the 'title' Series in ascending order (the default); returns a Series
movies['title'].sort_values().head()

In [None]:
# same Series, sorted in descending order instead
movies['title'].sort_values(ascending=False).head()

In [None]:
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort_values(by='title').head()

In [None]:
# same DataFrame sort, in descending order instead
movies.sort_values(by='title', ascending=False).head()

In [None]:
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort_values(by=['content_rating', 'duration']).head()

In [None]:
# How do I filter rows of a pandas DataFrame by column value?
# build a list with one entry per DataFrame row:
# True if the row satisfies the condition, False otherwise
booleans = []
for length in movies.duration:
    booleans.append(True if length >= 200 else False)

In [None]:
# confirm that the list has the same length as the DataFrame:
# the printed number should equal the first element of 'movies.shape'
print(len(booleans))
movies.shape

In [None]:
# wrap the list of booleans in a Series
is_long = pd.Series(data=booleans)
is_long.head(5)

In [None]:
# use bracket notation with the boolean Series to tell the DataFrame which rows to display
# (only the rows whose entry in 'is_long' is True are kept)
movies[is_long]

In [None]:
# simplify the steps above: no need to write a for loop to create 'is_long' since pandas
# will broadcast the comparison across the whole 'duration' Series
is_long = movies.duration >= 200
movies[is_long]

# or equivalently, write it in one line (no need to create the 'is_long' object)
# (this last expression is the one the cell displays)
movies[movies.duration >= 200]

In [None]:
# or equivalently, use the 'loc' indexer with the same boolean condition
movies.loc[movies.duration >= 200]

In [None]:
# use the 'loc' indexer to filter the rows and select only the
# 'content_rating' column in the same step
movies.loc[movies.duration >= 200, 'content_rating']

In [None]:
# How do I apply multiple filter criteria to a pandas DataFrame?
# start from a single criterion: movies with a 'duration' of at least 200 minutes
movies[movies['duration'] >= 200]

In [None]:
# CORRECT: combine the conditions with the '&' operator so that both are required;
# each condition is wrapped in its own parentheses
movies[(movies['duration'] >= 200) & (movies['genre'] == 'Drama')]

In [None]:
# use the '|' operator to specify that a row can match any of the three criteria
movies[(movies.genre == 'Crime') | (movies.genre == 'Drama') | (movies.genre == 'Action')].head(10)

# or equivalently, use the 'isin' method with the list of accepted values
# (this last expression is the one the cell displays)
movies[movies.genre.isin(['Crime', 'Drama', 'Action'])].head(10)

In [None]:
# When reading from a file, how do I read in only a subset of the columns?
# first read the full dataset of UFO reports into a DataFrame, and check which columns exist
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.columns


In [None]:
# specify which columns to include by name: 'usecols' restricts the read to 'City' and 'State'
ufo = pd.read_csv('http://bit.ly/uforeports', usecols=['City', 'State'])
ufo.columns

In [None]:
# When reading from a file, how do I read in only a subset of the rows?
# specify how many rows to read: 'nrows=5' stops after the first five data rows
ufo = pd.read_csv('http://bit.ly/uforeports', nrows=5)
ufo

In [None]:
# How do I iterate through a Series?
# a Series can be looped over directly, yielding one value at a time
for c in ufo['City']:
    print(c)

In [None]:
# How do I iterate through a DataFrame?
# various methods are available to iterate through a DataFrame;
# 'iterrows' yields (index label, row) pairs, and the row's values are read by column name
for index, row in ufo.iterrows():
    print(index, row.City, row.State)
    

In [None]:
# How do I use the "axis" parameter in pandas?
# When referring to rows or columns with the axis parameter:
# axis 0 refers to rows
# axis 1 refers to columns

# read a dataset of alcohol consumption into a DataFrame (fetched from a URL)
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()


In [None]:
# drop a column (temporarily): the result is only displayed, not assigned,
# so 'drinks' itself keeps the 'continent' column
drinks.drop('continent', axis=1).head()

In [None]:
# look at 'drinks' again after the temporary drop above
drinks.head()

In [None]:
# drop a row (temporarily): removes the row with index label 2 from the displayed result only
drinks.drop(2, axis=0).head()

In [None]:
# calculate the mean of each numeric column
# When performing a mathematical operation with the axis parameter:

# axis 0 means the operation should "move down" the row axis
# axis 1 means the operation should "move across" the column axis

# 'numeric_only=True' restricts the calculation to the numeric columns; without
# it, recent pandas versions raise a TypeError because 'drinks' also holds the
# text columns 'country' and 'continent'
drinks.mean(numeric_only=True)

# or equivalently, specify the axis explicitly
drinks.mean(axis=0, numeric_only=True)

In [None]:
# calculate the mean of each row
# ('numeric_only=True' skips the text columns, which recent pandas versions
# would otherwise refuse to average)
drinks.mean(axis=1, numeric_only=True).head()

In [None]:
# 'index' is an alias for axis 0
# ('numeric_only=True' skips the text columns, which recent pandas versions
# would otherwise refuse to average)
drinks.mean(axis='index', numeric_only=True)

In [None]:
# 'columns' is an alias for axis 1
# ('numeric_only=True' skips the text columns, which recent pandas versions
# would otherwise refuse to average)
drinks.mean(axis='columns', numeric_only=True).head()

In [None]:
# How do I use string methods in pandas?

# look at the orders data again; 'item_name' is the text column used in the next cells
orders.head()

In [None]:
# normal way to access string methods in Python: call the method on the string itself
'hello'.upper()

In [None]:
# on a pandas Series, string methods live under the 'str' accessor
orders['item_name'].str.upper().head()

In [None]:
# 'str.contains' checks every value for a substring and returns a boolean Series
orders['item_name'].str.contains('Chicken').head()

In [None]:
# When should I use a "groupby" in pandas?
# baseline: the mean beer servings across the entire dataset
drinks['beer_servings'].mean()

In [None]:
# mean beer servings for the African countries only:
# filter the rows and pick the column in a single 'loc' selection
drinks.loc[drinks['continent'] == 'Africa', 'beer_servings'].mean()

In [None]:
# mean beer servings for each continent: group the rows, then aggregate one column
drinks.groupby('continent')['beer_servings'].mean()

In [None]:
# any other aggregation (here 'max') plugs into the same groupby pattern
drinks.groupby('continent')['beer_servings'].max()

In [None]:
# 'agg' applies several aggregation functions at once, one result column per function
drinks.groupby('continent')['beer_servings'].agg(['count', 'mean', 'min', 'max'])

In [None]:
# How do I explore a pandas Series?
# examine the data type of each Series (i.e. each column of 'movies')
movies.dtypes

In [None]:
# Exploring a non-numeric Series:
# 'describe' counts the non-null values and the unique values, and reports
# the most common value together with its frequency
movies['genre'].describe()

In [None]:
# count how many times each value in the Series occurs
# The result is sorted in descending order, so its first element is the most
# frequently-occurring value. NA values are excluded by default.
movies['genre'].value_counts()

In [None]:
# 'normalize=True' reports each value's share of the total instead of raw counts
movies['genre'].value_counts(normalize=True)

In [None]:
# list the distinct values that occur in the Series
movies['genre'].unique()

In [None]:
# Exploring a numeric Series:

# 'describe' calculates various summary statistics
movies['duration'].describe()

In [None]:
# allow plots to appear in the notebook (IPython magic; run it before the plotting cells below)
%matplotlib inline

In [None]:
# histogram of the 'duration' Series (shows the distribution of a numerical variable)
movies['duration'].plot.hist()

In [None]:
# bar plot of how often each genre occurs (the 'value_counts' of the 'genre' Series)
movies['genre'].value_counts().plot.bar()

In [None]:
# How do I handle missing values in pandas?
# What does "NaN" mean?

# "NaN" is not a string, rather it's a special value: numpy.nan.
# It stands for "Not a Number" and indicates a missing value.
# read_csv detects missing values (by default) when reading the file, and replaces them with
# this special value.

# read a dataset of UFO reports into a DataFrame and look at its last rows
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.tail()


In [None]:
# 'isnull' returns a DataFrame of booleans (True if missing, False if not missing);
# 'tail' shows it for the same last rows as the previous cell
ufo.isnull().tail()

In [None]:
# count the number of missing values in each Series
# (summing the booleans from 'isnull' counts the True entries per column)
ufo.isnull().sum()

In [None]:
# use the 'isnull' Series method to filter the DataFrame rows:
# keep only the rows in which 'City' is missing
ufo[ufo.City.isnull()]

In [None]:
# if 'any' values are missing in a row, then drop that row
# ('shape' reports the size of the result; the result itself is not stored)
ufo.dropna(how='any').shape

In [None]:
# 'inplace' parameter for 'dropna' is False by default, thus rows were only dropped temporarily:
# 'ufo' itself still has its original shape
ufo.shape

In [None]:
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
# (again only the shape of the temporary result is shown)
ufo.dropna(how='all').shape

In [None]:
# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
# ('subset' lists the columns that are checked for missing values)
ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape

In [None]:
# 'value_counts' does not include missing values by default
# (bracket notation is required here because the column name contains a space)
ufo['Shape Reported'].value_counts().head()

In [None]:
# fill in missing values with a specified value
# the filled column is assigned back to the DataFrame: calling 'fillna' with
# inplace=True on a selected column is not guaranteed to update 'ufo' itself
ufo['Shape Reported'] = ufo['Shape Reported'].fillna(value='VARIOUS')

In [None]:
# confirm that the missing values were filled in:
# 'VARIOUS' should now appear among the most common values
ufo['Shape Reported'].value_counts().head(6)

In [None]:
# What do I need to know about the pandas index?

In [None]:
# read a dataset of alcohol consumption into a DataFrame (fetched from a URL)
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()

In [None]:
# every DataFrame has an index (sometimes called the "row labels");
# it is exposed through the 'index' attribute
drinks.index

In [None]:
# column names are also stored in a special "index" object,
# exposed through the 'columns' attribute
drinks.columns

In [None]:
# neither the index nor the columns are included in the shape:
# 'shape' only counts the rows and columns of the data itself
drinks.shape

In [None]:
# index and columns both default to integers if you don't define them:
# this pipe-separated file is read without a header row
pd.read_csv('http://bit.ly/movieusers', sep='|', header=None).head()

In [None]:
# identification: index remains with each row when filtering the DataFrame,
# so the rows of the result keep the labels they had in 'drinks'
drinks[drinks.continent=='South America']

In [None]:
# selection: select a portion of the DataFrame using the index
# (row with index label 23, column 'beer_servings')
drinks.loc[23, 'beer_servings']

In [None]:
# make the existing 'country' column the index and keep the result under the same name
drinks = drinks.set_index('country')
drinks.head()

In [None]:
# 'country' is now the index: the row labels are the country names
drinks.index

In [None]:
# 'country' is no longer a column: it is absent from the column labels
drinks.columns

In [None]:
# 'country' data is no longer part of the DataFrame contents,
# so the shape has one column fewer than before 'set_index'
drinks.shape

In [None]:
# country name can now be used for selection
# (row labelled 'Brazil', column 'beer_servings')
drinks.loc['Brazil', 'beer_servings']

In [None]:
# make sure the index is named 'country', then turn it back into an ordinary
# column (the index becomes the default integer one again)
drinks = drinks.rename_axis('country').reset_index()
drinks.head()

In [None]:
# How do I select multiple rows and columns from a pandas DataFrame?
# read a dataset of UFO reports into a DataFrame and show its first 3 rows
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head(3)

In [None]:
# The 'loc' indexer is used to select rows and columns by label. It accepts any of the following:

# A single label
# A list of labels
# A slice of labels
# A boolean Series
# A colon (which indicates "all labels")

In [None]:
# row 0, all columns (a single row label, and a colon for "all column labels")
ufo.loc[0, :]

In [None]:
# rows 0 and 1 and 2, all columns (a list of row labels)
ufo.loc[[0, 1, 2], :]

In [None]:
# rows 0 through 2 (inclusive), all columns (a slice of row labels)
ufo.loc[0:2, :]

In [None]:
# leaving out the column part implies "all columns", but explicitly stating "all columns" is better
ufo.loc[0:2]

In [None]:
# rows 0 through 2 (inclusive), column 'City' (a single column label)
ufo.loc[0:2, 'City']

In [None]:
# rows 0 through 2 (inclusive), columns 'City' and 'State' (a list of column labels)
ufo.loc[0:2, ['City', 'State']]

In [None]:
# accomplish the same thing using double brackets (a list of column names
# inside the selection brackets) - but using 'loc' is preferred since
# it's more explicit
ufo[['City', 'State']].head(3)

In [None]:
# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)
# (a slice of column labels)
ufo.loc[0:2, 'City':'State']

In [None]:
# rows in which the 'City' is 'Oakland', column 'State'
# (a boolean Series selects the rows)
ufo.loc[ufo.City=='Oakland', 'State']

In [None]:
# The 'iloc' indexer is used to select rows and columns by integer position. You can pass it:

# A single integer position
# A list of integer positions
# A slice of integer positions
# A colon (which indicates "all integer positions")

In [None]:
# rows in positions 0 and 1, columns in positions 0 and 3 (lists of integer positions)
ufo.iloc[[0, 1], [0, 3]]

In [None]:
# rows in positions 0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)
# i.e. the stop position of each slice is not part of the selection
ufo.iloc[0:2, 0:4]

In [None]:
# select the same rows with a plain slice in brackets - but using 'iloc' is
# preferred since it's more explicit
ufo[0:2]

In [None]:
# The 'ix' indexer was used to select rows and columns by label or integer position,
# for the cases where label-based and integer-based selection had to be mixed
# in the same call. It is deprecated and no longer available in current pandas
# releases, so such selections are written with 'loc' or 'iloc' instead.
# read a dataset of alcohol consumption into a DataFrame and set 'country' as the index
drinks = pd.read_csv('http://bit.ly/drinksbycountry', index_col='country')
drinks.head()

In [None]:
# row with label 'Albania', column in position 0
# the 'ix' indexer that used to allow this mix no longer exists in pandas, so the
# column position is first translated into its label and the selection is made with 'loc'
drinks.loc['Albania', drinks.columns[0]]

In [None]:
# When should I use the "inplace" parameter in pandas?

In [None]:
# read a fresh copy of the UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

In [None]:
# remove the 'City' column (doesn't affect the DataFrame since inplace=False,
# which is the default when 'inplace' is not passed)
ufo.drop('City', axis=1).head()

In [None]:
# confirm that the 'City' column was not actually removed from 'ufo'
ufo.head()

In [None]:
# remove the 'City' column (does affect the DataFrame since inplace=True)
# NOTE: running this cell a second time fails, because 'City' is already gone by then
ufo.drop('City', axis=1, inplace=True)

In [None]:
# confirm that the 'City' column was actually removed from 'ufo' this time
ufo.head()

In [None]:
# drop a row if any value is missing from that row (doesn't affect the DataFrame since inplace=False);
# only the shape of the temporary result is shown
ufo.dropna(how='any').shape

In [None]:
# confirm that no rows were actually removed from 'ufo'
ufo.shape

In [None]:
# fill missing values using the "backward fill" strategy (doesn't affect the
# DataFrame, since the result is not assigned)
# 'bfill' is the dedicated method for this; passing method='bfill' to 'fillna'
# is deprecated
ufo.bfill().tail()

In [None]:
# Why are DataFrame slices inclusive when using .loc, but exclusive when using .iloc?

In [None]:
# label-based slicing is inclusive of the start and stop: labels 0 through 4
ufo.loc[0:4, :]

In [None]:
# position-based slicing is inclusive of the start and exclusive of the stop: positions 0 through 3
ufo.iloc[0:4, :]

In [None]:
# How do I randomly sample rows from a DataFrame?
# sample 3 rows from the DataFrame without replacement (new in pandas 0.16.1)
# (no 'random_state' is given here; the next cell adds one for reproducibility)
ufo.sample(n=3)

In [None]:
# use the 'random_state' parameter for reproducibility (here the seed is 42)
ufo.sample(n=3, random_state=42)

In [None]:
# draw 75% of the DataFrame's rows at random, without replacement
train = ufo.sample(frac=0.75, random_state=99)

# the remaining 25% are the rows whose index label was not drawn into 'train'
test = ufo[~ufo.index.isin(train.index)]

In [None]:
# How do I create dummy variables in pandas?

In [None]:
# read the training dataset from Kaggle's Titanic competition
# (this rebinds the name 'train' used for the UFO sample above)
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()

In [None]:
# build the 'Sex_male' dummy variable with 'map': 'female' becomes 0, 'male' becomes 1
train['Sex_male'] = train['Sex'].map({'female': 0, 'male': 1})
train.head()

In [None]:
# 'get_dummies' on a feature with 3 possible values gives one column per value,
# each named with the 'Embarked' prefix
pd.get_dummies(train['Embarked'], prefix='Embarked').head(10)

In [None]:
# How do I work with dates and times in pandas?

In [None]:
# read a fresh copy of the UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

In [None]:
# 'Time' is currently stored as a string; check the data type of each column
ufo.dtypes

In [None]:
# parse the 'Time' strings into datetime values and overwrite the column with them
ufo['Time'] = pd.to_datetime(ufo['Time'])
ufo.head()

In [None]:
# check the data types again after converting 'Time'
ufo.dtypes

In [None]:
# datetime Series expose convenient attributes under the 'dt' accessor, e.g. the hour
ufo['Time'].dt.hour.head()

In [None]:
# name of the weekday for each timestamp; 'day_name()' is the replacement for
# the 'weekday_name' attribute, which no longer exists in pandas
ufo.Time.dt.day_name().head()

In [None]:
# ordinal day of the year for each timestamp
ufo['Time'].dt.dayofyear.head()

In [None]:
# convert a single string to datetime format (outputs a timestamp object)
# and keep it in 'ts' for the comparison in the next cell
ts = pd.to_datetime('1/1/1999')
ts

In [None]:
# compare the datetime Series with the timestamp and keep the rows at or after it
ufo[ufo['Time'] >= ts].head()