In [None]:
# 7.14 pandas Series and DataFrames
# NumPy’s array is optimized for homogeneous numeric data that’s accessed via integer indices.
# Data science presents unique demands for which more customized data structures are required. Big data applications must support mixed data types, customized indexing, missing data, data that’s not structured consistently and data that needs to be manipulated.
# Pandas is the most popular library for dealing with such data.
# Series for one-dimensional collections
# DataFrames for two-dimensional collections
# Pandas’s MultiIndex to manipulate multi-dimensional data in the context of Series and DataFrames.

In [1]:
# Build a Series from a plain list; pandas supplies default integer indices
import pandas as pd

grades = pd.Series(data=[87, 100, 94])

# Ending the cell with the bare name displays the Series in two columns:
# indices left-aligned on the left, values right-aligned on the right
grades

0     87
1    100
2     94
dtype: int64

In [2]:
# A scalar value is repeated once for every entry of the supplied index,
# giving a Series whose elements all have the same value
pd.Series(98.6, index=range(3))

0    98.6
1    98.6
2    98.6
dtype: float64

In [3]:
# Accessing a Series's elements
# With the default integer indices, 0 is the label of the first element,
# so this returns the first grade
grades[0]

87

In [4]:
# Describe method
# count, mean, std, min, quartiles (25%, 50%, 75%) and max
# quartiles
# 50% represents the median of sorted values
# 25% represents the median of the first half of the sorted values
# 75% represents the median of the second half of the sorted values

In [5]:
# Descriptive statistics for a Series
# Each statistic is available as its own method; describe() gathers them
# (plus the quartiles) into one Series. They are printed one per line.
print(
    grades.count(),
    grades.mean(),
    grades.min(),
    grades.max(),
    grades.std(),
    grades.describe(),
    sep='\n',
)

3
93.66666666666667
87
100
6.506407098647712
count      3.000000
mean      93.666667
std        6.506407
min       87.000000
25%       90.500000
50%       94.000000
75%       97.000000
max      100.000000
dtype: float64


In [6]:
# Creating a Series with custom indices
# The index keyword supplies one label per element. Labels do not have to be
# consecutive integers starting at 0; here they are student names.
grades = pd.Series(
    data=[87, 100, 94],
    index=['Wally', 'Eva', 'Sam'],
)
print(grades)

Wally     87
Eva      100
Sam       94
dtype: int64


In [7]:
# Dictionary initializers
# When a Series is initialized with a dictionary, the dictionary's keys
# become the Series' indices and its values become the element values
grades = pd.Series(dict(Wally=87, Eva=100, Sam=94))
print(grades)

Wally     87
Eva      100
Sam       94
dtype: int64


In [8]:
# Accessing Elements of a Series Via Custom Indices
# Square brackets take the custom label (here a student name) as the key
grades['Eva']

100

In [10]:
# A custom index label that is a valid Python identifier can also be
# used as an attribute of the Series
grades.Wally

87

In [11]:
# dtype reports the element type of the Series' values
grades.dtype

dtype('int64')

In [12]:
# values exposes the Series' elements as an array
grades.values

array([ 87, 100,  94], dtype=int64)

In [13]:
# A Series can hold strings as well as numbers
hardware = pd.Series(data=['Hammer', 'Saw', 'Wrench'])
hardware

0    Hammer
1       Saw
2    Wrench
dtype: object

In [14]:
# If a Series contains strings, you can use its str attribute
# to call string methods on the elements.
# contains('a') yields one bool per element: True where the
# string contains a lowercase 'a'
hardware.str.contains('a')

0     True
1     True
2    False
dtype: bool

In [15]:
# str.upper() returns a Series with every element converted to uppercase
hardware.str.upper()

0    HAMMER
1       SAW
2    WRENCH
dtype: object

In [16]:
# 7.14.2 DataFrames
# a DataFrame is an enhanced two-dimensional array

In [19]:
# Creating a DataFrame from a dictionary
# Each key is a student (one column); each list holds that student's
# grades on three exams (one row per exam)
grades_dict = {
    'Wally': [87, 96, 70],
    'Eva': [100, 87, 90],
    'Sam': [94, 77, 90],
    'Katie': [100, 81, 82],
    'Bob': [83, 65, 85],
}
grades = pd.DataFrame(data=grades_dict)
grades

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
0,87,100,94,100,83
1,96,87,77,81,65
2,70,90,90,82,85


In [20]:
# Customizing a DataFrame's Indices with the index keyword argument
# The row labels are supplied when the DataFrame is constructed, one per row
pd.DataFrame(grades_dict, index = ['Test1', 'Test2', 'Test3'])

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test2,96,87,77,81,65
Test3,70,90,90,82,85


In [21]:
# Alternative: assign the row labels to an existing DataFrame
# through its index attribute (this modifies grades in place)
grades.index = ['Test1', 'Test2', 'Test3']
grades

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test2,96,87,77,81,65
Test3,70,90,90,82,85


In [22]:
# Accessing a DataFrame's columns
# A column can be selected by key (grades['Eva']) or, more briefly, as an
# attribute (grades.Sam). Both are printed with a blank line between them.
print(grades['Eva'], grades.Sam, sep='\n\n')

Test1    100
Test2     87
Test3     90
Name: Eva, dtype: int64

Test1    94
Test2    77
Test3    90
Name: Sam, dtype: int64


In [23]:
# Selecting Rows via the loc and iloc Attributes
# Access a row by its label using the loc attribute;
# the row comes back as a Series indexed by the column names
grades.loc['Test1']

Wally     87
Eva      100
Sam       94
Katie    100
Bob       83
Name: Test1, dtype: int64

In [24]:
# Access a row by its zero-based integer position using the iloc attribute
# (position 1 is the second row, Test2)
grades.iloc[1]

Wally    96
Eva      87
Sam      77
Katie    81
Bob      65
Name: Test2, dtype: int64

In [25]:
# Selecting Rows via Slices and Lists with the loc and iloc Attributes
# A label slice with loc includes both endpoints
grades.loc['Test1':'Test3'] # Including the high index ('Test3')

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test2,96,87,77,81,65
Test3,70,90,90,82,85


In [27]:
# An integer slice with iloc follows Python slice rules
grades.iloc[0:2] # Excluding the high index (2)

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test2,96,87,77,81,65


In [28]:
# To select specific rows, use a list rather than
# slice notation with loc or iloc.
# The outer brackets index; the inner brackets are the list of row labels.
grades.loc[['Test1', 'Test3']] # Or grades.iloc[[0,2]]

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test3,70,90,90,82,85


In [29]:
# Selecting Subsets of the Rows and Columns
# Pass a row selector and a column selector, separated by a comma;
# each may be a slice or a list.
# To view only Eva's and Katie's grades on Test1 and Test2:
grades.loc['Test1':'Test2', ['Eva', 'Katie']]
# Equivalent selection using two lists:
# grades.loc[['Test1', 'Test2'], ['Eva', 'Katie']]

Unnamed: 0,Eva,Katie
Test1,100,100
Test2,87,81


In [31]:
# To view Test1 and Test3 for the first three students:
# rows by a list of positions, columns by an integer slice (high index excluded)
grades.iloc[[0,2], 0:3]

Unnamed: 0,Wally,Eva,Sam
Test1,87,100,94
Test3,70,90,90


In [32]:
# Boolean Indexing
# To select all the A grades (greater than or equal to 90).
# Grades for which the condition is False are represented
# as NaN (not a number) in the new DataFrame, so the remaining
# values are displayed as floats.

grades[grades >= 90]

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,,100.0,94.0,100.0,
Test2,96.0,,,,
Test3,,90.0,90.0,,


In [33]:
# To select all the B grades (80 through 89)
# Combine conditions with & (and) or | (or); each condition
# must be wrapped in parentheses
grades[(grades >= 80) & (grades <90)]

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87.0,,,,83.0
Test2,,87.0,,81.0,
Test3,,,,82.0,85.0


In [34]:
# Accessing a Specific DataFrame Cell by Row and Column
# The at (label-based) and iat (position-based) attributes return a single value.
# Row and column indices must be separated by a comma.
# Select Eva's Test2 grade (87)
grades.at['Test2', 'Eva']

87

In [35]:
# Wally's Test3 grade (70), selected by integer position:
# row 2 is Test3 and column 0 is Wally
grades.iat[2,0]

70

In [36]:
# Assign a new value to a specific element
# This changes grades in place: Eva's Test2 grade becomes 100,
# which affects the results of every cell that follows
grades.at['Test2', 'Eva'] = 100
grades.at['Test2', 'Eva']

100

In [37]:
# Descriptive Statistics
# Both Series and DataFrames have a describe method;
# for a DataFrame the statistics are calculated column by column
grades.describe()

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
count,3.0,3.0,3.0,3.0,3.0
mean,84.333333,96.666667,87.0,87.666667,77.666667
std,13.203535,5.773503,8.888194,10.692677,11.015141
min,70.0,90.0,77.0,81.0,65.0
25%,78.5,95.0,83.5,81.5,74.0
50%,87.0,100.0,90.0,82.0,83.0
75%,91.5,100.0,92.0,91.0,84.0
max,96.0,100.0,94.0,100.0,85.0


In [38]:
# set_option function: 
# By default, pandas calculates the descriptive 
# statistics with floating-point values and
# displays them with six digits of precision. 
# You can control the precision and default settings 
# with pandas’ set_option function

In [39]:
# Display floating-point values with two digits after the decimal point.
# The fully qualified option name is used because the bare 'precision'
# pattern also matches 'styler.format.precision' in newer pandas releases,
# which makes set_option raise OptionError for an ambiguous key.
pd.set_option('display.precision', 2)
# Re-display the summary statistics to see the effect of the precision setting
grades.describe()

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
count,3.0,3.0,3.0,3.0,3.0
mean,84.33,96.67,87.0,87.67,77.67
std,13.2,5.77,8.89,10.69,11.02
min,70.0,90.0,77.0,81.0,65.0
25%,78.5,95.0,83.5,81.5,74.0
50%,87.0,100.0,90.0,82.0,83.0
75%,91.5,100.0,92.0,91.0,84.0
max,96.0,100.0,94.0,100.0,85.0


In [40]:
# Calculate the average for each student
# mean() averages down each column, and each column is one student
grades.mean()

Wally    84.33
Eva      96.67
Sam      87.00
Katie    87.67
Bob      77.67
dtype: float64

In [41]:
# Transposing the DataFrame with the T Attribute
# T swaps rows and columns: students become the rows and tests the columns.
# NOTE(review): whether T yields a view or a copy of the data is not
# shown here and may depend on the pandas version and dtypes - confirm
grades.T

Unnamed: 0,Test1,Test2,Test3
Wally,87,96,70
Eva,100,100,90
Sam,94,77,90
Katie,100,81,82
Bob,83,65,85


In [42]:
# Summary statistics for each test
# After transposing, each test is a column, so describe() summarizes per test
grades.T.describe()

Unnamed: 0,Test1,Test2,Test3
count,5.0,5.0,5.0
mean,92.8,83.8,83.4
std,7.66,14.31,8.23
min,83.0,65.0,70.0
25%,87.0,77.0,82.0
50%,94.0,81.0,85.0
75%,100.0,96.0,90.0
max,100.0,100.0,90.0


In [43]:
# To see the average of all students' grades on each test
grades.T.mean()

Test1    92.8
Test2    83.8
Test3    83.4
dtype: float64

In [44]:
# Sorting Rows by Their Indices
# Sort the rows by their indices in descending order using sort_index;
# the sorted result is returned and grades itself is left unchanged
grades.sort_index(ascending = False)

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test3,70,90,90,82,85
Test2,96,100,77,81,65
Test1,87,100,94,100,83


In [45]:
# With no arguments, sort_index sorts the rows in ascending order
grades.sort_index()

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test2,96,100,77,81,65
Test3,70,90,90,82,85


In [46]:
# Sorting by Column Indices
# Sort the columns into ascending order (left-to-right)
# by their column names.
# The axis=1 keyword argument indicates that the column indices
# (rather than the row indices) are to be sorted
grades.sort_index(axis=1)

Unnamed: 0,Bob,Eva,Katie,Sam,Wally
Test1,83,100,100,94,87
Test2,65,100,81,77,96
Test3,85,90,82,90,70


In [47]:
# axis=0 sorts by the row indices
grades.sort_index(axis=0)

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test2,96,100,77,81,65
Test3,70,90,90,82,85


In [48]:
# Sorting by Column Values
# To see Test1's grades in descending order using sort_values.
# by='Test1' with axis=1 reorders the columns according to
# the values in the Test1 row
grades.sort_values(by = 'Test1', axis = 1, ascending = False)

Unnamed: 0,Eva,Katie,Sam,Wally,Bob
Test1,100,100,94,87,83
Test2,100,81,77,96,65
Test3,90,82,90,70,85


In [49]:
# We can sort the transposed DataFrame instead:
# Test1 is then a column, so no axis argument is needed
grades.T.sort_values(by = 'Test1', ascending = False)

Unnamed: 0,Test1,Test2,Test3
Eva,100,100,90
Katie,100,81,82
Sam,94,77,90
Wally,87,96,70
Bob,83,65,85


In [50]:
# If we want to see only Test1's grades, start from the full
# DataFrame (shown here) and select that row in the next step
grades

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test2,96,100,77,81,65
Test3,70,90,90,82,85


In [51]:
# Select just the Test1 row; the result is a Series of the five grades
grades.loc['Test1']

Wally     87
Eva      100
Sam       94
Katie    100
Bob       83
Name: Test1, dtype: int64

In [52]:
# Sort that single row's values from highest to lowest
grades.loc['Test1'].sort_values(ascending = False)

Katie    100
Eva      100
Sam       94
Wally     87
Bob       83
Name: Test1, dtype: int64