In [1]:
import numpy as np

In [2]:
# A list made up only of Python ints produces an integer array; .dtype reports
# the single element type shared by the whole array.
np.asarray([1, 2, 3, 4, 5]).dtype

dtype('int64')

In [3]:
# Header row followed by five enrollment records. The records deliberately mix
# Python types (int, str, float NaN, bool) to show how a single NumPy array
# handles heterogeneous data.
enrollments = np.array(
    [
        ('account_key', 'status', 'join_date', 'days_to_cancel', 'is_udacity'),
        (448, 'canceled', '2014-11-10', 65, True),
        (448, 'canceled', '2014-11-05', 5, True),
        (448, 'canceled', '2015-01-27', 0, True),
        (448, 'canceled', '2014-11-10', 0, True),
        (448, 'current', '2015-03-10', np.nan, True),
    ]
)

In [4]:
# Even though the array was created with unquoted integers and booleans,
# NumPy converted every element to a string.
enrollments

array([['account_key', 'status', 'join_date', 'days_to_cancel',
        'is_udacity'],
       ['448', 'canceled', '2014-11-10', '65', 'True'],
       ['448', 'canceled', '2014-11-05', '5', 'True'],
       ['448', 'canceled', '2015-01-27', '0', 'True'],
       ['448', 'canceled', '2014-11-10', '0', 'True'],
       ['448', 'current', '2015-03-10', 'nan', 'True']], 
      dtype='|S14')

In [5]:
# Taking the mean of the 'days_to_cancel' column fails because every element
# of the array is a string. The failure is the point of this cell, so catch
# the exception and print it rather than letting it stop a "Run All".
try:
    enrollments[:, 3].mean()
except TypeError as error:
    print('TypeError: {}'.format(error))

TypeError: cannot perform reduce with flexible type

In [6]:
# The error above occurs because NumPy can't take the mean of the
# 'days_to_cancel' column: every value in the array is a string.

# That's one benefit of Pandas DataFrames over 2D NumPy arrays: each column
# can have its own type.

In [8]:
# So instead we'll create a Pandas dataframe from this data by
# passing in a dictionary

import pandas as pd

# Build the DataFrame column by column: each dictionary key becomes a column
# name and each list holds that column's five values.
enrollments_df = pd.DataFrame(
    data={
        'account_key': [448] * 5,
        'status': ['canceled'] * 4 + ['current'],
        'join_date': [
            '2014-11-10',
            '2014-11-05',
            '2015-01-27',
            '2014-11-10',
            '2015-03-10',
        ],
        # The still-current enrollment has no cancellation, hence NaN.
        'days_to_cancel': [65, 5, 0, 0, np.nan],
        'is_udacity': [True] * 5,
    }
)

In [9]:
# Display the resulting Pandas DataFrame (one column per dictionary key):
enrollments_df

Unnamed: 0,account_key,days_to_cancel,is_udacity,join_date,status
0,448,65.0,True,2014-11-10,canceled
1,448,5.0,True,2014-11-05,canceled
2,448,0.0,True,2015-01-27,canceled
3,448,0.0,True,2014-11-10,canceled
4,448,,True,2015-03-10,current


In [10]:
# Now if I take the mean of the DataFrame, it takes the mean of the
# numerical (and boolean) columns and leaves the others alone.

# Notice it takes the mean of each column separately, since each column is
# assumed to hold a different type.

# numeric_only=True makes that behaviour explicit: newer pandas releases no
# longer skip non-numeric columns by default and raise a TypeError instead.

enrollments_df.mean(numeric_only=True)

account_key       448.0
days_to_cancel     17.5
is_udacity          1.0
dtype: float64

In [11]:
# You can also use the axis argument to take the mean of each row instead.
# That runs, but the result isn't meaningful here: each row mixes unrelated
# quantities, and only the numeric/boolean columns (account_key,
# days_to_cancel, is_udacity) are averaged, with NaN values skipped.

# numeric_only=True keeps the non-numeric columns (status, join_date) out of
# the calculation explicitly; newer pandas releases raise a TypeError
# otherwise.

enrollments_df.mean(axis=1, numeric_only=True)

0    171.333333
1    151.333333
2    149.666667
3    149.666667
4    224.500000
dtype: float64