<h1 style="color:#0AF">PANDAS SERIES CREATION AND INDEXING</h1>

In [19]:
import pandas as pd
import numpy as np

<h2>Use data from step tracking application to create a Pandas Series</h2>

In [8]:
#Step totals taken from the step tracking application
step_data = [3620, 7821, 9761, 3907, 4338, 5373]

#Wrap the raw list in a Series named 'steps'; pandas assigns a default 0..5 integer index
step_counts = pd.Series(data=step_data, name='steps')

In [7]:
#Show the Series: index labels on the left, values on the right, name and dtype in the footer
print(step_counts)

0    3620
1    7821
2    9761
3    3907
4    4338
5    5373
Name: steps, dtype: int64


<h2>Add a date range to the Series</h2>

In [10]:
#Swap the default integer labels for six consecutive daily dates starting 2015-03-29
step_counts.index = pd.date_range(start='20150329', periods=6, freq='D')

In [11]:
#Show the Series again, now labelled by date
print(step_counts)

2015-03-29    3620
2015-03-30    7821
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: steps, dtype: int64


<h2>Select data by the index values</h2>

In [12]:
#Just like a dictionary: look up a value by its index label (here a date string)
print(step_counts['2015-04-01'])

3907


In [14]:
#Or by index position, like an array.
#Use .iloc for positional access: a bare integer key such as step_counts[3]
#on a non-integer (date) index is deprecated as a positional lookup in
#recent pandas releases and is slated to be treated as a label instead.
print(step_counts.iloc[3])

3907


In [15]:
#Select all of April: a partial date string ('YYYY-MM') matches every entry in that month
print(step_counts['2015-04'])

2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: steps, dtype: int64


<h1 style="color: blue">PANDAS DATA TYPES AND IMPUTATION</h1>

<h2>Data types can be viewed and converted</h2>

In [16]:
#Inspect the data type of the values held by the Series
print(step_counts.dtype)

int64


In [22]:
#Convert to a float.
#np.float was only an alias for the builtin float and has been removed from
#NumPy; np.float64 names the resulting dtype explicitly and works everywhere.
step_counts = step_counts.astype(np.float64)
print(step_counts.dtypes)

float64


<h2>Invalid data points can be easily filled with values</h2>

In [25]:
#Create invalid data: blank out the 2nd and 3rd entries (positions 1 and 2).
#.iloc makes it explicit that the slice is positional, and np.nan is the
#supported spelling (the np.NaN alias was removed in NumPy 2.0).
step_counts.iloc[1:3] = np.nan
print(step_counts)

2015-03-29    3620.0
2015-03-30       NaN
2015-03-31       NaN
2015-04-01    3907.0
2015-04-02    4338.0
2015-04-03    5373.0
Freq: D, Name: steps, dtype: float64


In [45]:
#Replace every missing (NaN) entry with zero; fillna returns a new Series
step_counts = step_counts.fillna(value=0.0)
print(step_counts)

2015-03-29    3620.0
2015-03-30       0.0
2015-03-31       0.0
2015-04-01    3907.0
2015-04-02    4338.0
2015-04-03    5373.0
Freq: D, Name: steps, dtype: float64


<h1 style="color:#C0E">PANDAS DATAFRAME CREATION AND METHODS</h1>

<h2>DataFrames can be created from lists, dictionaries, and Pandas Series</h2>

In [28]:
#Cycling distances; None marks a missing measurement
cycling_data = [10.7, 0, None, 2.4, 15.3, 10.9, 0, None]

#Pair each step count with a cycling distance as a list of (steps, distance) tuples.
#zip stops at the shorter input, so the two extra cycling entries are dropped.
joined_data = list(zip(step_data, cycling_data))

#Build the data frame; rows and columns get default integer labels
activity_data_frame = pd.DataFrame(data=joined_data)

print(activity_data_frame)

      0     1
0  3620  10.7
1  7821   0.0
2  9761   NaN
3  3907   2.4
4  4338  15.3
5  5373  10.9


<h2>Labeled columns and an index can be added</h2>

In [29]:
#Rebuild the frame with a daily date index and descriptive column names
activity_dates = pd.date_range('20150329', periods=6)
activity_data_frame = pd.DataFrame(data=joined_data,
                                   index=activity_dates,
                                   columns=['Walking', 'Cycling'])
print(activity_data_frame)

            Walking  Cycling
2015-03-29     3620     10.7
2015-03-30     7821      0.0
2015-03-31     9761      NaN
2015-04-01     3907      2.4
2015-04-02     4338     15.3
2015-04-03     5373     10.9


<h1 style="color: #D00">INDEXING DATAFRAME ROWS</h1>

<h2>DataFrame rows can be selected using the 'loc' and 'iloc' indexers</h2>

In [31]:
#Select row of data by index name (.loc is label-based); the row comes back as a Series
print(activity_data_frame.loc['2015-04-01'])

Walking    3907.0
Cycling       2.4
Name: 2015-04-01 00:00:00, dtype: float64


In [32]:
#Select row of data by integer position (.iloc is position-based);
#negative positions count from the end, so -3 is the third row from the bottom
print(activity_data_frame.iloc[-3])

Walking    3907.0
Cycling       2.4
Name: 2015-04-01 00:00:00, dtype: float64


<h1 style="color: #D00">INDEXING DATAFRAME COLUMNS</h1>

<h2>DataFrame columns can be indexed by name</h2>

In [33]:
#Name of column: square brackets with a column label return that column as a Series
print(activity_data_frame['Walking'])

2015-03-29    3620
2015-03-30    7821
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: Walking, dtype: int64


<h2>DataFrame columns can also be indexed as properties</h2>

In [34]:
#Object-oriented approach: the column is reachable as an attribute named after its label
print(activity_data_frame.Walking)

2015-03-29    3620
2015-03-30    7821
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: Walking, dtype: int64


<h2>DataFrame columns can be indexed by integer</h2>

In [38]:
#First column: ':' keeps every row, 0 picks the column at position 0
print(activity_data_frame.iloc[:,0])

2015-03-29    3620
2015-03-30    7821
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: Walking, dtype: int64


<h1 style="color:#00D">READING DATA WITH PANDAS</h1>

<h2>CSV and other common filetypes can be read with a single command</h2>

In [46]:
#Relative path to the Iris CSV file
filepath = 'data/Iris_Data.csv'

#Load the file into a DataFrame
data = pd.read_csv(filepath)

#Show the first five rows
print(data.head(5))

   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


<h1 style="color:#00D">ASSIGNING NEW DATA TO A DATAFRAME</h1>

<h2>Data can be (re)assigned to a DataFrame column</h2>

In [41]:
#Add a column holding the element-wise product of sepal length and sepal width
data['sepal_area'] = data['sepal_length'] * data['sepal_width']

#Show the first five rows, restricted to the last three columns
print(data.head(5).iloc[:, -3:])

   petal_width      species  sepal_area
0          0.2  Iris-setosa       17.85
1          0.2  Iris-setosa       14.70
2          0.2  Iris-setosa       15.04
3          0.2  Iris-setosa       14.26
4          0.2  Iris-setosa       18.00


<h1 style="color:#C00">APPLYING A FUNCTION TO A DATAFRAME COLUMN</h1>

<h2>Functions can be applied to columns or rows of a DataFrame or Series</h2>

In [42]:
#apply calls the lambda once per value in the species column;
#here it strips the 'Iris-' prefix from each name
data['abbrev'] = data['species'].apply(lambda name: name.replace('Iris-', ''))

#Note that there are other ways to accomplish the above
print(data.head(5).iloc[:, -3:])

       species  sepal_area  abbrev
0  Iris-setosa       17.85  setosa
1  Iris-setosa       14.70  setosa
2  Iris-setosa       15.04  setosa
3  Iris-setosa       14.26  setosa
4  Iris-setosa       18.00  setosa


<h1 style="color:#C00">CONCATENATING TWO DATAFRAMES</h1>

<h2>Two DataFrames can be concatenated along either dimension</h2>

In [48]:
#Concatenate the first two and last two rows (stacked vertically, original labels kept)
first_two_rows = data.head(2)
last_two_rows = data.tail(2)
small_data = pd.concat([first_two_rows, last_two_rows])
print(small_data.iloc[:, -3:])

     petal_length  petal_width         species
0             1.4          0.2     Iris-setosa
1             1.4          0.2     Iris-setosa
148           5.4          2.3  Iris-virginica
149           5.1          1.8  Iris-virginica


<h1 style="color:#C00">AGGREGATED STATISTICS WITH GROUP BY</h1>

<h2>The groupby method calculates aggregated DataFrame statistics</h2>

In [50]:
#Group the rows by species, then count the rows in each group with .size()
#(for a single Series, .value_counts gives the equivalent tally)
species_groups = data.groupby('species')
group_sizes = species_groups.size()
print(group_sizes)

species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64


<h1 style="color:#C00">PERFORMING STATISTICAL CALCULATIONS</h1>

<h2>Pandas contains a variety of statistical methods: mean, median and mode</h2>

In [54]:
#Mean calculated on a DataFrame.
#numeric_only=True restricts the calculation to the numeric columns; without
#it, pandas 2.0+ raises a TypeError because of the text 'species' column
#(older versions silently skipped non-numeric columns).
print(data.mean(numeric_only=True))

sepal_length    5.843333
sepal_width     3.054000
petal_length    3.758667
petal_width     1.198667
dtype: float64


In [52]:
#Median of a single column (a Series)
print(data['petal_length'].median())

4.35


In [53]:
#Mode of a single column; the result is itself a Series, not a scalar
print(data['petal_length'].mode())

0    1.5
dtype: float64


<h2>Standard deviation, variance, SEM, and quantiles can also be calculated</h2>

In [55]:
#Standard deviation, variance, and standard error of the mean for petal length
petal_length = data['petal_length']
print(petal_length.std(), petal_length.var(), petal_length.sem())

1.7644204199522626 3.113179418344519 0.1440643240210085


In [56]:
#As well as quantiles (the 0th quantile is each column's minimum).
#numeric_only=True keeps the text 'species' column out of the calculation;
#pandas 2.0+ no longer skips non-numeric columns by default and would raise.
print(data.quantile(0, numeric_only=True))

sepal_length    4.3
sepal_width     2.0
petal_length    1.0
petal_width     0.1
Name: 0, dtype: float64


<h2>Multiple calculations can be presented in a DataFrame</h2>

In [57]:
#Summary table: count, mean, std, min, quartiles and max for each numeric column
print(data.describe())

       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000


<h1 style="color:#C00">SAMPLING FROM DATAFRAMES</h1>

<h2>DataFrames can be randomly sampled from</h2>

In [58]:
#Draw 5 distinct rows at random (no replacement); the fixed seed makes the draw repeatable
sample = data.sample(n=5, replace=False, random_state=42)

#Show the last three columns of the sampled rows
print(sample.iloc[:, -3:])

     petal_length  petal_width          species
73            4.7          1.2  Iris-versicolor
18            1.7          0.3      Iris-setosa
118           6.9          2.3   Iris-virginica
78            4.5          1.5  Iris-versicolor
76            4.8          1.4  Iris-versicolor


<h3>SciPy and NumPy also contain a variety of statistical functions...</h3>