## DataFrame (Array - 2 Dimensions)


DataFrames can be created from `lists`, `dictionaries` and Pandas `Series`.


In [2]:
# Cycling distances, one entry per observation; None stands for a missing value
cycling_data = [
    10.7, 0, None, 2.4,
    15.3, 10.9, 0, None,
]
print(cycling_data)

[10.7, 0, None, 2.4, 15.3, 10.9, 0, None]


##### Create a list of tuples

In [3]:
# Pair each step count with the cycling distance at the same position.
# zip() stops at the shorter input: step_data has six entries, so only the
# first six cycling values are used and the remaining ones are dropped.
step_data = [3620, 7891, 9761, 3907, 4338, 5373]
joined_data = [(steps, distance) for steps, distance in zip(step_data, cycling_data)]
print(joined_data)

[(3620, 10.7), (7891, 0), (9761, None), (3907, 2.4), (4338, 15.3), (5373, 10.9)]


##### The dataframe

In [4]:
import pandas as pd

# Each (steps, distance) tuple becomes one row; rows are labelled with six
# consecutive daily dates starting on 2015-03-29.
activity_df = pd.DataFrame(
    data=joined_data,
    columns=['Walking', 'Cycling'],
    index=pd.date_range(start='20150329', periods=6),
)
print(activity_df)

            Walking  Cycling
2015-03-29     3620     10.7
2015-03-30     7891      0.0
2015-03-31     9761      NaN
2015-04-01     3907      2.4
2015-04-02     4338     15.3
2015-04-03     5373     10.9


##### Accessing rows

Select rows by index label using the `loc` indexer.

In [5]:
# Partial-string indexing on the date index: a 'YYYY-MM' label selects
# every row that falls in that month (here, April 2015)
april_rows = activity_df.loc['2015-04']
print(april_rows)

            Walking  Cycling
2015-04-01     3907      2.4
2015-04-02     4338     15.3
2015-04-03     5373     10.9


In [6]:
# A full date label selects one day; the single row comes back as a Series
single_day = activity_df.loc['2015-04-02']
print(single_day)

Walking    4338.0
Cycling      15.3
Name: 2015-04-02 00:00:00, dtype: float64


Select rows by integer position using the `iloc` indexer.

In [7]:
# Select the first row (position 0)
first_row = activity_df.iloc[0]
print(first_row)

Walking    3620.0
Cycling      10.7
Name: 2015-03-29 00:00:00, dtype: float64


In [8]:
# Select the last row (negative positions count from the end)
last_row = activity_df.iloc[-1]
print(last_row)

Walking    5373.0
Cycling      10.9
Name: 2015-04-03 00:00:00, dtype: float64


In [9]:
# Select the first four rows (positions 0 through 3)
first_four_rows = activity_df.iloc[:4]
print(first_four_rows)

            Walking  Cycling
2015-03-29     3620     10.7
2015-03-30     7891      0.0
2015-03-31     9761      NaN
2015-04-01     3907      2.4


In [10]:
# Select the last four rows
last_four_rows = activity_df.iloc[-4:]
print(last_four_rows)

            Walking  Cycling
2015-03-31     9761      NaN
2015-04-01     3907      2.4
2015-04-02     4338     15.3
2015-04-03     5373     10.9


##### Accessing columns

In [11]:
# Bracket notation: select a column by its label
walking_by_label = activity_df['Walking']
print(walking_by_label)

2015-03-29    3620
2015-03-30    7891
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: Walking, dtype: int64


In [12]:
# Attribute notation: the same column, accessed as an attribute of the frame
walking_by_attribute = activity_df.Walking
print(walking_by_attribute)

2015-03-29    3620
2015-03-30    7891
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: Walking, dtype: int64


In [13]:
# Positional notation: every row of the first column
first_column = activity_df.iloc[:, 0]
print(first_column)

2015-03-29    3620
2015-03-30    7891
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: Walking, dtype: int64


In [15]:
# Rows at positions 2 and 3 (the stop position 4 is excluded) of the second column
cycling_slice = activity_df.iloc[2:4, 1]
print(cycling_slice)

2015-03-31    NaN
2015-04-01    2.4
Freq: D, Name: Cycling, dtype: float64


### Reading Data with Pandas

In [33]:
# Location of the Iris CSV file (a relative path)
filepath = '../data/Iris_Data.csv'
# Read the CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=filepath)

In [34]:
# Show the first five rows (head() returns five rows by default)
print(data.head())

   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [36]:
# Show the last five rows (tail() returns five rows by default)
print(data.tail())

     sepal_length  sepal_width  petal_length  petal_width         species
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica


In [37]:
# Show the first five rows, restricted to the last three columns
print(data.head().iloc[:, -3:])

   petal_length  petal_width      species
0           1.4          0.2  Iris-setosa
1           1.4          0.2  Iris-setosa
2           1.3          0.2  Iris-setosa
3           1.5          0.2  Iris-setosa
4           1.4          0.2  Iris-setosa


##### Create a new column that is a product of two measurements

In [39]:
# New column: element-wise product of sepal length and sepal width
data['sepal_area'] = data['sepal_length'].mul(data['sepal_width'])
print(data.head())

   sepal_length  sepal_width  petal_length  petal_width      species  \
0           5.1          3.5           1.4          0.2  Iris-setosa   
1           4.9          3.0           1.4          0.2  Iris-setosa   
2           4.7          3.2           1.3          0.2  Iris-setosa   
3           4.6          3.1           1.5          0.2  Iris-setosa   
4           5.0          3.6           1.4          0.2  Iris-setosa   

   sepal_area  
0       17.85  
1       14.70  
2       15.04  
3       14.26  
4       18.00  


##### Applying a function to a dataframe column

Functions can be applied to `columns` or `rows` of a `DataFrame` or `Series`.

In [44]:
# apply() calls the function once per value of the species column;
# here it strips the 'Iris-' prefix from each species name
data['abbrev'] = data['species'].apply(lambda name: name.replace('Iris-', ''))
print(data.head())

   sepal_length  sepal_width  petal_length  petal_width      species  \
0           5.1          3.5           1.4          0.2  Iris-setosa   
1           4.9          3.0           1.4          0.2  Iris-setosa   
2           4.7          3.2           1.3          0.2  Iris-setosa   
3           4.6          3.1           1.5          0.2  Iris-setosa   
4           5.0          3.6           1.4          0.2  Iris-setosa   

   sepal_area  abbrev  
0       17.85  setosa  
1       14.70  setosa  
2       15.04  setosa  
3       14.26  setosa  
4       18.00  setosa  


##### Concatenating two dataframes

Two dataframes can be concatenated along either dimension.

In [47]:
# Stack the first two and the last two rows into one frame.
# concat joins row-wise by default; the original index labels are kept.
small_data = pd.concat([data.head(2), data.tail(2)])
print(small_data)

     sepal_length  sepal_width  petal_length  petal_width         species  \
0             5.1          3.5           1.4          0.2     Iris-setosa   
1             4.9          3.0           1.4          0.2     Iris-setosa   
148           6.2          3.4           5.4          2.3  Iris-virginica   
149           5.9          3.0           5.1          1.8  Iris-virginica   

     sepal_area     abbrev  
0         17.85     setosa  
1         14.70     setosa  
148       21.08  virginica  
149       17.70  virginica  


In [51]:
# Row-wise concat of two slices that share no columns: the result has the
# union of the column labels, and cells a slice does not provide are NaN.
# NOTE(review): the positional slice -2:-1 picks whichever column is currently
# second to last, so it depends on the columns added by earlier cells. If the
# goal was to place the sepal columns side by side, select them by name and
# pass axis=1 to concat - confirm the intent.
sepal_columns = pd.concat(
    [
        data.iloc[:2, :2],      # first two rows, first two columns
        data.iloc[-2:, -2:-1],  # last two rows, second-to-last column only
    ]
)
print(sepal_columns)

     sepal_area  sepal_length  sepal_width
0           NaN           5.1          3.5
1           NaN           4.9          3.0
148       21.08           NaN          NaN
149       17.70           NaN          NaN


In [52]:
# See the `merge` and `join` methods for SQL-style joins between dataframes

##### Aggregated statistics with groupby

In [62]:
# Group the rows by species; size() then gives the number of rows in each group
group_sizes = data.groupby(by='species').size()
print(group_sizes)

species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64


In [57]:
# value_counts() applied to the group sizes tallies how many species
# share each size (it counts the sizes, not the species labels)
size_tally = group_sizes.value_counts()
print(size_tally)

50    3
dtype: int64


##### [GO BACK HOME](../../README.md)