# Pandas and Matplotlib Tutorial

In [None]:
# Standard Headers
import pandas as pd
import matplotlib.pyplot as plt

# Enable inline mode for matplotlib so that Jupyter displays graphs
%matplotlib inline

## 1. Intro to Pandas

Pandas provides two convenient data structures for storing and manipulating data: Series and DataFrame. A Series is similar to a one-dimensional array, whereas a DataFrame is more like a matrix or a spreadsheet table.  

The Pandas library documentation is [here](http://pandas.pydata.org/pandas-docs/stable/reference/index.html).

In this tutorial, we will use the <a href="http://archive.ics.uci.edu/ml/datasets/Iris">Iris dataset</a> from the UCI machine learning repository, which contains information on 150 Iris flowers, 50 from each of three Iris species: Setosa, Versicolour, and Virginica. Each flower is characterized by five attributes:

- sepal length in centimeters

- sepal width in centimeters

- petal length in centimeters

- petal width in centimeters

- class (Setosa, Versicolour, Virginica) 

<img src="iris.png">

### 1.1 Reading data from a CSV file
You can find more on reading CSV (Comma Separated Value) data into a Pandas DataFrame [here](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html).

Documentation for Pandas DataFrames is [here](http://pandas.pydata.org/pandas-docs/stable/reference/frame.html).

In [None]:
# Load the Iris data. The file has no header row, so the column names are supplied here.
# (The same file can also be read straight from the UCI repository:
#  'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data')
data = pd.read_csv(
    'iris.data.txt',
    header=None,
    names=['sepal length', 'sepal width', 'petal length', 'petal width', 'class'],
)

data.head(10)  # displays the first 10 rows of the data frame

### 1.2 Accessing elements of a DataFrame

The elements of a DataFrame can be accessed in many ways.
Accessing an entire row or column will return a Pandas Series object.

Documentation for Pandas Series is [here](http://pandas.pydata.org/pandas-docs/stable/reference/series.html).

Documentation for accessing elements of a DataFrame is [here](http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html).

In [None]:
# Selecting a single row by position yields a pandas Series
row = data.iloc[2]       # position 2 -> the 3rd row (positions start at 0)
# note: data[2] would error here; rows of a DataFrame must be selected with iloc
print('Row 2 of data table:')
print(row)

# inspect the Series: its type, the stored values, and its index labels
for label, detail in (('\nType=', type(row)),
                      ('Values=', row.values),
                      ('Index=', row.index)):
    print(label, detail)

In [None]:
# Selecting a single column also yields a pandas Series
petal_width = data['petal width']  # look the column up by its name
print('Col 3 of data table:')
print(petal_width)
print(type(petal_width))

# Equivalent selections:
#   data.iloc[:, 3]              -> all rows, column at position 3
#   data.loc[:, 'petal width']   -> all rows, column by name
# Rule of thumb: iloc takes numerical indices, loc takes text (label) indices.

In [None]:
# accessing a specific element of the DataFrame
# use iloc for integer positions (0-based)
# use loc for index/column labels

print(data.iloc[1,2])            # row at position 1 (2nd row), column at position 2 (3rd column)
print(data.loc[1,'sepal width']) # row labelled 1, column named 'sepal width'

# accessing a slice of the DataFrame
print()
print(data.iloc[1:10,1:3])  # positions: rows 1-9, cols 1-2 (iloc excludes the end of the slice)

print()
print(data.loc[5:10, ['petal length', 'sepal width']]) # labels: rows 5-10 (loc includes the end), cols 'petal length' & 'sepal width'

In [None]:
# Selection and filtering with a boolean mask
wide_sepal = data['sepal width'] > 3.5   # one True/False value per row
print(data[wide_sepal])                  # keep only the rows where the mask is True

### 1.3 Data Summary Information
Getting summary information about the data.

In [None]:
# dimensions of the table, and the total number of elements in it
shape, size = data.shape, data.size
print('data.shape =', shape)
print('data.size =', size)

In [None]:
class_col = data['class']
sepal_width_col = data['sepal width']

# distinct values that occur in each column
print(class_col.unique())
print(sepal_width_col.unique())

# number of rows carrying each distinct value (the result is itself a Series)
print("")
print(class_col.value_counts())
sepal_width_counts = sepal_width_col.value_counts()
print(sepal_width_counts)
print(type(sepal_width_counts))

In [None]:
# Summary statistics for the whole dataset
summary_stats = data.describe()
summary_stats

In [None]:
# Split the rows by class, then compute summary statistics within each group
rows_by_class = data.groupby('class')
result = rows_by_class.describe()
result

In [None]:
# get just the stats for petal length, by class
# `result` is the per-class describe() table built in the previous cell;
# selecting 'petal length' keeps only the statistics computed for that feature
petal_length_result = result['petal length']
petal_length_result

In [None]:
# Keep only the mean and standard deviation of petal length for each class
petal_length_result[['mean', 'std']]

In [None]:
# compute a correlation between every pair of numeric attributes
# The categorical 'class' column is dropped first: correlation is only defined
# for numbers, and recent pandas versions (2.0+) raise an error instead of
# silently skipping string columns.
data.drop(columns='class').corr()

### 1.4 Arithmetic Operations

In [None]:
# Many arithmetic operations only work on all-numerical data, so build a small
# numeric sample: take the first rows of the table and drop the last column
# ('class'), which is categorical.
data_sample = data.head().drop(columns='class')
print(data_sample.shape)  # check the shape of the data sample
data.head()

In [None]:
# Element-wise arithmetic: each expression below produces a new DataFrame
for transformed in (data_sample + 4,
                    data_sample * 4,
                    data_sample ** 4,
                    data_sample.abs(),   # absolute value of each element
                    data_sample.T):      # transpose operation
    print(transformed)

In [None]:
# Arithmetic between two DataFrames

# Build a second frame (5 rows x 4 columns) with the same feature column names
column_names = ['sepal length', 'sepal width', 'petal length', 'petal width']
rows = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
    [13, 14, 15, 16],
    [17, 18, 19, 20],
]
data_set_2 = pd.DataFrame(rows, columns=column_names)

# add the two frames together, element by element
print(data_sample + data_set_2)

In [None]:
# Arithmetic also works on a single column (a Series)
sepal_length = data_sample['sepal length']
print(sepal_length + 4)

In [None]:
# largest value in a single column
petal_length_max = data['petal length'].max()
print('\nMax value for petal length:', petal_length_max)

# Largest value in every column. This is only possible when all columns are
# numeric, so work on a separate table without the categorical 'class' column.
data_numeric = data.drop(columns='class')
print('\nMaximum value per col:')
print(data_numeric.max())

In [None]:
# All aggregations below use data_numeric (the table without the categorical
# 'class' column). Recent pandas versions (2.0+) no longer silently skip
# non-numeric columns, so min()/mean() across rows fail on the mixed table,
# and sum() would concatenate the class labels instead of adding numbers.

# get minimum value for each row
print('\nMinimum value per row:')
print(data_numeric.min(axis=1))

# get sum of values for each column
print('\nSum of values per column:')
print(data_numeric.sum())

# get average value for each row
print('\nAverage value per row:')
print(data_numeric.mean(axis=1))

# range (max - min) of a Series; apply() calls it once per column,
# or once per row when axis=1
f = lambda x: x.max() - x.min()

print('\nCalculate max - min per column:')
print(data_numeric.apply(f))

print('\nCalculate max - min per row:')
print(data_numeric.apply(f, axis=1))

## 2. Plotting Data

### 2.1 Built-in plotting for Series and DataFrames

There are built-in functions you can use to plot the data stored in a Series or a DataFrame.

In [1]:
# show a boxplot of each numerical feature
# (uses the DataFrame's built-in boxplot method; `data` must already be loaded
#  by the CSV-reading cell earlier in the notebook)
data.boxplot()

NameError: name 'data' is not defined

In [None]:
# Line plot of a single feature: petal length against the row index
petal_len = data['petal length']
petal_len.plot.line(title='Line plot')

In [None]:
# Default DataFrame plot: a line plot, spelled out explicitly
data.plot.line()

In [None]:
# Scatter plot of two features: petal length (x) against petal width (y)
data.plot(kind='scatter', x='petal length', y='petal width')

In [None]:
# Histogram of one feature (petal_len is the 'petal length' Series from the line-plot cell)
petal_len.plot.hist(title='Histogram')

In [None]:
# Pie chart of the number of rows in each class
groups = data.groupby('class')
class_counts = groups.size()   # a Series object: one count per class
print(class_counts)
class_counts.plot.pie()

### 2.2 Using Matplotlib
Sometimes the built-in DataFrame plotting methods are not customizable enough for what you are trying to show. The matplotlib package can be used to create more interesting plots.
The matplotlib documentation is [here](https://matplotlib.org/api/axes_api.html#matplotlib.axes.Axes).

In [None]:
# Petal length vs. petal width, with each data point colored by its class
colors = {'Iris-setosa':'red', 'Iris-versicolor':'blue', 'Iris-virginica':'green'}
point_colors = [colors[label] for label in data['class']]  # one color per row

plt.scatter(data['petal length'], data['petal width'], c=point_colors, marker='x')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
plt.title("Iris Data")
plt.show()

In [None]:
# Scatter plot matrix: a grid of pairwise plots of the features
figure_size = (15, 15)
pd.plotting.scatter_matrix(data, figsize=figure_size)
plt.show()