# Day 3. Introduction to pandas

### Importing modules

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to matplotlib plots
sns.set()

In [None]:
# IPython magic command: render matplotlib plots inside the notebook (not in a separate window)
%matplotlib inline

### Series creation

A Series is a 1-dimensional data structure capable of holding data of a single type. It can easily be created from a NumPy array.

In [None]:
# Draw 10 random floats from [0, 1) and wrap the resulting array in a Series
arr = np.random.random(size=10)
series = pd.Series(data=arr)
series

### Series slicing and indexing

In [None]:
# Selecting a particular element by its index label (the default labels are 0, 1, 2, ...)
series[0]

In [None]:
# You can also use .iloc (position-based) and .loc (label-based) to access its elements
series.iloc[0]

In [None]:
# You can access multiple elements using slicing; with .iloc the end position (4) is excluded
series.iloc[0:4]

In [None]:
# head() shows the beginning of the Series (the first 5 elements by default)
print(series.head())

In [None]:
# tail() shows the end of the Series (5 elements by default); here we ask for the last 3
print(series.tail(3))

### Basic mathematical operations on Series

You can easily perform operations such as addition, subtraction or multiplication on pandas Series.

In [None]:
# Two integer Series of different lengths (5 and 3 elements)
ser_a = pd.Series(data=[15, 20, 33, 17, 4])
ser_b = pd.Series(data=[15, 5, 7])

A NumPy array is actually the underlying data structure of a Series:

In [None]:
# .values exposes the data held by the Series; print its type
print(type(series.values))

In [None]:
# All operations are performed elementwise and aligned on index labels:
# ser_b has no elements labelled 3 and 4, so the results hold NaN at those labels
print(ser_a + ser_b)
print(ser_a - ser_b)
print(ser_a * ser_b)
print(ser_a / ser_b)

### DataFrame creation

A basic data structure in pandas is a DataFrame. It's a 2 dimensional data structure with rows and columns. One can easily create a DataFrame from a NumPy array.

In [None]:
# Build a 10x5 array of random floats from [0, 1) and turn it into a DataFrame
arr = np.random.random(size=(10, 5))
df = pd.DataFrame(data=arr)
df

We can give names to columns and rows; the names don't have to be unique.

In [None]:
# Name the columns col_0, col_1, ... (one label per existing column)
df.columns = ["col_{}".format(pos) for pos in range(len(df.columns))]
df

In [None]:
# Label the rows row_0, row_1, ... and give the index itself a name
df.index = ["row_{}".format(pos) for pos in range(len(df))]
df.index.name = "row_number"
df

Now we can inspect some basic properties of the DataFrame

In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple
print(df.shape)

In [None]:
# Column labels
print(df.columns)

In [None]:
# Row labels (the index)
print(df.index)

In [None]:
# Data type of each column
print(df.dtypes)

### DataFrame slicing and indexing

In [None]:
# Selecting a particular column is very easy: pass its name in square brackets
df["col_0"]

In [None]:
# or we can access it as an attribute
# (this only works when the column name is a valid Python identifier)
df.col_0

In [None]:
# A single DataFrame column is a Series
type(df["col_0"])

In [None]:
# We can also select a subset of columns by passing a list of column names inside the brackets
df[["col_0", "col_1"]]

#### Location attribute

In [None]:
# To select a particular row we pass its index label to the .loc attribute
df.loc["row_0"]

In [None]:
# We can of course select a subset of rows by passing a list of index labels
df.loc[["row_0", "row_1"]]

In [None]:
# The result above is a DataFrame, so we can select columns from it
df.loc[["row_0", "row_1"]][["col_1", "col_2"]]

In [None]:
# or we can access this sub-frame directly by passing row labels and column labels to .loc
df.loc[["row_0", "row_1"], ["col_1", "col_2"]]

In [None]:
# A single row selected with .loc is a Series;
# to access one of its items we specify the item's label (here the column name)
print("Type of selected single row: {}".format(type(df.loc["row_0"])))
df.loc["row_0"]["col_3"]

In [None]:
# or do it directly with a (row label, column label) pair
df.loc["row_0", "col_3"]

Whereas `.loc` is used to access rows by their index labels, `.iloc` is used to access rows by their integer positions.

In [None]:
# Select the first row by its integer position
df.iloc[0]

In [None]:
# Select the row by position, then the item by its column label
df.iloc[0]["col_3"]

In [None]:
# Select a single value by (row position, column position)
df.iloc[0, 3]

Similarly to NumPy, you can get slices with a colon `:`

In [None]:
# Label-based slices: rows up to 'row_3' and columns from 'col_2' to 'col_4'
df.loc[:"row_3", "col_2":"col_4"]

But be careful! When selecting with `.loc`, __both__ endpoints of a slice are __included__, unlike when selecting with `.iloc`, where the end position is excluded.

In [None]:
# 'row_3' sits at position 3 (it is the fourth row): with .loc it was included,
# but .iloc[:3] stops before it; similarly 'col_4' (position 4) is excluded by 2:4
df.iloc[:3, 2:4]

Now the tricky part:

In [None]:
# This selects a particular column by its label
df["col_3"]

In [None]:
# This of course doesn't work (it raises a KeyError), because there is no column labelled 3
df[3]

In [None]:
# But this one works: a slice inside the brackets selects ROWS, so it takes the first three rows
df[:3]

In [None]:
# You can achieve the same result more explicitly with .iloc
df.iloc[:3, :]

A faster way to get a scalar value:

In [None]:
# With row and column labels
df.at["row_3", "col_1"]

In [None]:
# Or with row and column positions
df.iat[3, 1]

In [None]:
# but this doesn't work (it is expected to raise an error):
# .at must select a single scalar, so it needs both a row label and a column label
df.at["row_3"]

In [None]:
# nor does this one (a slice would select several values).
# Again: with .at you can access only a single scalar value
df.at[:"row_3", "col_3"]

There are also some nice methods to sort data

In [None]:
# By default sort_index sorts the rows by their index labels (here in descending order)
df.sort_index(ascending=False)

In [None]:
# But with axis=1 it sorts the columns by their labels instead
df.sort_index(axis=1, ascending=False)

In [None]:
# And sort_values sorts the rows by the values of a given column
df.sort_values(by="col_2")