# Day 3. Introduction to pandas

### Importing modules

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

In [None]:
# IPython Notebook option to show plots in the notebook (not in a separate window)
%matplotlib inline

In [None]:
# Build a 10x5 DataFrame of random floats drawn uniformly from [0, 1),
# with columns named col_0 .. col_4.
arr = np.random.random(size=(10, 5))
df = pd.DataFrame(arr, columns=[f"col_{idx}" for idx in range(arr.shape[1])])
df

## Filtering data

Now let's try some more sophisticated data selection.

In [None]:
# Select rows from a pd.DataFrame for which 'col_0' has a value > 0.3
df[df["col_0"] > 0.3]

In [None]:
# Nicer way for the same operation
df.query("col_0 > 0.3")

In [None]:
# You can also access local variables and create more complicated expressions
threshold = 0.5
df.query("col_0 > @threshold or col_1 < @threshold")

### Adding new column

Let's look at how to create new columns in our dataset.

In [None]:
df["constant_column"] = 1
df.head()

In [None]:
df["copied_value"] = df["col_0"]
df.head()

In [None]:
df["copied_and_doubled"] = 2 * df["col_0"]
df.head()

It works similarly for other operations like adding, subtracting, and dividing.

In [None]:
df["combination_of_columns"] = df["col_0"] + 3 * df["col_3"]
df.head()

But what if we need to create a more complicated computation?

In [None]:
def custom_function(entire_row):
    """Describe how a row's 'col_2' value compares with its 'col_4' value.

    Args:
        entire_row: A mapping-like row (e.g. a ``pd.Series``) that can be
            indexed by the keys ``"col_2"`` and ``"col_4"``.

    Returns:
        ``"col_2 is larger than col_4"`` when ``col_2`` is strictly greater,
        otherwise ``"col_2 is not larger than col_4"`` (ties included).
    """
    col_2_is_larger = entire_row["col_2"] > entire_row["col_4"]
    return (
        "col_2 is larger than col_4"
        if col_2_is_larger
        else "col_2 is not larger than col_4"
    )


# axis=1 calls custom_function once per row, passing that row as `entire_row`
df["custom_column"] = df.apply(custom_function, axis=1)
df.head()

### Basic aggregations

Pandas also gives us a convenient way to check simple statistical properties all at once:

In [None]:
df.describe()

In [None]:
# Let's create a new column: one random integer from {0, 1, 2} per row
df["new_col"] = np.random.randint(low=0, high=3, size=len(df))
df

In [None]:
# Series has this cool method to inspect number of occurrences of each value
df["new_col"].value_counts()

In [None]:
# And we can list all unique values in a Series
df["new_col"].unique()

In [None]:
# Drop the column (columns=... is equivalent to passing the label with axis=1)
df.drop(columns="new_col", inplace=True)

In [None]:
df["col_1"].max()

In [None]:
df["col_1"].std()

In [None]:
df["col_2"].mean()

### Group aggregations

In [None]:
df.groupby("custom_column")["col_2"].max()

In [None]:
df.groupby("custom_column")["col_2"].mean()

In [None]:
# We can also create new columns with results of group aggregations
df["col_2_mean_in_group"] = df.groupby("custom_column")["col_2"].transform("mean")
df.head()

#### And let's have a look at some other basic data manipulations

In [None]:
df["new_col"] = df["col_1"] / df["col_0"].max()
df

In [None]:
# To drop columns we have to specify axis=1, default axis=0
df.drop("new_col", inplace=True, axis=1)
df

In [None]:
# Let's create a new column: one random integer from {0, 1, 2} per row
df["new_col"] = np.random.randint(low=0, high=3, size=len(df))
df

In [None]:
# And map its values to some new ones
my_map = {0: "a", 1: "b", 2: "c"}
df["new_col"] = df["new_col"].map(my_map)
df

### Iterating through a DataFrame

We can also iterate over rows or subframes:

In [None]:
%%timeit -n 100
for i, row in df.iterrows():
    df.loc[i, "col_0"] = row.loc["col_0"] * 2

You can also use `itertuples`, which is faster than `iterrows`:

In [None]:
%%timeit -n 100
for row in df.itertuples():
    df.loc[row.Index, "col_0"] = row.col_0 * 2

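A more idiomatic approach is `apply`, which applies a function along an axis without an explicit Python loop. Note that it still calls the function once per row, so for simple arithmetic like this a vectorized expression (`df["col_0"] * 2`) is the most efficient option: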

In [None]:
%%timeit -n 100
df.loc[:, "col_0"] = df.apply(lambda row: row["col_0"] * 2, axis=1)

In [None]:
# Iterating over a GroupBy yields (group label, sub-DataFrame) pairs
for label, sub_df in df.groupby("new_col"):
    print(f"Label: {label}")
    print(f"Subframe:\n{sub_df}\n")

## Combining DataFrames

You can combine DataFrames in multiple ways:

In [None]:
# Three small frames that share the columns A, B and C. Every cell holds
# "<column letter><number>", and the row indexes (0-2, 3-5, 6-8) do not overlap.
df_1 = pd.DataFrame(
    {col: [f"{col}{num}" for num in (0, 1, 2)] for col in "ABC"},
    index=[0, 1, 2],
)

df_2 = pd.DataFrame(
    {col: [f"{col}{num}" for num in (4, 5, 6)] for col in "ABC"},
    index=[3, 4, 5],
)

df_3 = pd.DataFrame(
    {col: [f"{col}{num}" for num in (8, 9, 10)] for col in "ABC"},
    index=[6, 7, 8],
)

In [None]:
# concat combines multiple DataFrames
df_concat = pd.concat([df_1, df_2, df_3])
print(df_concat)

In [None]:
# You can associate specific keys with each of the DataFrames
df_concat = pd.concat([df_1, df_2, df_3], keys=["df_1", "df_2", "df_3"])
print(df_concat)

We can also join DataFrames in a similar way to SQL tables:

In [None]:
df_1.loc[:, "key"] = ["K0", "K1", "K2"]
df_2.loc[:, "key"] = ["K1", "K1", "K3"]
print(f"df_1 =\n{df_1}\ndf_2 = \n{df_2}")

In [None]:
df_merged = pd.merge(df_1, df_2, on="key", how="inner")
print(df_merged)

In [None]:
# We can choose which DataFrame's keys are kept: how="right" uses the keys of df_2
df_merged = pd.merge(df_1, df_2, on="key", how="right")

In [None]:
# Missing values were filled with NaNs
print(df_merged)

In [None]:
# We can also keep the keys from both DataFrames (how="outer")
df_merged = pd.merge(df_1, df_2, on="key", how="outer")
print(df_merged)