# Pandas for Data Analysis: Plotting with Matplotlib, Pandas, and Seaborn

## Outline:

* [Plotting using Packages](#Plotting-using-Packages)
  * [Matplotlib](#Matplotlib)
  * [Pandas](#Pandas)
  * [Seaborn](#Seaborn)
* [Plotting Data](#Plotting-Data)

## Plotting using Packages

### Matplotlib

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Call plot() with no data arguments; nothing is drawn beyond the empty axes.
plt.plot()

In [None]:
# Two evenly spaced ramps with numpy's default number of samples:
# x covers [0, 10] and y covers [0, 5].
x, y = np.linspace(0, 10), np.linspace(0, 5)

# Draw y against x as a labelled line, then render the legend for that label.
plt.plot(x, y, label='linear')
plt.legend()

In [None]:
# Four (x, y) points drawn as dark-green circular markers via the pyplot API.
x, y = [1, 7, 3, 4], [1, 3, 4, 5]
plt.scatter(x, y, marker='o', color='darkgreen')

In [None]:
# Scatter plot drawn through an explicit Axes object instead of pyplot state.
x, y = [1, 7, 3, 4], [1, 3, 4, 5]

_, ax = plt.subplots()
ax.scatter(x, y, marker='o', color='darkgreen')

In [None]:
x, y = [1, 2, 3], [3, 4, 5]

# One wide figure; the two axes of a 1x2 grid are added one at a time.
fig = plt.figure(figsize=(20, 10))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Vertical bars on the left axes, horizontal bars on the right axes.
ax1.bar(x, y)
ax2.barh(x, y)

In [None]:
x, y = [1, 2, 3], [3, 4, 5]

# Create the figure and both axes of the 1x2 grid in a single call.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

# Vertical bars on the left axes, horizontal bars on the right axes.
ax1.bar(x, y)
ax2.barh(x, y)

In [None]:
money = [1.5e5, 2.5e6, 5.5e6, 2.0e7]
# One integer bar position per amount: 0, 1, 2, 3.
x = np.arange(len(money))

plt.bar(x, money)
# Replace the numeric positions with the letters A-D on the x axis.
plt.xticks(x, list('ABCD'))

In [None]:
# 1000 samples from the standard normal distribution (unseeded, so the draw
# differs on every run), shown as a histogram with matplotlib's default bins.
gaussian_numbers = np.random.randn(1000)
plt.hist(gaussian_numbers)

In [None]:
# 1000 standard-normal samples summarised as a single box plot.
# `data` is reused by the following cell.
data = np.random.randn(1000)
plt.boxplot(data)

In [None]:
# Passing a list of arrays draws one box per array; here the same `data`
# array (defined in the previous cell) is shown three times side by side.
plt.boxplot([data, data, data])

In [None]:
# Fix the seed so the four samples are reproducible between runs.
np.random.seed(10)

# Draw 200 normal variates per sample, in d1..d4 order, from (mean, std) pairs.
d1, d2, d3, d4 = (
    np.random.normal(mean, std, 200)
    for mean, std in [(100, 10), (80, 30), (90, 20), (70, 25)]
)

# The second panel shows the same samples in reverse order.
data_to_plot_1 = [d1, d2, d3, d4]
data_to_plot_2 = data_to_plot_1[::-1]

_, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

# Left panel: Sample1..Sample4.
ax1.boxplot(data_to_plot_1)
ax1.set_xticklabels(['Sample{}'.format(i) for i in range(1, 5)])

# Right panel: Sample4..Sample1.
ax2.boxplot(data_to_plot_2)
ax2.set_xticklabels(['Sample{}'.format(i) for i in range(4, 0, -1)])

### Pandas

In [None]:
import pandas as pd

In [None]:
# Two-column frame: x ramps over [0, 10] and y over [0, 5], 50 points each
# (50 is numpy's default sample count, spelled out here).
df = pd.DataFrame({
    'x': np.linspace(0, 10, num=50),
    'y': np.linspace(0, 5, num=50),
})
# Line plot of y against x with a dot marker on every sample.
df.plot.line(x='x', y='y', marker='.')

In [None]:
# Four points held as the columns of a DataFrame.
df = pd.DataFrame({'x': [1, 7, 3, 4], 'y': [1, 3, 4, 5]})
# Scatter plot of column y against column x.
df.plot(kind='scatter', x='x', y='y')

In [None]:
df = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})

_, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))

# Vertical bars on the left axes, horizontal bars on the right; legends hidden.
df.plot(kind='bar', x='x', y='y', ax=ax1, legend=False)
df.plot(kind='barh', x='x', y='y', ax=ax2, legend=False)

In [None]:
df = pd.DataFrame(data={
    'x': [1, 2, 3, 4],
    'money': [1.5e5, 2.5e6, 5.5e6, 2.0e7]
})

df.plot.bar(x='x', y='money', legend=False)
# A pandas bar plot places its bars at integer positions 0..len(df)-1.
# Derive the tick positions from the frame itself instead of relying on an
# `x` variable left over from an earlier cell, so this cell works on its own.
plt.xticks(range(len(df)), ('A', 'B', 'C', 'D'))

In [None]:
# Single column of 1000 standard-normal samples (unseeded).
df = pd.DataFrame({'x': np.random.randn(1000)})
# One histogram per column of the frame.
df.hist()

In [None]:
# Four columns of 200 normal variates each, drawn in d1..d4 order from the
# listed (mean, standard deviation) parameters.
df = pd.DataFrame({
    name: np.random.normal(mean, std, 200)
    for name, mean, std in [
        ('d1', 100, 10),
        ('d2', 80, 30),
        ('d3', 90, 20),
        ('d4', 70, 25),
    ]
})
# One box per listed column.
df.boxplot(column=['d1', 'd2', 'd3', 'd4'])

### Seaborn

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Two-column frame: x ramps over [0, 10] and y over [0, 5], 50 points each
# (50 is numpy's default sample count, spelled out here).
df = pd.DataFrame({
    'x': np.linspace(0, 10, num=50),
    'y': np.linspace(0, 5, num=50),
})
# Seaborn line plot of column y against column x.
sns.lineplot(x='x', y='y', data=df)

In [None]:
# Four points held as the columns of a DataFrame.
df = pd.DataFrame({'x': [1, 7, 3, 4], 'y': [1, 3, 4, 5]})
# Seaborn scatter plot of column y against column x.
sns.scatterplot(x='x', y='y', data=df)

In [None]:
df = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})

_, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))

# Left axes: vertical bars of y for each x category.
sns.barplot(x='x', y='y', data=df, ax=ax1)
# Right axes: horizontal bars with the roles of the columns swapped;
# `order` lists the categories as 3, 2, 1.
sns.barplot(x='y', y='x', data=df, orient='h', order=[3, 2, 1], ax=ax2)

In [None]:
df = pd.DataFrame(data={
    'x': [1, 2, 3, 4],
    'money': [1.5e5, 2.5e6, 5.5e6, 2.0e7]
})

sns.barplot(data=df, x='x', y='money')
# Seaborn places categorical bars at integer positions 0..n-1. Derive the
# tick positions from the frame itself instead of relying on an `x` variable
# left over from an earlier cell, so this cell works on its own.
plt.xticks(range(len(df)), ('A', 'B', 'C', 'D'))

In [None]:
df = pd.DataFrame(data={
    'x': np.random.randn(1000)
})
# sns.distplot is deprecated and has been removed from recent seaborn
# releases. histplot with a KDE overlay on a density-normalised axis is the
# documented replacement for distplot's default histogram-plus-KDE figure.
sns.histplot(data=df, x='x', kde=True, stat='density')

In [None]:
# Four columns of 200 normal variates each, drawn in d1..d4 order from the
# listed (mean, standard deviation) parameters.
df = pd.DataFrame({
    name: np.random.normal(mean, std, 200)
    for name, mean, std in [
        ('d1', 100, 10),
        ('d2', 80, 30),
        ('d3', 90, 20),
        ('d4', 70, 25),
    ]
})
# Passing the whole frame draws one box per column.
sns.boxplot(data=df)

## Plotting Data

### Dataset 1: Amazon Review

In [None]:
%matplotlib inline

import gzip
import json

# The file holds one JSON document per line. Open it in text mode with an
# explicit encoding (the default would depend on the machine's locale) and
# use a context manager so the gzip handle is closed once the lines are read.
with gzip.open('data/reviews_Digital_Music_5.json.gz', 'rt', encoding='utf-8') as review_file:
    music_review_lines = review_file.readlines()

# Parse every line into a dict and build one DataFrame row per review.
df = pd.DataFrame([json.loads(line) for line in music_review_lines])

In [None]:
# Interpret the stored integers as seconds since the Unix epoch and replace
# the column with proper timestamps.
df['unixReviewTime'] = pd.to_datetime(df['unixReviewTime'], unit='s')
# Calendar quarter (1-4) of each review timestamp.
df['quarter'] = df['unixReviewTime'].dt.quarter

กราฟแสดงจำนวนรีวิวในแต่ละปี

In [None]:
# Count the non-null reviewerID entries in each calendar year of the review
# timestamp and draw the yearly totals as a wide line plot.
(
    df.groupby(df.unixReviewTime.dt.year)['reviewerID']
    .count()
    .plot(figsize=(18, 6))
)

กราฟเปรียบเทียบค่า rating เฉลี่ยในแต่ละวันของสัปดาห์ระหว่างปี 2013 และ 2014

In [None]:
import matplotlib.pyplot as plt

# Mean of every numeric column per (year, day-of-week) pair.
# numeric_only=True keeps text and list columns out of the aggregation;
# recent pandas raises a TypeError for them instead of dropping them silently.
by_weekday = df.groupby(
    [df.unixReviewTime.dt.year, df.unixReviewTime.dt.dayofweek]
).mean(numeric_only=True)

# Plot only the rating so the figure matches its title; otherwise every
# numeric column (for example the derived 'quarter') is drawn as well.
# NOTE(review): 'overall' is the star-rating field of the Amazon review
# schema -- confirm against df.columns.
rating_by_weekday = by_weekday['overall']

weekday_labels = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']

_, ax = plt.subplots(1, 2, figsize=(16, 6))

for year, axis in zip((2013, 2014), ax):
    rating_by_weekday.loc[year].plot(
        title='Average Reviews Rating by Day of Week ({})'.format(year),
        ax=axis,
    )
    # dayofweek runs from 0 (Monday) to 6 (Sunday). Pin one tick per day
    # before labelling; relabelling the automatic ticks would misalign the
    # day names with the data.
    axis.set_xticks(range(len(weekday_labels)))
    axis.set_xticklabels(weekday_labels)

กราฟเปรียบเทียบจำนวนรีวิวต่อวันของเดือนระหว่างปี 2012, 2013 และ 2014

In [None]:
import matplotlib.pyplot as plt

# Number of reviews per (year, day-of-month) pair. The name `by_month` is
# kept for compatibility, but the second grouping key is the day of the
# month (dt.day), and the aggregation is a count rather than an average.
by_month = df.groupby([df.unixReviewTime.dt.year, df.unixReviewTime.dt.day])['reviewerID'].count()

_, ax = plt.subplots(1, 3, figsize=(18, 6))

# One panel per year; the titles describe what is actually plotted.
for year, axis in zip((2012, 2013, 2014), ax):
    by_month.loc[year].plot(
        title='Number of Reviews by Day of Month ({})'.format(year),
        ax=axis,
    )

### Dataset 2: Titanic

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

In [None]:
# Download the Titanic passenger table as CSV straight from the URL and
# parse it into a DataFrame (requires network access when the cell runs).
titanic_data_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
titanic = pd.read_csv(titanic_data_url)

In [None]:
# Default DataFrame plot (a line per column, against the row index).
titanic.plot(figsize=(15, 6))

In [None]:
# Grid of histograms, one per column that pandas can histogram.
titanic.hist(figsize=(15, 10))

In [None]:
# Same histogram grid, produced separately for each value of the 'sex' column.
titanic.groupby('sex').hist(figsize=(10, 10))

In [None]:
# Box plot of 'age', with one box per value of 'survived'.
titanic.boxplot(column=['age'], by='survived', figsize=(6, 6))

In [None]:
# Box plot of 'pclass', with one box per value of 'survived'.
titanic.boxplot(column=['pclass'], by='survived', figsize=(6, 6))

In [None]:
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# module was deprecated and later removed, so importing from it fails on
# current pandas.
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the columns, with a kernel density estimate of
# each column on the diagonal; alpha makes overlapping points readable.
scatter_matrix(titanic, alpha=0.2, figsize=(15, 15), diagonal='kde')

ลอง Seaborn

In [None]:
import seaborn as sns

In [None]:
# sns.distplot is deprecated and has been removed from recent seaborn
# releases. histplot with a KDE overlay on a density-normalised axis is the
# documented replacement. Missing ages are dropped before plotting, as before.
sns.histplot(x=titanic.age.dropna(), kde=True, stat='density')

In [None]:
# Bar plot of 'survived' for each 'sex', with one bar per 'pclass' value
# inside each group.
sns.barplot(data=titanic, x='sex', y='survived', hue='pclass')

In [None]:
# Same sex/survived/pclass bar plot, coloured with a four-step cubehelix palette.
sns.barplot(
    data=titanic,
    x='sex',
    y='survived',
    hue='pclass',
    palette=sns.cubehelix_palette(4, start=0.5, rot=-.75),
)

In [None]:
# Horizontal count of rows per 'embarked' value, split by 'pclass'.
# The trailing semicolon suppresses the notebook's text output for the cell.
sns.countplot(data=titanic, y='embarked', hue='pclass', palette='Greens_d');

In [None]:
# Strip plot of fare against age, coloured by the 'survived' column; the
# three columns are passed directly as Series.
sns.stripplot(
    x=titanic['age'],
    y=titanic['fare'],
    hue=titanic['survived'],
)

In [None]:
# Grid of panels: one row per 'sex' value, one column per 'survived' value.
g = sns.FacetGrid(data=titanic, row='sex', col='survived')
# Histogram of 'age' inside every panel.
g.map(plt.hist, 'age')

In [None]:
# Grid of panels: one row per 'sex' value, one column per 'pclass' value.
g = sns.FacetGrid(data=titanic, row='sex', col='pclass')
# Histogram of 'survived' inside every panel.
g.map(plt.hist, 'survived')

In [None]:
# Grid of panels: one row per 'sex' value, one column per 'survived' value.
g = sns.FacetGrid(data=titanic, row='sex', col='survived')
# Scatter of 'parch' against 'age' inside every panel.
g.map(plt.scatter, 'age', 'parch')

In [None]:
# Grid of panels: one row per 'sex' value, one column per 'survived' value.
g = sns.FacetGrid(data=titanic, row='sex', col='survived')
# Scatter of 'parch' against 'age' with a fitted regression line in every panel.
g.map(sns.regplot, 'age', 'parch')