# Lecture 13 Visualization with Matplotlib

Learning Objectives: 
* Create visualizations using both the `Axes` and `pyplot` interfaces.
* Handle subplots effectively.
* Read documentation to adjust visualizations.

Visualization modules in Python: 
* Matplotlib
* Seaborn
* Plotly

In [None]:
import pandas as pd
import matplotlib
from matplotlib.patches import Rectangle, Circle
import matplotlib.pyplot as plt
# pyplot is a high-level sub-module of matplotlib

In [None]:
# Load the dataset from 'vgsales.csv' (relative path, so it is resolved against
# the notebook's working directory).
df = pd.read_csv('vgsales.csv')
# Last expression of the cell: the notebook renders the first rows as a preview.
df.head()

In [None]:
# Print a structural summary of the DataFrame (pandas `DataFrame.info`).
df.info()

In [None]:
# Summary statistics for the DataFrame (pandas `DataFrame.describe`);
# as the last expression of the cell, the result is rendered by the notebook.
df.describe()

## [Plot Types](https://matplotlib.org/devdocs/plot_types/index.html)
* Pairwise data
    * Barplot (comparison)
    * Scatter plot (relationship)
    * Line plot (trend)
* Statistical distributions
    * Histogram
    * Boxplot

### Barplot: Number of games by genre

In [None]:
# `??` is a fill-in blank in this lecture skeleton; the cell does not run until
# it is replaced. Per the section heading, the result should hold the number of
# games per genre — TODO confirm once filled in.
genre_count = df['Genre'].??
genre_count

In [None]:
# Pandas syntax
# `??` marks fill-in blanks: first a pandas plotting call on `genre_count`,
# then a method call on the Axes it returns.
ax = genre_count.?? # returns an Axes object that represents the plot within a figure
ax.??
# Last expression of the cell: shows the class of the object pandas returned.
type(ax)

In [None]:
# pyplot interface (function-based, implicit)
# `??` is a fill-in blank for the arguments of the bar plot.
plt.bar(??)
plt.xlabel('Genre')
plt.ylabel('Count')
# Rotate the x tick labels by 90 degrees.
plt.xticks(rotation=90)
# Grid lines for the y axis only.
plt.grid(axis='y')

### Barplot: global sales by genre

In [None]:
# `??` is a fill-in blank. Per the section heading, the result should hold
# global sales aggregated by genre — TODO confirm once filled in.
# Only the 'Genre' and 'Global_Sales' columns are kept before aggregating.
global_sales_by_genre = df[['Genre', 'Global_Sales']].??
global_sales_by_genre

In [None]:
# Bar chart of global sales per genre via the pandas plotting accessor;
# the legend is switched off and the y axis is labelled in millions.
global_sales_by_genre.plot(
    kind='bar',
    ylabel='Global Sales (million)',
    legend=False,
)

### Subplots: barplot of sales by genre for each region

In [None]:
# Axes interface (object-based, explicit)
# returns a Figure and one or more Axes objects
# `??` is a fill-in blank for the pyplot call that creates the figure and axes.
fig, axes = plt.??
# `.shape` is read here, so `axes` is expected to be an array of Axes
# rather than a single Axes object.
axes.shape

In [None]:
# Region prefixes used to build the per-region sales column names.
regions = ['NA', 'EU', 'JP', 'Other']
region_cols = [f'{region}_Sales' for region in regions]
print(region_cols)
# Sum each regional sales column within every genre (index: Genre).
df_by_genre = df.groupby('Genre')[region_cols].sum()
df_by_genre

In [None]:
# Create a 2x2 grid and draw into the top-left panel only, to show how an
# individual Axes in the grid is addressed.
fig, axes = plt.subplots(nrows=2, ncols=2)
axes[0, 0].barh(y=df_by_genre.index, width=df_by_genre['NA_Sales'])

In [None]:
# 2x2 grid of subplots; the loop below fills one panel per entry in `regions`.
fig, axes = plt.subplots(2, 2)
# `??` is a fill-in blank. The loop indexes `axes[i]` with a single integer,
# so this step presumably turns the 2x2 array into a 1-D sequence — TODO confirm.
axes = axes.??()

# Assign each genre a fixed colour taken from the 'tab20' colormap.
# NOTE(review): `colors` is not used in the visible lines; presumably it is
# meant for the `barh(??)` blank below.
genres = df['Genre'].unique()
cmap = matplotlib.colormaps.get_cmap('tab20')
colors = {genres[i]: cmap(i) for i in range(len(genres))}

for i in range(len(regions)): 
    # Sort this region's per-genre totals before plotting.
    region_sales = df_by_genre[region_cols[i]].sort_values()
    axes[i].barh(??)
    axes[i].set_title(??)
    axes[i].grid(axis='x')
# Adjust the horizontal spacing between subplot columns.
plt.subplots_adjust(wspace=0.25)

### Lineplot: global sales by year

In [None]:
# Total global sales for each release year (index: Year, single column).
global_sales_by_year = df.groupby('Year')[['Global_Sales']].sum()
global_sales_by_year

In [None]:
# Pandas plotting call on the yearly totals; `??` is a fill-in blank for the
# remaining keyword argument(s).
global_sales_by_year.plot(ylabel='Global Sales (million)', ??)
plt.show()

In [None]:
# pyplot version of the yearly sales line plot; `??` is a fill-in blank for the
# data passed to `plt.plot`. Axis labels are set explicitly here.
plt.plot(??)
plt.xlabel('Year')
plt.ylabel('Global Sales (million)')
plt.show()

In [None]:
# NOTE(review): `genres` is assigned here but not referenced in the visible
# lines of this cell.
genres = df['Genre'].unique()

# `??` marks fill-in blanks. `ax` must end up as an Axes, since `.plot` and
# `.add_patch` are called on it below.
ax = plt.??()
# One line per genre, limited to the first five index entries of
# `global_sales_by_genre`.
for g in global_sales_by_genre.index[:5]:
    subset = ??
    subset_by_year = ??
    ax.plot(??)

# Semi-transparent light-blue patch with no edge; zorder=-1 is presumably meant
# to keep it behind the plotted lines — TODO confirm.
rect = ??(??, edgecolor='none', facecolor='lightblue', alpha=0.5, zorder=-1)
ax.add_patch(rect)

plt.xlabel("Year")
plt.ylabel("Global Sales (million)")
# NOTE(review): legend entries presumably come from labels passed in the
# `ax.plot(??)` blank above — confirm when filling it in.
plt.legend(title="Genres")
plt.show()

In [None]:
# Ten rows with the highest global sales, largest first.
df.sort_values(by='Global_Sales', ascending=False).head(n=10)

### Histogram: global sales

In [None]:
# Histogram of global sales over all games, using 10 bins.
df['Global_Sales'].hist(bins=10)
# Label both axes on the current Axes in one call.
plt.gca().set(xlabel="Global Sales (million)", ylabel="Count")
plt.show()

In [None]:
# Same histogram restricted to games with global sales below 2, using 20 bins.
df.loc[df['Global_Sales'] < 2, 'Global_Sales'].hist(bins=20)
plt.xlabel(xlabel="Global Sales (million)")
plt.ylabel(ylabel="Count")
plt.show()

In [None]:
# pyplot version of the restricted histogram. No `bins` argument is passed
# here, so matplotlib's default bin count applies.
plt.hist(x=df['Global_Sales'].loc[lambda sales: sales < 2])
plt.xlabel(xlabel="Global Sales (million)")
plt.ylabel(ylabel="Count")
plt.show()

### Boxplot: sales by publisher

In [None]:
# Total global sales per publisher, ordered from highest to lowest.
sales_by_publisher = (
    df.groupby('Publisher')[['Global_Sales']]
    .sum()
    .sort_values(by='Global_Sales', ascending=False)
)
sales_by_publisher

In [None]:
# `??` is a fill-in blank for the row selection. Going by the variable name,
# the result should keep only rows of the top 10 publishers — TODO confirm.
top10_publisher_df = df[??]
top10_publisher_df

In [None]:
# Horizontal boxplot of global sales, one box per publisher (pandas interface).
top10_publisher_df.boxplot(
    column='Global_Sales',
    by='Publisher',
    vert=False,
    xlabel='Sales (million)',
)
# Blank out the Axes title and set a figure-level title instead
# (presumably replacing titles pandas adds automatically — TODO confirm).
plt.title('')
plt.suptitle('Global Sales by Publisher')
# Restrict the x axis to the 0-6 range.
plt.xlim(left=0, right=6)

In [None]:
# `??` marks fill-in blanks to be completed.
top10_publisher_names = ??
# One entry is appended per publisher, in the same order as
# `top10_publisher_names`, so the tick labels passed below line up with the data.
records_by_publisher = []
for publisher in top10_publisher_names:
    records_by_publisher.append(??)
# Horizontal boxplot, one box per publisher.
# NOTE(review): `tick_labels` is a newer matplotlib keyword (older releases
# call it `labels`) — confirm the installed matplotlib version.
plt.boxplot(records_by_publisher, tick_labels=top10_publisher_names, vert=False)

# Add one annotation per publisher; `i` is the publisher's position in the list.
for i, publisher in enumerate(top10_publisher_names): 
    top1_game = ??
    plt.annotate(??)
    
plt.xlabel('Sales (million)')
plt.ylabel('Publisher')
plt.title('Global Sales by Publisher')