In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
​
#You can find the data here: 
#https://github.com/fivethirtyeight/data/tree/master/college-majors
​
recent_grads = pd.read_csv("recent-grads.csv")
recent_grads.head(5)

In [None]:
# Preview the last five rows (tail() returns five rows by default).
recent_grads.tail()

In [None]:
# Display pandas' summary statistics for the DataFrame.
recent_grads.describe()

In [None]:
# Record the row count before cleaning so that the effect of dropping
# rows with missing values can be read off the two printed numbers.
rows_before = recent_grads.shape[0]
print("Original df row numbers:", rows_before)

# Drop every row that contains at least one NaN, then report the new
# row count for comparison with the one above.
recent_grads = recent_grads.dropna()
rows_after = recent_grads.shape[0]
print("Cleaned df row numbers:", rows_after)

The above cell tells us that there is one row containing NaN values.
In the cell below, we will use the pandas function `scatter_matrix` to examine the relationship between Unemployment_rate and ShareWomen.

In [None]:
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# module no longer exists in current pandas releases.
from pandas.plotting import scatter_matrix

# Pairwise scatter plots off the diagonal, histograms on the diagonal.
scatter_matrix(recent_grads[['ShareWomen', 'Unemployment_rate']])

The point of this exercise will be to create a custom plot that looks exactly like the one Pandas generated in order to gain a better understanding of the abstraction levels in matplotlib.

In [None]:
# Rebuild the scatter matrix by hand on a 2x2 grid of Axes:
#   ax1 (top-left):     histogram of ShareWomen
#   ax2 (top-right):    Unemployment_rate (x) vs. ShareWomen (y)
#   ax3 (bottom-left):  ShareWomen (x) vs. Unemployment_rate (y)
#   ax4 (bottom-right): histogram of Unemployment_rate
fig = plt.figure(figsize=(8,6))

ax1 = fig.add_subplot(2,2,1)
ax2 = fig.add_subplot(2,2,2)
ax3 = fig.add_subplot(2,2,3)
ax4 = fig.add_subplot(2,2,4)

share_women = recent_grads['ShareWomen']
unemployment = recent_grads['Unemployment_rate']

ax1.hist(share_women)
ax2.scatter(unemployment, share_women)
ax3.scatter(share_women, unemployment)
ax4.hist(unemployment)

# Hide the x-axes on the top row; the bottom row carries the x labels.
ax1.get_xaxis().set_visible(False)
ax2.get_xaxis().set_visible(False)

# Hide the y-axes on the right column; the left column carries the y labels.
ax2.get_yaxis().set_visible(False)
ax4.get_yaxis().set_visible(False)

# Use the column names as axis labels on the outer edges of the grid.
ax1.set_ylabel("ShareWomen")
ax3.set_xlabel("ShareWomen")
ax3.set_ylabel("Unemployment_rate")
ax4.set_xlabel("Unemployment_rate")

# Tick positions are always set explicitly *before* the tick labels.
# Calling set_*ticklabels() on its own attaches the labels to whatever
# ticks matplotlib happened to choose, which can silently mislabel the axis.

# Subplot 1: y-axis runs from 0 to 30 with a tick every 5.
hist_ticks = [0,5,10,15,20,25,30]
ax1.set_ylim(0,30)
ax1.set_yticks(hist_ticks)
ax1.set_yticklabels(hist_ticks)
ax1.get_xaxis().tick_top()
ax1.get_yaxis().tick_left()

# Subplot 2: x-axis (Unemployment_rate) runs from 0.0 to 0.20.
ax2.set_xlim(0.0,0.20)

# Subplot 3: x-axis (ShareWomen) runs from 0.0 to 1.0 and y-axis
# (Unemployment_rate) from 0.0 to 0.20. The last tick of each axis is
# left out so it does not collide with the neighbouring subplot.
rate_ticks = [0.00, 0.05, 0.10, 0.15]
share_ticks = [0.0, 0.2, 0.4, 0.6, 0.8]
ax3.set_ylim(0.00,0.20)
ax3.set_xlim(0.0, 1.0)
ax3.set_yticks(rate_ticks)
ax3.set_yticklabels(rate_ticks)
ax3.set_xticks(share_ticks)
ax3.set_xticklabels(share_ticks, rotation=90)
ax3.get_xaxis().tick_bottom()
ax3.get_yaxis().tick_left()

# Subplot 4: x-axis (Unemployment_rate) runs from 0.0 to 0.20.
rate_ticks_full = [0.00, 0.05, 0.10, 0.15, 0.20]
ax4.set_xlim(0.00,0.20)
ax4.set_xticks(rate_ticks_full)
ax4.set_xticklabels(rate_ticks_full, rotation=90)
ax4.get_xaxis().tick_bottom()

# Remove the gaps between subplots so the four panels read as one matrix.
fig.subplots_adjust(wspace=0, hspace=0)

plt.show()

In [None]:
# A grouped bar plot of genders by major needs the male share as its own
# column, so derive it as Men divided by Total.
recent_grads['ShareMen'] = recent_grads['Men'].div(recent_grads['Total'])

In [None]:
import numpy as np

# Restrict the data to the "Arts" major category.
arts_df = recent_grads[recent_grads['Major_category'] == 'Arts']

fig = plt.figure(figsize=(8,8))
ax1 = fig.add_subplot(1,1,1)

# One bar group per row, so the positions always match the bar heights
# even if a major name were to appear more than once.
bar_width = 0.35
locs = np.arange(len(arts_df))
offset_locs = locs + bar_width

# The x positions are passed positionally: the old `left=` keyword is not
# accepted by current matplotlib (the parameter is named `x`).
# align='edge' anchors each bar at its left edge, so the two bars of a
# group sit side by side and meet exactly at `offset_locs`.
bar_1 = ax1.bar(locs, arts_df['ShareMen'].tolist(), width=bar_width,
                align='edge')

# Add ShareWomen next to ShareMen for a direct comparison.
bar_2 = ax1.bar(offset_locs, arts_df['ShareWomen'].tolist(),
                width=bar_width, color='green', align='edge')

# Put one tick at the centre of each group and label it with the major.
# Tick positions are set before the labels so each label is bound to the
# intended tick.
majors_list = arts_df['Major'].tolist()
ax1.set_xticks(offset_locs)
ax1.set_xticklabels(majors_list, rotation=90)

# Legend and grid.
ax1.legend((bar_1, bar_2), ("ShareMen", "ShareWomen"), loc="upper left")
ax1.grid(which='major', axis='both')

# Axis labels.
ax1.set_xlabel("Major")
ax1.set_ylabel("Frequency")

plt.show()

Some follow-up questions to answer with the data: Visualize the gender ratios for each major by creating a stacked box plot instead of a grouped bar plot. Practice generating histograms from scratch without relying on the Matplotlib method hist(). Practice generating box plots using Matplotlib only.

Write a function that takes in a DataFrame and a list of column names, and generates a scatter matrix for combinations of those columns.
    While the scatter matrix you generated in this guided project used 2 columns, how can you generalize the code to handle n columns?
        As n gets larger, how do you dynamically specify the figsize parameter when creating the Figure instance so the data visualization stays legible with more subplots?