# Lab | Matplotlib & Seaborn

#### Import all the necessary libraries here:

In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Challenge 1

#### The data we will use in this challenge is:

In [None]:
# Sample data for Challenge 1: the integers 0..99, their doubles and their squares
x = np.arange(100)
y = 2 * x
z = np.square(x)

#### Plot (x, y) and (x, z).
There are two ways of doing this. Do it both ways.

**Hint**: Check out the nrows, ncols and index arguments of subplots. 

Also, play around with the linewidth and style. Use the ones you're most happy with.

In [None]:
# Way 1: draw each curve on its own figure, one after the other.
# Each entry pairs the y-values with the line styling used for that curve.
curves = (
    (y, {'linewidth': 2, 'color': 'green', 'linestyle': 'dashdot'}),
    (z, {'linewidth': 4, 'color': 'red', 'linestyle': 'dashed'}),
)

for values, line_style in curves:
    plt.plot(x, values, **line_style)
    plt.show()

#### Use plt.subplots(nrows=1, ncols=2) to create the plot.

In [None]:
# Way 2: a single figure holding two side-by-side axes
fig, axes = plt.subplots(1, 2)
left_ax, right_ax = axes

left_ax.plot(x, y, color='green', linestyle='dashdot', linewidth=2)
right_ax.plot(x, z, color='red', linestyle='dashed', linewidth=4)

plt.show()

#### Use your previous code but now, resize your plot.
**Hint**: Add the figsize argument in plt.subplots().

If you want, try to add a title to the plot or even axes labels. You can also play with the fontweight and fontsize of the titles and labels. 

In [None]:
# Same two curves as before, on a wider 13x4 figure with titles and axis labels.
# The shared font settings are collected once and unpacked into each call.
label_font = {'fontweight': 'bold', 'size': 10}
title_font = {'fontweight': 'bold', 'fontstyle': 'italic', 'size': 18}

fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(13, 4))

left_ax.plot(x, y, color='green', linestyle='dashdot', linewidth=2)
left_ax.set_title('Very important plot', **title_font)
left_ax.set_xlabel('plot 1 x axis', **label_font)
left_ax.set_ylabel('plot 1 y axis', **label_font)

right_ax.plot(x, z, color='red', linestyle='dashed', linewidth=4)
right_ax.set_title('Even more important plot', **title_font)
right_ax.set_xlabel('plot 2 x axis', **label_font)
right_ax.set_ylabel('plot 2 y axis', **label_font)

# Widen the horizontal gap between the two axes
fig.subplots_adjust(wspace=0.25)

plt.show()

#### Plot both $y=x^2$ and $y=e^x$ in the same plot using normal and logarithmic scale.
**Hint**: Use `set_xscale` and `set_yscale`.

In [None]:
# Plot y = x^2 and y = e^x together, once on linear axes and once on log axes.
# The curves get their own names so the global `y` from the data cell is not
# overwritten (re-running earlier cells would otherwise plot different data).
y_square = x ** 2
y_exp = np.exp(x)

fig, (linear_ax, log_ax) = plt.subplots(nrows=1, ncols=2, figsize=(13, 4))

# Both axes show both curves; only the axis scaling differs between them
for ax, scale_name in ((linear_ax, 'normal scale'), (log_ax, 'logarithmic scale')):
    ax.plot(x, y_square, linewidth=2, color='green', linestyle='dashdot')
    ax.plot(x, y_exp, linewidth=4, color='red', linestyle='dashed')
    ax.set_xlabel(f'{scale_name} x axis', fontweight='bold', size=10)
    ax.set_ylabel(f'{scale_name} y axis', fontweight='bold', size=10)
    ax.set_title(scale_name, fontweight='bold', fontstyle='italic', size=18)

# NOTE: x starts at 0, which has no logarithm; how that first point is drawn
# depends on matplotlib's `nonpositive` handling for log scales.
log_ax.set_xscale('log')
log_ax.set_yscale('log')

# Widen the horizontal gap between the two axes
fig.subplots_adjust(wspace=0.25)

plt.show()

#### As a bonus challenge, try to add a legend to the plot.

In [None]:
# Plot y = x^2 and y = e^x together on linear and on log axes, with a legend.
# The curves get their own names so the global `y` from the data cell is not
# overwritten (re-running earlier cells would otherwise plot different data).
y_square = x ** 2
y_exp = np.exp(x)

fig, (linear_ax, log_ax) = plt.subplots(nrows=1, ncols=2, figsize=(13, 4))

# Both axes show both curves; only the axis scaling differs between them
for ax, scale_name in ((linear_ax, 'normal scale'), (log_ax, 'logarithmic scale')):
    ax.plot(x, y_square, linewidth=2, color='green', linestyle='dashdot', label='y = x ** 2')
    ax.plot(x, y_exp, linewidth=4, color='red', linestyle='dashed', label='y = np.exp(x)')
    ax.set_xlabel(f'{scale_name} x axis', fontweight='bold', size=10)
    ax.set_ylabel(f'{scale_name} y axis', fontweight='bold', size=10)
    ax.set_title(scale_name, fontweight='bold', fontstyle='italic', size=18)
    # Anchor the legend in the top-left corner instead of hand-tuning an offset,
    # so it stays in place regardless of how many entries it holds
    ax.legend(loc='upper left')

# NOTE: x starts at 0, which has no logarithm; how that first point is drawn
# depends on matplotlib's `nonpositive` handling for log scales.
log_ax.set_xscale('log')
log_ax.set_yscale('log')

# Widen the horizontal gap between the two axes
fig.subplots_adjust(wspace=0.25)

plt.show()

## Challenge 2
#### Import the `Fitbit2` dataset and store it in a variable called `fitbit`. You can find the dataset in Ironhack's database:
* db: `fitbit`
* table: `fitbit2`

In [None]:
# Read the Fitbit data from the CSV file (path is relative to the working directory)
fitbit = pd.read_csv(filepath_or_buffer='Fitbit2.csv')

# Preview the first five rows
fitbit.head(n=5)

#### From the Fitbit data, we want to visually understand:

How does the average number of steps change by month? Use the appropriate visualization to show the median steps by month. Is the Fitbitter more active on weekends or on workdays? All plots must be in the same Jupyter notebook cell.

**Hints**:

* Use the `Months_encoded` and `Work_or_Weekend` columns.
* Use the matplotlib.pyplot object-oriented API.
* Set your size figure to 12,4
* Explore plt.sca
* Explore plt.xticks
* Save your figures in a folder called `figures` in your repo. 

In [None]:
import os

# savefig does not create missing directories, so make sure the target exists
os.makedirs('figures', exist_ok=True)

# Calculate steps by months (mean).
# 'Months_encoded' is part of the grouping only so the rows can be sorted by it.
steps_month_mean = (
    fitbit.groupby(['Months', 'Months_encoded'])['Steps']
    .mean()
    .reset_index()
    .sort_values('Months_encoded')
)

# Calculate steps by work or weekend (mean), sorted so that code 1 comes first
steps_work_we_mean = (
    fitbit.groupby('Work_or_Weekend')['Steps']
    .mean()
    .reset_index()
    .sort_values('Work_or_Weekend', ascending=False)
)
# Replace the numeric code with a readable label (0 -> 'Weekend', 1 -> 'Work')
steps_work_we_mean['Work_or_Weekend'] = steps_work_we_mean['Work_or_Weekend'].map(
    {0: 'Weekend', 1: 'Work'}
)

# Plot average steps per month
plt.figure(figsize=(12, 4))
plt.bar(steps_month_mean['Months'], steps_month_mean['Steps'])
plt.title('Average steps by months', fontstyle='italic', size=18)
plt.ylabel('Steps - mean', fontweight='bold', size=10)
plt.xticks(rotation=90)
# bbox_inches='tight' keeps the rotated tick labels inside the saved image
plt.savefig('figures/steps_month_mean', bbox_inches='tight')
plt.show()

# Plot average steps by work or weekend
plt.figure(figsize=(12, 4))
plt.bar(steps_work_we_mean['Work_or_Weekend'], steps_work_we_mean['Steps'])
plt.title('Average steps by work or weekend', fontstyle='italic', size=18)
plt.ylabel('Steps - mean', fontweight='bold', size=10)
plt.xticks(size=14)
plt.savefig('figures/steps_work_we_mean', bbox_inches='tight')
plt.show()

#### Write a loop to plot 3 scatter plots of the following features:

* Minutes Lightly Active vs Steps    
* Minutes Very Active vs Steps    
* Minutes Sedentary vs Steps  

In [None]:
# One scatter plot per activity measure, each against the step count.
# Points are coloured by their step count and a colour bar shows that scale.
features = [
    'Minutes Lightly Active',
    'Minutes Very Active',
    'Minutes Sedentary',
]
steps = fitbit['Steps']

for column in features:
    plt.scatter(fitbit[column], steps, c=steps)
    plt.title(f'{column} vs. Steps', fontstyle='italic', size=18)
    plt.xlabel(column, fontweight='bold', size=10)
    plt.ylabel('Steps made', fontweight='bold', size=10)
    plt.colorbar()
    plt.show()

## Challenge 3

#### Import the `titanic` dataset and store it in a variable called `titanic`. You can find the dataset in Ironhack's database:
* db: `titanic`
* table: `titanic`

In [None]:
# Read the titanic data from the CSV file (path is relative to the working directory)
titanic = pd.read_csv(filepath_or_buffer='titanic.csv')

# Preview the first five rows
titanic.head(n=5)

#### Explore the titanic dataset using Pandas dtypes.

In [None]:
# Show the dtype pandas assigned to each column of the titanic DataFrame
titanic.dtypes

#### What are your numerical variables? What are your categorical variables?
**Hint**: Use Pandas select_dtypes.

In [None]:
# Numerical variables: preview of the columns whose dtype is numeric

numeric_columns = titanic.select_dtypes(include='number')
numeric_columns.head()

In [None]:
# Categorical variables: preview of the columns whose dtype is NOT numeric

non_numeric_columns = titanic.select_dtypes(exclude='number')
non_numeric_columns.head()

#### Set the plot style to classic and the figure size to (12,6).
**Hint**: To set the style you can use matplotlib or seaborn functions. Do some research on the matter.

In [None]:
# Apply the classic style, then make (12, 6) the default size of every figure
# created afterwards. A bare plt.figure(figsize=...) here would only open one
# empty figure and leave the plots in the following cells at the default size.
# The size is set after the style so it takes precedence over whatever the
# style sheet itself specifies for figure.figsize.
plt.style.use('classic')
plt.rcParams['figure.figsize'] = (12, 6)

#### Use the right visualization to show the distribution of column `Age`.

In [None]:
# Histogram of passenger ages; the number of bins is the largest age
# truncated to an integer
ages = titanic['Age']
bin_count = int(ages.max())

plt.hist(ages, bins=bin_count)
plt.xlabel('Number of years', size=10)
plt.ylabel('Number of passengers', size=10)
plt.title('Passenger age distribution', fontstyle='italic', size=18)
plt.show()

#### Use subplots and plot the distribution of the `Age`  with bins equal to 10, 20 and 50.

In [None]:
# Three side-by-side histograms of Age that differ only in the number of bins
fig, axes = plt.subplots(1, 3, figsize=(13, 4))

# y > 1 lifts the figure title above the individual panel titles
fig.suptitle('Passenger age distribution', fontweight='bold', fontstyle='italic', size=20, y=1.06)

# (bin count, bar colour) per panel; None keeps matplotlib's default colour
panels = ((10, None), (20, 'green'), (50, 'orange'))

for ax, (bin_count, bar_color) in zip(axes, panels):
    ax.hist(titanic['Age'], bins=bin_count, color=bar_color)
    ax.set_title(f'{bin_count} bins', fontstyle='italic', size=15)

# Only the left panel carries the y label and only the middle one the x label
axes[0].set_ylabel('Number of passengers', size=10, fontweight='bold')
axes[1].set_xlabel('Number of years', size=10, fontweight='bold')

plt.show()

#### How does the bin size affect your plot?

In [None]:
# ANSWER:
# The bin size determines the level of detail at which the data is displayed.
# The larger the bins (i.e. the fewer bins), the more the data is aggregated and
# the coarser the plot; smaller bins show more detail but also more noise.
# To use a reasonable size in our case, I would go for 50 bins.

#### Use seaborn to show the distribution of column `Age`.

In [None]:
# Seaborn histogram of Age with 2-year-wide bins, x axis limited to 0..80
age_grid = sns.displot(titanic['Age'], binwidth=2)
age_grid.set(xlim=(0, 80))
age_grid.set_axis_labels('Number of years', 'Number of passengers')
# y > 1 places the title slightly above the axes
age_grid.ax.set_title('Passenger age distribution', fontstyle='italic', size=18, y=1.04)
plt.show()

#### Use the right plot to visualize column `Gender`. There are 2 ways of doing it. Do it both ways.
**Hint**: Use matplotlib and seaborn.

In [None]:
# Method 1 - matplotlib
# Bin edges at -0.5, 0.5 and 1.5, i.e. two unit-wide bins centred on 0 and 1
# (presumably the positions matplotlib gives the two gender categories - TODO confirm)
gender_bin_edges = np.arange(3) - 0.5

plt.figure(figsize=(5, 5))
plt.hist(titanic['Gender'], bins=gender_bin_edges, color='orange')
plt.xlabel('Gender', fontweight='bold', size=18)
plt.ylabel('Number of passengers')
plt.title('Passenger gender distribution', fontstyle='italic', size=18, y=1.04)
plt.show()

In [None]:
# Method 2 - seaborn
# displot returns a grid; its single axes is styled directly
gender_grid = sns.displot(titanic['Gender'])
gender_ax = gender_grid.ax
gender_ax.set_xlabel('Gender', fontweight='bold', size=18)
gender_ax.set_ylabel('Number of passengers')
gender_ax.set_title('Passenger gender distribution', fontstyle='italic', size=18, y=1.04)
plt.show()

#### Use the right plot to visualize the column `Pclass`.

In [None]:
# Count how many passengers travelled in each class
pclass = titanic['Pclass'].value_counts()
class_labels, class_counts = pclass.index, pclass.values

# Bar chart with one bar per passenger class
plt.bar(class_labels, class_counts, color='purple')
# Show a tick only at the class positions 1, 2 and 3
plt.xticks([1, 2, 3])
plt.xlabel('Passenger classes', fontweight='bold', size=14)
plt.ylabel('Number of passengers', size=14)
plt.title('Passengers by class', fontstyle='italic', size=20, y=1.04)
plt.show()

#### We would like to have in one single plot the summary statistics of the feature `Age`. What kind of plot would you use? Plot it. 

In [None]:
# Vertical box plot of Age on a 5x6 figure
plt.figure(figsize=(5, 6))
age_box_ax = sns.boxplot(y=titanic['Age'], color='yellow')
age_box_ax.set_title("Summary stats titanic['Age']", size=20, y=1.04)
plt.show()

In [None]:
# ANSWER:
# I would use a boxplot because it shows many important statistical values at once.
# => minimum, 1st quartile, 2nd quartile (median), 3rd quartile, interquartile range, maximum, outliers

#### What does the last plot tell you about the feature `Age`?

In [None]:
# ANSWER:
# The passenger age range (maximum - minimum) is 0 to ~80 years.
# 50 percent of the passengers were between ~22 and ~38 (interquartile range).
# The data are slightly positively (right) skewed, with the median at ~29 years.
# There are some outliers among older people (~59 - 80).

#### Now in addition to the summary statistics, we want to have in the same plot the distribution of `Age`. What kind of plot would you use? Plot it. 

In [None]:
# Violin plot of Age on a 5x6 figure
plt.figure(figsize=(5, 6))
age_violin_ax = sns.violinplot(data=titanic['Age'], color='yellow')
age_violin_ax.set_title("Summary stats titanic['Age']", size=20, y=1.04)
plt.show()

#### What additional information does the last plot provide about feature `Age`?

In [None]:
# ANSWER:
# In addition to the previous information, we now also see the age distribution.

#### We suspect that there is a linear relationship between `Fare` and `Age`. Use the right plot to show the relationship between these 2 features. There are 2 ways, please do it both ways.
**Hint**: Use matplotlib and seaborn.

In [None]:
# Method 1 - matplotlib

plt.figure(figsize=(5, 5))
plt.scatter(titanic['Age'], titanic['Fare'], color='orange')

# Style the axes that scatter just drew on
ax = plt.gca()
ax.set_title('Passenger age/fare relation', fontstyle='italic', size=18, y=1.04)
ax.set_xlabel('Passenger age', size=14)
ax.set_ylabel('Passenger fare', size=14)

# Remove negative start ticks: read both tick lists first, then keep only
# the ticks that are >= 0
current_xticks, current_yticks = ax.get_xticks(), ax.get_yticks()
ax.set_xticks([tick for tick in current_xticks if tick >= 0])
ax.set_yticks([tick for tick in current_yticks if tick >= 0])
plt.show()

In [None]:
# Method 2 - seaborn

plt.figure(figsize=(5, 5))
# Points are coloured by passenger class via `hue`
ax = sns.scatterplot(data=titanic, x='Age', y='Fare', hue='Pclass')
ax.set_title('Passenger age - fare relation', fontstyle='italic', size=18, y=1.04)
ax.set_xlabel('Passenger age', size=14)
ax.set_ylabel('Passenger fare', size=14)

# Remove negative start ticks: read both tick lists first, then keep only
# the ticks that are >= 0
current_xticks, current_yticks = ax.get_xticks(), ax.get_yticks()
ax.set_xticks([tick for tick in current_xticks if tick >= 0])
ax.set_yticks([tick for tick in current_yticks if tick >= 0])
plt.show()

#### Plot the correlation matrix using seaborn.

In [None]:
# PassengerId is dropped before the correlations are computed
titanic_matrix = titanic.drop(columns=['PassengerId'])

# Correlate the numeric columns only. Since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and raises on them instead; the
# dataset presumably contains text columns (e.g. Gender) - selecting the numeric
# dtypes first works on both old and new pandas versions.
correlations = titanic_matrix.select_dtypes(include='number').corr()

# annot=True writes each coefficient into its cell
sns.heatmap(correlations, annot=True)
plt.title('Titanic correlation matrix', fontstyle='italic', size=18, y=1.04)
plt.show()

#### What are the most correlated features?

In [None]:
# ANSWER:
# Passenger class / Fare      => The higher the class, the higher the fare (unsurprisingly)
# Passenger class / Survived  => The lower the class, the lower the chance to survive
# Passenger class / Age       => The lower the age, the lower the class

#### Use the most appropriate plot to display the summary statistics of `Age` depending on `Pclass`.

In [None]:
# One Age box per passenger class on a 5x6 figure
plt.figure(figsize=(5, 6))
class_age_ax = sns.boxplot(data=titanic, x='Pclass', y='Age')
class_age_ax.set_title("Summary stats Age - Pclass", size=20, y=1.04)
class_age_ax.set_xlabel('Passenger class', size=14)
class_age_ax.set_ylabel('Passenger age', size=14)
plt.show()

#### Use seaborn to plot the distribution of `Age` based on the `Gender`.
**Hint**: Use Facetgrid.

In [None]:
# One Age histogram per Gender value, laid out as columns of a FacetGrid
titanic_gender_age_dist = sns.FacetGrid(data=titanic, col='Gender')
titanic_gender_age_dist.map(plt.hist, 'Age')

# Add a figure-level title and lower the top of the subplots to make room for it
grid_figure = titanic_gender_age_dist.fig
grid_figure.suptitle('Passenger gender - age distribution')
grid_figure.subplots_adjust(top=0.82)
plt.show()