# Data Visualisation with Python

In [None]:
import pandas as pd
import numpy as np

# Build a reproducible 100-row sample DataFrame.
# The three random draws are made in the same order as the columns appear,
# so the seeded values in each column are fully determined by the seed.
np.random.seed(0)
n_rows = 100
ages = np.random.randint(20, 60, n_rows)            # upper bound is exclusive: 20..59
incomes = np.random.randint(30000, 80000, n_rows)   # upper bound is exclusive: 30000..79999
genres = np.random.choice(['Action', 'Comedy', 'Drama', 'Thriller'], n_rows)

dataframe = pd.DataFrame({'age': ages, 'income': incomes, 'genre': genres})

# Preview the first five rows using the notebook's rich renderer
display(dataframe.head(n=5))


In [None]:
# Import the required libraries for visualisation

import matplotlib.pyplot as plt
import seaborn as sns

Now we can start plotting our data. We will use the [matplotlib](https://matplotlib.org/) library for this, together with [seaborn](https://seaborn.pydata.org/), which builds on top of it. Matplotlib is a very powerful library that can be used to create all kinds of plots; we will only use a small subset of its functionality.

## Histograms
Histograms are a great way to visualise the distribution of a variable. Let's start by plotting the distribution of the age column.

In [None]:
# Histogram of the 'age' column, grouped into 10 bins with outlined bars
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(dataframe['age'], bins=10, edgecolor='black')
ax.set(title='Histogram', xlabel='Age', ylabel='Frequency')
plt.show()

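From histograms we can extract useful information about the data. For example, we can see how many people fall into each age band and which bands are the most common. We can also read off the range of the data: every age lies between 20 and 59, because the sample was generated with `np.random.randint(20, 60, 100)`, whose upper bound is exclusive, so nobody in this dataset is 60 or older. Since the ages were drawn uniformly at random, the bars are of broadly similar height rather than concentrated in one age band.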

## Density plots
Density plots are similar to histograms, but they are smoother. They are also called kernel density plots. Let's plot the density of the age column.

In [None]:
# Kernel density estimate of the 'age' column, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(dataframe['age'], color='darkblue', ax=ax)
ax.set(title='Density Plot', xlabel='Age', ylabel='Density')
plt.show()

From the density plot it is possible to see the distribution of the age column in more detail. We can see that the distribution is not symmetric and that it is skewed to the right. We can also see that the distribution is bimodal, which means that there are two peaks in the distribution. One peak is around 35 years old and the other is around 55 years old. Keep in mind that these ages were generated uniformly at random, so peaks like these reflect sampling noise in a 100-row sample rather than real structure in a population.

## Box plots
Box plots are another way to visualise the distribution of a variable. They are very useful to compare the distribution of a variable across different groups. Let's plot the box plot of the age column.

In [None]:
# Box plot of the 'age' column.
# The Series is passed explicitly as `x=` instead of positionally: depending on
# the seaborn release, a positional first argument is read either as `x` or as
# `data`, which changes the orientation of the box. With `x=` the ages always
# run along the x-axis, so the 'Age' x-label below is always correct.
plt.figure(figsize=(10, 6))
sns.boxplot(x=dataframe['age'], color='lightblue')
plt.title('Boxplot')
plt.xlabel('Age')
plt.show()

From the box plot we can see that the median age is around 35 years old. We can also see that the middle 50% of the people in the dataset (the box, which spans the interquartile range) are between 28 and 52 years old. Data points that fall outside the whiskers are considered outliers.

## Violin plots
Violin plots are similar to box plots, but they also show the density of the distribution. Let's plot the violin plot of the age column.

In [None]:
# Violin plot of the 'age' column.
# As with the box plot, the Series is passed explicitly as `x=`: a positional
# first argument is interpreted as `x` or as `data` depending on the seaborn
# release, which changes the orientation. With `x=` the ages always run along
# the x-axis, matching the 'Age' x-label below.
plt.figure(figsize=(10, 6))
sns.violinplot(x=dataframe['age'], color='lightgreen')
plt.title('Violin Plot')
plt.xlabel('Age')
plt.show()


In addition to the information already extracted from the box plot, this violin plot shows the shape of the distribution: we can see that the distribution of the age column is bimodal.

## Bar plots
Bar plots are used to display the frequency of different categories. Each bar represents one category, and its height shows how many observations fall into that category. Let's plot the number of rows for each value of the genre column.

In [None]:
# Bar plot of how often each genre occurs.
# 'genre' is treated as categorical: tally the rows per category first,
# then draw one bar per category.
category_counts = dataframe['genre'].value_counts()
genre_names = category_counts.index
genre_totals = category_counts.values

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=genre_names, y=genre_totals, color='salmon', ax=ax)
ax.set(title='Bar Plot', xlabel='Genre', ylabel='Count')
plt.xticks(rotation=90)  # vertical tick labels stay readable when category names are long
plt.show()


## Visualising Correlation and Relationships

In this section we look at four plot types for exploring correlations and relationships between variables: scatter plots, pair plots, heatmaps, and line charts. We'll keep using `matplotlib`, `seaborn`, and `pandas`, together with small sample datasets created for each example.

### Scatter Plots

In [None]:
# Scatter Plots with Complex Input Data and Randomness
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Two columns of seeded random integers; the draw order (Variable1, then
# Variable2) determines which seeded values land in which column.
np.random.seed(42)
size = 100
variable1_values = np.random.randint(1, 100, size=size)
variable2_values = np.random.randint(50, 150, size=size)
data = {'Variable1': variable1_values, 'Variable2': variable2_values}

df = pd.DataFrame(data)

# One marker per row: Variable1 on the x-axis, Variable2 on the y-axis
plt.scatter(x=df['Variable1'], y=df['Variable2'])
plt.title('Scatter Plot')
plt.xlabel('Variable 1')
plt.ylabel('Variable 2')
plt.show()


### Pair Plots

In [None]:
# Pair Plots
import seaborn as sns
import pandas as pd

# Small hand-written sample: three variables with five observations each.
# Columns are added one at a time; insertion order fixes the column order.
data = {}
data['Variable1'] = [1, 2, 3, 4, 5]
data['Variable2'] = [5, 4, 3, 2, 1]
data['Variable3'] = [2, 4, 1, 3, 5]

df = pd.DataFrame(data)

# Grid of pairwise plots covering every combination of the three columns
sns.pairplot(data=df)
plt.show()



### Heatmaps

In [None]:
# Heatmaps
import seaborn as sns
import pandas as pd

# Sample data: three variables with five observations each
data = {'Variable1': [1, 2, 3, 4, 5],
        'Variable2': [5, 4, 3, 2, 1],
        'Variable3': [2, 4, 1, 3, 5]}

df = pd.DataFrame(data)

# Pairwise correlation between every pair of columns
correlation_matrix = df.corr()

# Heatmap of the correlation matrix.
# Correlations always lie in [-1, 1], so the colour limits are pinned to that
# range. Without vmin/vmax the limits are taken from the data, and the neutral
# midpoint of the diverging 'coolwarm' palette would not necessarily
# correspond to zero correlation.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Heatmap')
plt.show()

### Line Charts

In [None]:
# Line Charts
import matplotlib.pyplot as plt

# Sample data: a shared time axis and two value series
time = [1, 2, 3, 4, 5]
data_series1 = [5, 3, 8, 4, 7]
data_series2 = [2, 4, 6, 8, 10]

# (values, legend label, marker) for each chart, in drawing order
series_specs = [
    (data_series1, 'Data Series 1', 'o'),
    (data_series2, 'Data Series 2', 'x'),
]

# Draw one separate line chart per series; plt.show() ends each figure
for series_values, series_label, series_marker in series_specs:
    plt.plot(time, series_values, label=series_label, marker=series_marker)
    plt.xlabel('Time')
    plt.ylabel('Values')
    plt.title('Line Chart - ' + series_label)
    plt.legend()
    plt.show()

## Advanced Visualisation

### Swarmplot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Load seaborn's 'iris' example dataset
iris = sns.load_dataset('iris')

# Swarm plot: one point per flower, grouped by species along the x-axis,
# with petal length on the y-axis
sns.swarmplot(data=iris, x="species", y="petal_length")

# Render the figure
plt.show()



### Facet Grid Example

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Load seaborn's 'iris' example dataset
iris = sns.load_dataset('iris')

# One panel (column) per species, each showing a histogram of petal length
g = sns.FacetGrid(data=iris, col="species")
g.map(plt.hist, "petal_length")

# Render the figure
plt.show()


### Bubble Plot Example

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Generate random data. The seed is set in this cell so the figure is the same
# every time the cell is run, even when it is run on its own.
np.random.seed(42)
data = pd.DataFrame({
    'X': np.random.rand(50),
    'Y': np.random.rand(50),
    'Z': np.random.rand(50)
})

# Create a bubble plot: position encodes X and Y, while both marker size and
# colour encode Z. Z lies in [0, 1), so it is scaled up to a visible marker
# area (a constant size would make this an ordinary scatter plot).
bubble_sizes = data['Z'] * 1000
plt.scatter(data['X'], data['Y'], s=bubble_sizes, c=data['Z'], cmap='viridis', alpha=0.6)

# Show the colour bar
plt.colorbar(label='Z value')

plt.title('Bubble Plot')
plt.xlabel('X')
plt.ylabel('Y')

# Show the plot
plt.show()



### Treemap Example


In [None]:
import matplotlib.pyplot as plt
import squarify

# Sample data for Top 10 Countries by GDP (Gross Domestic Product in Trillions USD)
countries = ['USA', 'China', 'Japan', 'Germany', 'UK', 'India', 'France', 'Italy', 'Brazil', 'Canada']
gdp = [21.43, 14.34, 5.08, 3.86, 2.83, 2.87, 2.71, 2.07, 1.87, 1.64]

# Create a DataFrame from the data
data = {'Country': countries, 'GDP (Trillions USD)': gdp}
df = pd.DataFrame(data)

# Sort by GDP, largest first, so the biggest tiles are laid out first.
# The input lists are not fully ordered (India 2.87 comes after UK 2.83).
# sort_values returns a new frame, so re-running this cell is idempotent.
df = df.sort_values(by='GDP (Trillions USD)', ascending=False)

# Each country's share of the combined GDP of these ten countries, as a
# percentage (a per-country share, not a cumulative total)
df['GDP %'] = df['GDP (Trillions USD)'] / df['GDP (Trillions USD)'].sum() * 100

# Label every tile with the country name and its share
tile_labels = [
    f"{country}\n{share:.1f}%"
    for country, share in zip(df['Country'], df['GDP %'])
]

# Create the treemap: tile area is proportional to GDP
plt.figure(figsize=(10, 6))
squarify.plot(sizes=df['GDP (Trillions USD)'], label=tile_labels, alpha=0.8)
plt.axis('off')
plt.title('Top 10 Countries by GDP (2021)')
plt.show()



### Radar Chart

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from math import pi

# Single-row frame: one group ('A') scored on five variables
df = pd.DataFrame({
    'group': ['A'],
    'var1': [38],
    'var2': [29],
    'var3': [8],
    'var4': [7],
    'var5': [28]
})

# Every column after the leading 'group' label becomes one radar axis
categories = list(df.columns)[1:]
N = len(categories)

# Spread the N axes evenly around the full circle, then repeat the first
# angle at the end so the plotted outline closes on itself
angles = []
for n in range(N):
    angles.append(n / float(N) * 2 * pi)
angles.extend(angles[:1])

# Polar axes that will hold the spider plot
ax = plt.subplot(1, 1, 1, projection='polar')

# One labelled spoke per variable (the repeated closing angle is skipped)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, color='grey', size=8)

# Radial grid: tick labels placed along the 0-degree direction, range 0..40
ax.set_rlabel_position(0)
ax.set_yticks([10, 20, 30])
ax.set_yticklabels(["10", "20", "30"], color="grey", size=7)
ax.set_ylim(0, 40)

# Values of the first row without the 'group' label, closed like the angles
values = df.loc[0].drop('group').tolist()
values.extend(values[:1])

# Outline of the polygon, then a translucent fill of its interior
ax.plot(angles, values, linewidth=1, linestyle='solid')
ax.fill(angles, values, 'b', alpha=0.1)

plt.show()


### Network Diagram Example

In [None]:
import matplotlib.pyplot as plt
import networkx as nx

# Create an undirected graph
G = nx.Graph()

# Add nodes
G.add_nodes_from(["A", "B", "C", "D", "E"])

# Add edges
G.add_edges_from([("A", "B"), ("A", "C"), ("A", "D"), ("B", "E"), ("C", "E")])

# Compute node positions with a fixed seed. When no positions are supplied,
# nx.draw falls back to an unseeded spring layout, so the diagram would look
# different on every run.
pos = nx.spring_layout(G, seed=42)

# Draw the network diagram
nx.draw(G, pos=pos, with_labels=True)

plt.show()

## Choropleth Maps

In [None]:
# Import necessary libraries
import geopandas as gpd
import matplotlib.pyplot as plt

# Load the dataset
# NOTE(review): `gpd.datasets` is deprecated in recent GeoPandas releases and
# has been removed in GeoPandas 1.0 - confirm the installed version, or load
# the Natural Earth data from another source.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Let's say we want to plot a choropleth map of the world, colored by population.
# Keep only rows with a positive population estimate and drop Antarctica.
# The explicit .copy() makes the filtered frame independent of the original,
# so adding a column below does not raise pandas' SettingWithCopyWarning.
has_population = world['pop_est'] > 0
not_antarctica = world['name'] != "Antarctica"
world = world[has_population & not_antarctica].copy()

# Normalize the population data to range between 0 and 1
world['pop_est_norm'] = world['pop_est'] / world['pop_est'].max()

# Plot the choropleth map
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
world.plot(column='pop_est_norm', ax=ax, legend=True, cmap='YlOrRd')

# Adding title
ax.set_title('World Population Choropleth Map')

# Remove axis
ax.axis('off')

# Show the map
plt.show()