## Generating summary statistics using pandas and scipy

In [26]:
# Yo, check it! We're gearing up to analyze some volcanic 
# data, like diving into the urban jungle with our crew.

# Rolling with NumPy, our heavy-duty toolkit for crunching 
# numbers, like the muscle car revving up for action.
import numpy as np

# Here comes Pandas, the solid foundation for our data game, 
# making sure our stats stay on track, just like a street 
# racer's chassis holding it down.
import pandas as pd

# We're grabbing Series and DataFrame from the Pandas posse, 
# those slick data structures keeping our volcanic info 
# organized, like the street signs guiding the way.
from pandas import Series, DataFrame

# Scipy's on deck, packing serious statistical firepower 
# for analyzing volcanic activity, like the high-tech gear 
# helping us navigate the concrete jungle.
import scipy

# We're rolling with the stats crew from Scipy, hooking us 
# up with all the math magic we need to make sense of our 
# volcanic data, just like the expert advice from the 
# neighborhood elders.
from scipy import stats

In [27]:
# Relative path of the CSV file that holds the volcano records.
address = '../volcanos.csv'

# Parse that CSV into the `volcanos` DataFrame used by the cells below.
volcanos = pd.read_csv(filepath_or_buffer=address)

In [28]:
# Replace the column labels positionally with short, readable names.
# The list needs exactly one entry per column of the loaded frame;
# pandas raises a ValueError on assignment if the lengths differ.
volcanos.columns = [
    'VolcanoID',
    'V_Name',
    'Country',
    'Region',
    'Subregion',
    'Latitude',
    'Longitude',
    'PEI',
    'H_active',
    'VEI_Holoce',
    'hazard',
    'class',
    'risk',
]

# Preview the first five rows to confirm the new labels are in place.
volcanos.head(n=5)


Unnamed: 0,VolcanoID,V_Name,Country,Region,Subregion,Latitude,Longitude,PEI,H_active,VEI_Holoce,hazard,class,risk
0,210010,West Eifel Volcanic Field,Germany,Mediterranean and W Asia,Western Europe,50.17,6.85,6,0,Unknown VEI,,U-HR,
1,210020,Cha?ne des Puys,France,Mediterranean and W Asia,Western Europe,45.775,2.97,7,0,Unknown VEI,,U-HR,
2,210030,Olot Volcanic Field,Spain,Mediterranean and W Asia,Western Europe,42.17,2.53,5,0,No confirmed eruptions,,U-NHHR,
3,210040,Calatrava Volcanic Field,Spain,Mediterranean and W Asia,Western Europe,38.87,-4.02,6,0,Unknown VEI,,U-HR,
4,211001,Larderello,Italy,Mediterranean and W Asia,Italy,43.25,10.87,4,0,3,,U-HR,


### Looking at summary statistics that describe a variable's numeric values

In [29]:
# Calling sum() with no arguments fails on this frame, so the call is
# left commented out:
# volcanos.sum()
#
# TypeError: can only concatenate str (not "int") to str

In [30]:
# Column totals, approach 1: filter by dtype first.
#
# select_dtypes(include=np.number) keeps only the columns whose dtype is
# a NumPy numeric type and leaves the text columns out. sum(axis=0) then
# adds the values down each remaining column, giving one total per column
# (axis=0 is also the default for sum).
volcanos.select_dtypes(include=np.number).sum(axis=0)

VolcanoID    4.574110e+08
Latitude     2.221560e+04
Longitude    3.854220e+04
PEI          4.786000e+03
H_active     5.960000e+02
hazard       6.100000e+02
risk         5.620000e+02
dtype: float64

In [31]:
# Column totals, approach 2: same dtype filter, spelled with the string
# alias 'number' instead of np.number.
#
# The non-numeric columns are left out by select_dtypes, and sum(axis=0)
# returns one total per remaining column (axis=0 is the default).
volcanos.select_dtypes(include='number').sum(axis=0)

VolcanoID    4.574110e+08
Latitude     2.221560e+04
Longitude    3.854220e+04
PEI          4.786000e+03
H_active     5.960000e+02
hazard       6.100000e+02
risk         5.620000e+02
dtype: float64

In [32]:
# Column totals, approach 3: let sum() do the filtering.
#
# numeric_only=True makes sum() ignore the non-numeric columns, so no
# separate select_dtypes step is needed. axis=0 (the default) yields one
# total per column.
volcanos.sum(axis=0, numeric_only=True)

VolcanoID    4.574110e+08
Latitude     2.221560e+04
Longitude    3.854220e+04
PEI          4.786000e+03
H_active     5.960000e+02
hazard       6.100000e+02
risk         5.620000e+02
dtype: float64

In [33]:
# Column totals without the identifier column.
#
# drop(columns='VolcanoID') returns a copy of the frame without that
# column (the `volcanos` frame itself is not modified); sum() with
# numeric_only=True then totals each remaining numeric column.
(
    volcanos
    .drop(columns='VolcanoID')
    .sum(numeric_only=True)
)

Latitude     22215.599
Longitude    38542.200
PEI           4786.000
H_active       596.000
hazard         610.000
risk           562.000
dtype: float64

In [34]:
# Row-wise totals: one value per row of `volcanos`.
#
# Parameters passed to sum():
# - axis='columns' (equivalent to axis=1): add the values across the
#   columns of each row instead of down each column.
# - numeric_only=True: ignore non-numeric columns. This has to be passed
#   explicitly; in pandas 2.x the default is False.
#
# Result: a Series indexed like `volcanos`, stored in `volcanos_sum`.
# NOTE: VolcanoID is a numeric column, so it is part of every row total.
volcanos_sum = volcanos.sum(axis='columns', numeric_only=True)

In [35]:
# Median (middle value) of every numeric column.
#
# Parameters passed to median():
# - axis=0: compute one median per column (this is the default).
# - numeric_only=True: leave the non-numeric columns out of the
#   calculation. The default is False.
volcanos.median(axis=0, numeric_only=True)

VolcanoID    290355.000
Latitude         14.011
Longitude        39.155
PEI               2.000
H_active          0.000
hazard            2.000
risk              2.000
dtype: float64

In [36]:
# Arithmetic mean of every numeric column except the identifier.
#
# drop(columns='VolcanoID') returns a copy of the frame without that
# column; mean(numeric_only=True) then averages each remaining numeric
# column and skips the non-numeric ones.
(
    volcanos
    .drop(columns='VolcanoID')
    .mean(numeric_only=True)
)

Latitude     14.369728
Longitude    24.930272
PEI           3.095731
H_active      0.385511
hazard        1.859756
risk          1.713415
dtype: float64

In [37]:
# Largest value of every numeric column except the identifier.
#
# - drop(columns='VolcanoID'): returns a copy of the frame without that
#   column.
# - max(numeric_only=True): returns the maximum of each remaining numeric
#   column; non-numeric columns are left out.
(
    volcanos
    .drop(columns='VolcanoID')
    .max(numeric_only=True)
)

Latitude      88.27
Longitude    179.58
PEI            7.00
H_active       1.00
hazard         3.00
risk           3.00
dtype: float64

### Looking at summary statistics that describe variable distribution

In [38]:
# Standard deviation of every numeric column except the identifier.
#
# - drop(columns='VolcanoID'): returns a copy of the frame without that
#   column.
# - std(numeric_only=True): measures how far the values of each remaining
#   numeric column spread around that column's mean. It is the square
#   root of the variance; pandas normalises by N-1 by default (ddof=1),
#   i.e. it reports the sample standard deviation.
(
    volcanos
    .drop(columns='VolcanoID')
    .std(numeric_only=True)
)

Latitude      31.773402
Longitude    112.847466
PEI            1.700149
H_active       0.486873
hazard         0.811893
risk           0.764385
dtype: float64

In [39]:
# Variance of every numeric column except the identifier.
#
# - drop(columns='VolcanoID'): returns a copy of the frame without that
#   column.
# - var(numeric_only=True): sum of the squared differences from the
#   column mean, divided by N-1 (pandas uses ddof=1 by default, so this
#   is the sample variance). Non-numeric columns are left out.
(
    volcanos
    .drop(columns='VolcanoID')
    .var(numeric_only=True)
)

Latitude      1009.549047
Longitude    12734.550624
PEI              2.890506
H_active         0.237046
hazard           0.659171
risk             0.584284
dtype: float64

In [40]:
# Pull the 'H_active' column out of the frame as a Series.
h_active = volcanos['H_active']

# value_counts() tallies how many rows carry each distinct value of the
# Series and returns the counts, most frequent value first.
h_active.value_counts()

H_active
0    950
1    596
Name: count, dtype: int64

In [41]:
# One-call summary of the numeric columns: count, mean, standard
# deviation, minimum, the requested percentiles and maximum.
# The percentiles are spelled out here; 25%, 50% and 75% are also what
# describe() reports when none are given.
volcanos.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,VolcanoID,Latitude,Longitude,PEI,H_active,hazard,risk
count,1546.0,1546.0,1546.0,1546.0,1546.0,328.0,328.0
mean,295867.385511,14.369728,24.930272,3.095731,0.385511,1.859756,1.713415
std,49124.026936,31.773402,112.847466,1.700149,0.486873,0.811893,0.764385
min,210010.0,-78.5,-179.97,1.0,0.0,1.0,1.0
25%,261055.0,-6.77,-77.6145,2.0,0.0,1.0,1.0
50%,290355.0,14.011,39.155,2.0,0.0,2.0,2.0
75%,342087.5,41.9355,138.53275,4.0,1.0,3.0,2.0
max,390140.0,88.27,179.58,7.0,1.0,3.0,3.0
