## Summarizing categorical data using pandas

In [21]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

### The basics

In [22]:
# Path to the storm track-point CSV, resolved relative to the kernel's
# working directory (the file sits one directory level up).
address = '../storms_points.csv'

# Read the whole file with pandas' default parsing options.
storms = pd.read_csv(address)

# Replace the header positionally with short snake_case labels. The list
# must stay in the same order as the file's columns; pandas raises a
# ValueError if the file does not have exactly these 15 columns.
storms.columns = [
    'storm_name', 'year', 'month', 'day', 'hour', 'minute',
    'timestamp', 'record_ident', 'status',
    'latitude', 'longitude',
    'max_wind_kts', 'max_wind_kph', 'max_wind',
    'min_press',
]

# Label every row with its storm name. The column itself is kept, so
# 'storm_name' is both the index name and a column label, and the index
# is not unique: each storm contributes many rows.
storms.index = storms['storm_name']

# Preview the first ten rows as the cell output.
storms.head(10)


Unnamed: 0_level_0,storm_name,year,month,day,hour,minute,timestamp,record_ident,status,latitude,longitude,max_wind_kts,max_wind_kph,max_wind,min_press
storm_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Unnamed 1,Unnamed 1,1990,5,24,18,0,1990-05-24T18:00:00Z,,TD,18.8,-84.0,25,46,29,
Unnamed 1,Unnamed 1,1990,5,25,0,0,1990-05-25T00:00:00Z,,TD,19.7,-84.0,25,46,29,
Unnamed 1,Unnamed 1,1990,5,25,6,0,1990-05-25T06:00:00Z,,TD,20.7,-83.7,25,46,29,
Unnamed 1,Unnamed 1,1990,5,25,12,0,1990-05-25T12:00:00Z,,TD,21.8,-83.3,25,46,29,
Unnamed 1,Unnamed 1,1990,5,25,18,0,1990-05-25T18:00:00Z,,TD,23.0,-82.8,25,46,29,
Unnamed 1,Unnamed 1,1990,5,26,0,0,1990-05-26T00:00:00Z,,TD,24.0,-82.2,25,46,29,
Unnamed 1,Unnamed 1,1990,5,26,6,0,1990-05-26T06:00:00Z,,TD,24.0,-82.2,25,46,29,
Arthur,Arthur,1990,7,22,6,0,1990-07-22T06:00:00Z,,TD,8.8,-41.9,25,46,29,1010.0
Arthur,Arthur,1990,7,22,12,0,1990-07-22T12:00:00Z,,TD,8.9,-43.2,25,46,29,1010.0
Arthur,Arthur,1990,7,22,18,0,1990-07-22T18:00:00Z,,TD,9.3,-44.6,25,46,29,1010.0


In [23]:
# Every row of `storms` is one recorded observation of a storm, so this
# Series holds one wind reading per observation, not one per storm.
# NOTE(review): 'max_wind' looks like the mph counterpart of the
# 'max_wind_kts' / 'max_wind_kph' columns -- confirm against the data source.
max_wind = storms['max_wind']

# Tally how many observations share each distinct wind value; the result
# is ordered from the most frequent value to the least frequent.
max_wind.value_counts()


max_wind
35     1710
29     1157
40     1093
46      988
52      988
58      821
63      605
75      594
69      564
23      473
81      366
86      347
92      245
104     221
98      183
115     141
109     124
132     114
127     102
121      89
138      81
17       80
144      62
12       40
150      38
161      24
155      21
167      13
173      11
178       4
184       1
Name: count, dtype: int64

In [24]:
# Keep only the columns needed for the per-year summary. 'latitude' and
# 'longitude' are numeric coordinates; 'year' is the variable the next
# cell groups by.
storms_cat = storms[[
    'latitude',
    'longitude',
    'year',
]]

# Preview the first five rows of the subset.
storms_cat.head()


Unnamed: 0_level_0,latitude,longitude,year
storm_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Unnamed 1,18.8,-84.0,1990
Unnamed 1,19.7,-84.0,1990
Unnamed 1,20.7,-83.7,1990
Unnamed 1,21.8,-83.3,1990
Unnamed 1,23.0,-82.8,1990


In [25]:
# Split the coordinate subset into one group per distinct 'year' value.
year_group = storms_cat.groupby(by='year')

# Summarise each year: count, mean, std, min, quartiles and max are
# reported separately for 'latitude' and for 'longitude'.
year_group.describe()


Unnamed: 0_level_0,latitude,latitude,latitude,latitude,latitude,latitude,latitude,latitude,longitude,longitude,longitude,longitude,longitude,longitude,longitude,longitude
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
1990,502.0,25.182869,9.651564,7.2,17.7,23.9,32.775,50.0,502.0,-52.309363,17.426505,-107.0,-63.425,-49.45,-40.05,-16.6
1991,242.0,30.279339,10.65678,9.7,25.45,31.5,37.975,51.9,242.0,-53.196281,21.563911,-98.0,-68.5,-56.3,-35.0,-9.9
1992,273.0,30.653846,8.256512,10.8,25.7,30.9,36.1,49.5,273.0,-54.997802,19.09842,-91.7,-70.8,-56.2,-42.0,-7.0
1993,224.0,24.929018,10.593746,10.0,15.8,24.6,31.85,48.0,224.0,-65.320982,19.73187,-104.7,-79.475,-64.6,-55.925,-5.0
1994,234.0,24.473932,8.23387,10.1,16.625,24.1,31.3,44.7,234.0,-69.051282,19.989143,-99.0,-84.975,-78.4,-54.875,-20.8
1995,767.0,26.916037,10.599088,8.3,19.3,24.9,33.2,65.0,767.0,-62.258018,19.814183,-99.0,-79.45,-60.5,-48.55,-1.0
1996,544.0,25.650368,13.103541,8.6,15.0,20.9,34.0,63.0,544.0,-60.426654,20.949814,-107.3,-78.0,-64.3,-43.0,1.0
1997,206.0,31.32767,8.926768,10.9,26.575,31.85,37.6,49.0,206.0,-62.150485,18.331111,-92.6,-73.0,-62.95,-54.25,-14.0
1998,539.0,27.279592,11.209744,9.6,17.35,26.1,33.85,63.5,539.0,-58.030241,24.847533,-101.2,-80.4,-56.2,-37.85,-5.0
1999,457.0,24.219694,8.660647,9.8,17.2,22.8,29.4,51.0,457.0,-66.581619,17.498692,-103.0,-78.8,-64.2,-56.1,-18.9


### Transforming variables to categorical data type

In [26]:
# Add a 'group' column that holds the same values as 'year' but is stored
# with pandas' categorical dtype. No category list or ordering is given,
# so pandas infers the categories from the values present in 'year'.
storms['group'] = storms['year'].astype('category')


In [27]:
# Confirm the conversion: the cell output is the dtype object of the
# 'group' column, which lists its categories and whether they are ordered.
storms['group'].dtype


CategoricalDtype(categories=[1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
                  2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
                  2010, 2011, 2012, 2013],
, ordered=False, categories_dtype=int64)

In [28]:
# Number of rows that fall in each 'group' category. sort=True is the
# pandas default; it is spelled out because the table is meant to be
# read from the most frequent category down to the least frequent.
storms['group'].value_counts(sort=True)


group
2005    907
1995    767
2010    663
2012    654
2004    612
2003    593
2011    557
2008    556
1996    544
1998    539
1990    502
2001    488
2000    469
1999    457
2007    404
2002    400
2006    398
2013    346
1992    273
2009    265
1991    242
1994    234
1993    224
1997    206
Name: count, dtype: int64

### Describing categorical data with crosstabs

In [29]:
# Contingency table of row counts: one table row per distinct 'year' and
# one table column per distinct 'month'. A year/month pair with no rows
# shows 0; a month that never occurs in the data gets no column at all.
pd.crosstab(index=storms['year'], columns=storms['month'])


month,1,4,5,6,7,8,9,10,11,12
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1990,0,0,7,0,56,172,141,126,0,0
1991,0,0,0,6,25,75,80,47,9,0
1992,0,14,0,5,8,52,140,54,0,0
1993,0,0,2,24,0,104,94,0,0,0
1994,0,0,0,3,35,59,50,0,87,0
1995,0,0,0,35,83,264,207,169,9,0
1996,0,0,0,22,69,132,146,121,54,0
1997,0,0,1,10,98,0,67,30,0,0
1998,0,0,0,0,18,103,267,78,66,7
1999,0,0,0,26,4,128,154,103,42,0
