# Uses MapReduce to aggregate weather occurrences for each state

In [189]:
%%file count_type_per_state_per_year.py
from mrjob.job import MRJob
import csv

# Counts the number of events per state per year
class EventCountByStateAndYear(MRJob):
    """MapReduce job that counts weather events for each (state, year) pair."""

    # Yields ((state, year), 1) for every usable data row
    def mapper(self, _, line):
        """Emit ``((state, year), 1)`` for one raw CSV line.

        Args:
            _: Input key; ignored.
            line: One raw line of the input CSV.

        Yields:
            ``((state, year), 1)`` where ``state`` is column 12 and ``year`` is
            the first four characters of the third ``/``-separated field of
            column 3. The header row and rows that are blank, truncated, or
            carry a malformed date yield nothing.
        """
        for row in csv.reader([line]):
            # Blank or truncated lines lack the state column; skip them rather
            # than letting an IndexError abort the whole job.
            if len(row) < 13:
                continue

            # Skip the header row
            if row[0] == 'EventId':
                continue

            # Extract year from StartTime(UTC)
            date_parts = row[3].split('/')
            if len(date_parts) < 3:  # Skip this line if date is malformed
                continue

            yield (row[12], date_parts[2][:4]), 1

    # Sums values
    def reducer(self, key, counts):
        """Emit ``(key, total)`` where ``total`` is the sum of the mapper's 1s for ``key``."""
        yield key, sum(counts)

# Launch the job only when this file is executed as a script (not on import)
if __name__ == '__main__':
    EventCountByStateAndYear.run()

Overwriting count_type_per_state_per_year.py


In [190]:
# Runs Map Reduce job and saves to a text file to be sorted (test file)
# !python count_type_per_state_per_year.py testing.csv > raw_output.txt

In [191]:
# Runs Map Reduce job and saves to a text file to be sorted (real file - 8.6 million rows)
!python count_type_per_state_per_year.py WeatherEvents_Jan2016-Dec2022.csv > raw_output.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\tonyg\AppData\Local\Temp\count_type_per_state_per_year.tonyg.20250511.003553.697893
Running step 1 of 1...
job output is in C:\Users\tonyg\AppData\Local\Temp\count_type_per_state_per_year.tonyg.20250511.003553.697893\output
Streaming final output from C:\Users\tonyg\AppData\Local\Temp\count_type_per_state_per_year.tonyg.20250511.003553.697893\output...
Removing temp directory C:\Users\tonyg\AppData\Local\Temp\count_type_per_state_per_year.tonyg.20250511.003553.697893...


In [192]:
# Sorts MapReduce output and returns the sorted rows plus the top row for each state
def sort_MR_output(path="raw_output.txt"):
    """Load MapReduce output and rank each state's counts from highest to lowest.

    Every non-blank line of the file must hold a key and an integer count
    separated by a tab, where the key is a JSON two-element array
    ``[state, key2]`` (``key2`` is whatever the job grouped by within a state,
    e.g. a year or a weather type).

    Args:
        path: File containing the job output. Defaults to ``raw_output.txt``,
            the file the notebook redirects each job's output into.

    Returns:
        A ``(sorted_data, most_common)`` tuple. ``sorted_data`` is a list of
        ``((state, key2), count)`` ordered by state ascending, then count
        descending. ``most_common`` holds the first (highest-count) entry of
        each state, in the same order.

    Raises:
        ValueError: If a non-blank line is not ``key<TAB>count`` or the key is
            not valid JSON with exactly two elements.
    """
    # Local import keeps the function self-contained in a notebook cell.
    import json

    data = []

    # Processes map reduce output
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # Tolerate blank lines (e.g. a trailing newline)
                continue
            key_str, value_str = line.split("\t", 1)

            # Keys are JSON arrays; parse them as data instead of eval()-ing
            # file content as Python code.
            state, key2 = json.loads(key_str)
            data.append(((state, key2), int(value_str)))

    # Sort by state (ascending), then by count (descending)
    sorted_data = sorted(data, key=lambda item: (item[0][0], -item[1]))

    # Finds the most common event per state: the first entry of each state run.
    # A unique sentinel (rather than '') keeps an empty-string state from being
    # mistaken for "no previous state" and dropped.
    most_common = []
    prev_state = object()
    for entry in sorted_data:
        state = entry[0][0]
        if state != prev_state:
            most_common.append(entry)
            prev_state = state

    return sorted_data, most_common


In [193]:
print("=== Number of Events Per State Per Year ===\n")

# Every job run in this notebook redirects into the same raw_output.txt, so this
# call reads whichever job ran most recently (here: the per-state-per-year count).
sorted_data, most_common = sort_MR_output()

# Print the number of events per state per year
for (state, year), count in sorted_data:
    print(f"{state}, {year}: {count}")

=== Number of Events Per State Per Year ===

AL, 2017: 2538
AL, 2018: 2407
AL, 2016: 2054
AL, 2019: 2012
AL, 2020: 1780
AL, 2021: 1500
AL, 2022: 1196
AR, 2018: 1585
AR, 2020: 1488
AR, 2019: 1470
AR, 2022: 1318
AR, 2021: 1210
AR, 2017: 1208
AR, 2016: 805
AZ, 2021: 542
AZ, 2019: 528
AZ, 2016: 401
AZ, 2022: 399
AZ, 2017: 312
AZ, 2020: 289
AZ, 2018: 246
CA, 2019: 2171
CA, 2017: 1922
CA, 2021: 1913
CA, 2018: 1781
CA, 2016: 1720
CA, 2022: 1661
CA, 2020: 1641
CO, 2019: 2060
CO, 2016: 1959
CO, 2017: 1834
CO, 2018: 1395
CO, 2021: 1344
CO, 2022: 1302
CO, 2020: 1244
CT, 2020: 707
CT, 2019: 688
CT, 2021: 681
CT, 2018: 645
CT, 2022: 593
CT, 2016: 424
CT, 2017: 423
FL, 2017: 1815
FL, 2018: 1758
FL, 2016: 1743
FL, 2019: 1462
FL, 2020: 1394
FL, 2021: 1306
FL, 2022: 1234
GA, 2020: 2065
GA, 2021: 2033
GA, 2018: 1949
GA, 2019: 1722
GA, 2017: 1693
GA, 2022: 1623
GA, 2016: 1398
IA, 2018: 2429
IA, 2016: 2223
IA, 2017: 1829
IA, 2019: 1829
IA, 2020: 1518
IA, 2022: 1451
IA, 2021: 1422
ID, 2017: 1261
ID, 2019: 

In [194]:
print("=== Year With Highest Number of Events Per State ===\n")

# Print the year with the most events per state
# (most_common was returned by the sort_MR_output() call in the previous cell)
for (state, year), count in most_common:
    print(f"{state}, {year}: {count}")

=== Year With Highest Number of Events Per State ===

AL, 2017: 2538
AR, 2018: 1585
AZ, 2021: 542
CA, 2019: 2171
CO, 2019: 2060
CT, 2020: 707
FL, 2017: 1815
GA, 2020: 2065
IA, 2018: 2429
ID, 2017: 1261
IL, 2019: 2271
IN, 2019: 920
KS, 2020: 488
LA, 2018: 718
MA, 2019: 2094
MD, 2021: 790
ME, 2018: 2045
MI, 2019: 3150
MN, 2019: 5135
MO, 2016: 2591
MS, 2021: 923
NC, 2020: 1840
ND, 2018: 3341
NE, 2019: 1357
NM, 2019: 1916
NV, 2019: 1032
NY, 2018: 5837
OH, 2018: 1562
OK, 2018: 1353
OR, 2022: 2347
PA, 2018: 916
RI, 2019: 1046
SC, 2020: 1671
SD, 2019: 2009
TN, 2020: 992
TX, 2016: 2906
UT, 2019: 964
VA, 2020: 1893
WA, 2017: 2124
WI, 2019: 1543
WV, 2018: 1125
WY, 2019: 1906


In [195]:
%%file count_severity_per_state_per_year.py
from mrjob.job import MRJob
import csv

# Counts the number of severe events per state per year
class SeverityCountPerStatePerYear(MRJob):
    """MapReduce job that counts 'Severe' weather events for each (state, year) pair."""

    # Yields ((state, year), 1) for every severe data row
    def mapper(self, _, line):
        """Emit ``((state, year), 1)`` for one raw CSV line if its severity is 'Severe'.

        Args:
            _: Input key; ignored.
            line: One raw line of the input CSV.

        Yields:
            ``((state, year), 1)`` where ``state`` is column 12 and ``year`` is
            the first four characters of the third ``/``-separated field of
            column 3, but only when column 2 equals ``'Severe'``. The header
            row and rows that are blank, truncated, or carry a malformed date
            yield nothing.
        """
        for row in csv.reader([line]):
            # Blank or truncated lines lack the state column; skip them rather
            # than letting an IndexError abort the whole job.
            if len(row) < 13:
                continue

            # Skip the header row
            if row[0] == 'EventId':
                continue

            # Only severe events are counted
            if row[2] != 'Severe':
                continue

            # Extract year from StartTime(UTC)
            date_parts = row[3].split('/')
            if len(date_parts) < 3:  # Skip this line if date is malformed
                continue

            yield (row[12], date_parts[2][:4]), 1

    # Sums values
    def reducer(self, key, counts):
        """Emit ``(key, total)`` where ``total`` is the sum of the mapper's 1s for ``key``."""
        yield key, sum(counts)

# Launch the job only when this file is executed as a script (not on import)
if __name__ == '__main__':
    SeverityCountPerStatePerYear.run()

Overwriting count_severity_per_state_per_year.py


In [196]:
# Runs Map Reduce job and saves to a text file to be sorted (test file)
# !python count_severity_per_state_per_year.py testing.csv > raw_output.txt

In [197]:
# Runs Map Reduce job and saves to a text file to be sorted (real file - 8.6 million rows)
!python count_severity_per_state_per_year.py WeatherEvents_Jan2016-Dec2022.csv > raw_output.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\tonyg\AppData\Local\Temp\count_severity_per_state_per_year.tonyg.20250511.003621.607452
Running step 1 of 1...
job output is in C:\Users\tonyg\AppData\Local\Temp\count_severity_per_state_per_year.tonyg.20250511.003621.607452\output
Streaming final output from C:\Users\tonyg\AppData\Local\Temp\count_severity_per_state_per_year.tonyg.20250511.003621.607452\output...
Removing temp directory C:\Users\tonyg\AppData\Local\Temp\count_severity_per_state_per_year.tonyg.20250511.003621.607452...


In [198]:
print("=== Number of Severe Events Per State Per Year ===\n")

# Every job run in this notebook redirects into the same raw_output.txt, so this
# call reads whichever job ran most recently (here: the severe-event count).
sorted_data, most_common = sort_MR_output()

# Print the number of severe events per state per year
for (state, year), count in sorted_data:
    print(f"{state}, {year}: {count}")

=== Number of Severe Events Per State Per Year ===

AL, 2021: 355
AL, 2017: 315
AL, 2020: 311
AL, 2022: 287
AL, 2016: 270
AL, 2019: 238
AL, 2018: 184
AR, 2020: 328
AR, 2021: 316
AR, 2022: 307
AR, 2017: 296
AR, 2019: 277
AR, 2018: 238
AR, 2016: 158
AZ, 2020: 45
AZ, 2021: 43
AZ, 2016: 39
AZ, 2019: 39
AZ, 2022: 29
AZ, 2017: 15
AZ, 2018: 9
CA, 2022: 972
CA, 2021: 949
CA, 2020: 905
CA, 2019: 797
CA, 2016: 702
CA, 2017: 685
CA, 2018: 559
CO, 2019: 366
CO, 2016: 342
CO, 2017: 280
CO, 2018: 196
CO, 2022: 193
CO, 2020: 172
CO, 2021: 170
CT, 2020: 232
CT, 2022: 180
CT, 2021: 168
CT, 2019: 153
CT, 2018: 112
CT, 2016: 59
CT, 2017: 52
FL, 2017: 243
FL, 2022: 234
FL, 2021: 209
FL, 2019: 180
FL, 2016: 162
FL, 2018: 153
FL, 2020: 134
GA, 2021: 656
GA, 2022: 607
GA, 2020: 500
GA, 2019: 423
GA, 2017: 389
GA, 2016: 308
GA, 2018: 266
IA, 2020: 598
IA, 2021: 507
IA, 2022: 467
IA, 2016: 422
IA, 2019: 421
IA, 2017: 407
IA, 2018: 402
ID, 2022: 255
ID, 2017: 203
ID, 2016: 197
ID, 2019: 180
ID, 2021: 149
ID, 20

In [199]:
print("=== Year With Highest Number of Severe Events Per State ===\n")

# Print the year with the most severe events per state
# (most_common was returned by the sort_MR_output() call in the previous cell)
for (state, year), count in most_common:
    print(f"{state}, {year}: {count}")

=== Year With Highest Number of Severe Events Per State ===

AL, 2021: 355
AR, 2020: 328
AZ, 2020: 45
CA, 2022: 972
CO, 2019: 366
CT, 2020: 232
FL, 2017: 243
GA, 2021: 656
IA, 2020: 598
ID, 2022: 255
IL, 2021: 390
IN, 2020: 153
KS, 2020: 135
LA, 2021: 126
MA, 2019: 490
MD, 2022: 130
ME, 2018: 495
MI, 2016: 515
MN, 2019: 1131
MO, 2020: 678
MS, 2021: 391
NC, 2020: 479
ND, 2018: 1322
NE, 2019: 274
NM, 2019: 391
NV, 2021: 135
NY, 2019: 1238
OH, 2020: 215
OK, 2019: 215
OR, 2022: 502
PA, 2020: 161
RI, 2020: 320
SC, 2021: 315
SD, 2022: 448
TN, 2020: 141
TX, 2020: 620
UT, 2022: 249
VA, 2020: 320
WA, 2019: 330
WI, 2019: 204
WV, 2020: 183
WY, 2016: 442


In [200]:
%%file count_type_per_state_per_type.py
from mrjob.job import MRJob
import csv

# Counts the number of each event type per state
class EventCountByStateAndType(MRJob):
    """MapReduce job that counts weather events for each (state, weather type) pair."""

    # Yields ((state, weather type), 1) for every usable data row
    def mapper(self, _, line):
        """Emit ``((state, weather_type), 1)`` for one raw CSV line.

        Args:
            _: Input key; ignored.
            line: One raw line of the input CSV.

        Yields:
            ``((state, weather_type), 1)`` where ``state`` is column 12 and
            ``weather_type`` is column 1. The header row and blank or
            truncated rows yield nothing.
        """
        for row in csv.reader([line]):
            # Blank or truncated lines lack the state column; skip them rather
            # than letting an IndexError abort the whole job.
            if len(row) < 13:
                continue

            # Skip the header row
            if row[0] == 'EventId':
                continue

            yield (row[12], row[1]), 1

    # Sums values
    def reducer(self, key, counts):
        """Emit ``(key, total)`` where ``total`` is the sum of the mapper's 1s for ``key``."""
        yield key, sum(counts)


# Launch the job only when this file is executed as a script (not on import)
if __name__ == '__main__':
    EventCountByStateAndType.run()

Overwriting count_type_per_state_per_type.py


In [201]:
# Runs Map Reduce job and saves to a text file to be sorted (test file)
# !python count_type_per_state_per_type.py testing.csv > raw_output.txt

In [202]:
# Runs Map Reduce job and saves to a text file to be sorted (real file - 8.6 million rows)
!python count_type_per_state_per_type.py WeatherEvents_Jan2016-Dec2022.csv > raw_output.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\tonyg\AppData\Local\Temp\count_type_per_state_per_type.tonyg.20250511.003630.032307
Running step 1 of 1...
job output is in C:\Users\tonyg\AppData\Local\Temp\count_type_per_state_per_type.tonyg.20250511.003630.032307\output
Streaming final output from C:\Users\tonyg\AppData\Local\Temp\count_type_per_state_per_type.tonyg.20250511.003630.032307\output...
Removing temp directory C:\Users\tonyg\AppData\Local\Temp\count_type_per_state_per_type.tonyg.20250511.003630.032307...


In [203]:
print("=== Weather Counts Per State ===\n")

# Every job run in this notebook redirects into the same raw_output.txt, so this
# call reads whichever job ran most recently (here: the per-state-per-type count).
sorted_data, most_common = sort_MR_output()

# Print counts per state
for (state, weather_type), count in sorted_data:
    print(f"{state}, {weather_type}: {count}")

=== Weather Counts Per State ===

AL, Rain: 10700
AL, Fog: 1925
AL, Precipitation: 529
AL, Cold: 268
AL, Snow: 65
AR, Rain: 6069
AR, Fog: 2166
AR, Precipitation: 392
AR, Cold: 218
AR, Snow: 218
AR, Storm: 20
AR, Hail: 1
AZ, Snow: 1294
AZ, Rain: 1028
AZ, Fog: 325
AZ, Precipitation: 33
AZ, Cold: 19
AZ, Storm: 14
AZ, Hail: 4
CA, Fog: 6722
CA, Rain: 5284
CA, Cold: 643
CA, Storm: 144
CA, Precipitation: 13
CA, Snow: 2
CA, Hail: 1
CO, Snow: 5450
CO, Rain: 3269
CO, Fog: 2000
CO, Cold: 200
CO, Storm: 170
CO, Precipitation: 49
CT, Rain: 2565
CT, Fog: 1178
CT, Snow: 337
CT, Precipitation: 59
CT, Cold: 14
CT, Storm: 7
CT, Hail: 1
FL, Rain: 8631
FL, Fog: 1270
FL, Precipitation: 520
FL, Cold: 217
FL, Storm: 73
FL, Hail: 1
GA, Rain: 8593
GA, Fog: 3480
GA, Cold: 222
GA, Precipitation: 158
GA, Snow: 28
GA, Storm: 2
IA, Rain: 5293
IA, Fog: 4646
IA, Snow: 2014
IA, Precipitation: 375
IA, Cold: 269
IA, Storm: 63
IA, Hail: 41
ID, Snow: 3233
ID, Rain: 3093
ID, Fog: 1441
ID, Cold: 128
ID, Storm: 42
ID, Precip

In [204]:
print("=== Most Common Weather Types Per State ===\n")

# Print most common weather per state
# (most_common was returned by the sort_MR_output() call in the previous cell)
for (state, weather_type), count in most_common:
    print(f"{state}, {weather_type}: {count}")

=== Most Common Weather Types Per State ===

AL, Rain: 10700
AR, Rain: 6069
AZ, Snow: 1294
CA, Fog: 6722
CO, Snow: 5450
CT, Rain: 2565
FL, Rain: 8631
GA, Rain: 8593
IA, Rain: 5293
ID, Snow: 3233
IL, Rain: 9204
IN, Rain: 3320
KS, Rain: 1834
LA, Rain: 3498
MA, Rain: 6776
MD, Rain: 3151
ME, Rain: 5724
MI, Rain: 8858
MN, Rain: 14122
MO, Rain: 9841
MS, Rain: 2959
NC, Rain: 5945
ND, Snow: 5821
NE, Rain: 3774
NM, Rain: 5930
NV, Rain: 2422
NY, Rain: 16826
OH, Rain: 6148
OK, Rain: 5348
OR, Rain: 10130
PA, Rain: 3186
RI, Rain: 4088
SC, Rain: 7368
SD, Rain: 4096
TN, Rain: 4500
TX, Rain: 11718
UT, Rain: 2175
VA, Rain: 7160
WA, Rain: 10362
WI, Rain: 5284
WV, Rain: 4063
WY, Snow: 3917
