In [20]:
# package imports go here
import pandas as pd
import numpy as np
import fastparquet as fp


### Read in TMAX Weather Data from us_ca_daily_weather_limited_TMAX.

- THis file has dropped all data without true TMAX values

In [21]:
# Read weather paqquet file into a pandas dataframe to verify data integrity is maintained
weather_data_df = pd.read_parquet('result_files/stp1_us_ca_daily_weather_TMAX_limited.parquet.gzip', engine="fastparquet") 

In [22]:
# Show weather data column types
weather_data_df.dtypes 

Attr
ID              object
Date    datetime64[ns]
PRCP           float64
TAVG           float64
TMAX           float64
TMIN           float64
dtype: object

In [23]:
# Show weather data
weather_data_df

Attr,ID,Date,PRCP,TAVG,TMAX,TMIN
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,CA001011500,2023-01-01,0.000000,41.00,46.40,35.60
1,CA001011500,2023-01-02,0.007874,41.00,44.60,37.40
2,CA001011500,2023-01-03,0.047244,39.74,41.90,37.40
3,CA001011500,2023-01-04,0.062992,40.64,43.70,37.40
4,CA001011500,2023-01-05,0.066929,44.60,48.20,41.00
...,...,...,...,...,...,...
8849488,USW00096409,2023-12-27,0.000000,-4.90,9.50,-19.30
8849489,USW00096409,2023-12-28,0.000000,0.14,5.90,-5.62
8849490,USW00096409,2023-12-29,0.000000,-7.69,-1.48,-13.90
8849491,USW00096409,2023-12-30,0.000000,-12.55,-9.40,-15.70


### Determine how many days per year in given temperature ranges

#### Approach

Goal: How many days of year in each temperature range

- Create bins for highs (50-55, 55-6, …. 95-100, 100-105) etc
- Create bins for lows
- <or> Alternate: may also try bins for days > 80, days > 90, days < 30 etc.
- Save results

Result should look like:
| Station ID | #days < 10 | #days < 20 | ... | #days < 65 | #days > 60 | ... | #days > 95 | # days > 100 |
|----|----|----|----|----|----|----|----|----|
| US001 | 0 | 0 | ... | 300 | 365 | ... | 45 | 20 |
| US002 | 5 | 15 | ... | 360 | 280 | ... | 5 | 0 |

#### Find outliers in TMAX to refine the bins

In [24]:
# Template for binning TMAX values:
# ------------------------------------------
temp_bins = [-300, -200, -100, -50, 0, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
#temp_headings = ['-300 To -200', '-200 To -100', '-100 To -50', '-50 To 0', '0 To 50', '50 To 100', '100 To 200', '200 To 300', '300 To 400', '400 To 500', '500 To 600', '600 To 700', '700 To 800', '800 To 900', '900 to 1000']

### TMAX
weather_data_TMAX_df = pd.crosstab(weather_data_df['ID'], pd.cut(weather_data_df['TMAX'], temp_bins)).rename_axis(columns=None, index=None)
weather_data_TMAX_df

Unnamed: 0,"(-100, -50]","(-50, 0]","(0, 50]","(50, 100]","(100, 200]","(200, 300]","(300, 400]","(400, 500]","(500, 600]","(600, 700]","(800, 900]"
CA001011500,0,0,128,209,0,0,0,0,0,0,0
CA001012055,0,0,77,131,2,0,0,0,0,0,0
CA001012475,0,0,123,235,0,0,0,0,0,0,0
CA001012710,0,0,119,227,0,0,0,0,0,0,0
CA001014820,0,0,154,202,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
USW00096405,0,0,163,148,0,0,0,0,0,0,0
USW00096406,0,30,206,128,0,0,0,0,0,0,0
USW00096407,0,27,176,115,0,0,0,0,0,0,0
USW00096408,0,13,239,112,0,0,0,0,0,0,0


In [25]:
# Fix Headings
temp_headings = ['-100 To -50', '-50 To 0', '0 To 50', '50 To 100', '100 To 200', '200 To 300', '300 To 400', '400 To 500', '500 To 600', '600 To 700', '800 To 900']

counter =-1
for h in temp_headings:
    counter += 1 
    weather_data_TMAX_df.rename(columns={weather_data_TMAX_df.columns[counter]: h}, inplace = True)

In [26]:
# Show Outliers
print((weather_data_TMAX_df != 0).sum())

-100 To -50       6
-50 To 0       1694
0 To 50        8300
50 To 100      8639
100 To 200     2952
200 To 300        2
300 To 400        1
400 To 500        1
500 To 600        1
600 To 700        3
800 To 900        2
dtype: int64


##### Outliers

x<-50 has 6 outliers<br>
x>200 has 10 outliers

#### Refine Bins

In [27]:
temp_bins = [ -500, -400, -300, -200, -100, -50, 0, 25, 50, 75, 100, 125, 150, 200, 250, 300, 350, 400, 500]
#temp_headings = ['-500 To -400', '-400 To -300', '-300 To -200', '-200 To -100', '-100 To -50', '-50 To 0', '0 To 25', '25 To 50', '50 To 75', '75 To 100', 
#                 '100 To 125', '125 To 150', '150 To 200', '200 To 250', '250 To 300', '300 To 350', '350 To 400, ', '400 To 500']

weather_data_TMAX_df = pd.crosstab(weather_data_df['ID'], pd.cut(weather_data_df['TMAX'], temp_bins)).rename_axis(columns=None, index=None)

weather_data_TMAX_df

Unnamed: 0,"(-100, -50]","(-50, 0]","(0, 25]","(25, 50]","(50, 75]","(75, 100]","(100, 125]","(125, 150]","(150, 200]","(250, 300]","(300, 350]","(400, 500]"
CA001011500,0,0,0,128,156,53,0,0,0,0,0,0
CA001012055,0,0,0,77,73,58,2,0,0,0,0,0
CA001012475,0,0,0,123,225,10,0,0,0,0,0,0
CA001012710,0,0,0,119,212,15,0,0,0,0,0,0
CA001014820,0,0,0,154,144,58,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
USW00096405,0,0,3,160,147,1,0,0,0,0,0,0
USW00096406,0,30,107,99,110,18,0,0,0,0,0,0
USW00096407,0,27,80,96,105,10,0,0,0,0,0,0
USW00096408,0,13,120,119,107,5,0,0,0,0,0,0


In [28]:
# Fix Headings
temp_headings = ['-100 To -50', '-50 To 0', '0 To 25', '25 To 50', '50 To 75', '75 To 100', '100 To 125', '125 To 150', '150 To 200', '250 To 300', '300 To 350', '400 To 500']
counter =-1
for h in temp_headings:
    counter += 1 
    weather_data_TMAX_df.rename(columns={weather_data_TMAX_df.columns[counter]: h}, inplace = True)

In [29]:
# Show Outliers
print((weather_data_TMAX_df != 0).sum())

-100 To -50       6
-50 To 0       1694
0 To 25        5568
25 To 50       8296
50 To 75       8618
75 To 100      8407
100 To 125     2947
125 To 150        9
150 To 200        2
250 To 300        2
300 To 350        1
400 To 500        1
dtype: int64


##### Outliers

x<-50 has 6 outliers<br>
x>125 has 15 outliers

#### Define Final Bins

In [30]:
temp_bins = [-50, -25, -10, 0, 10, 20, 25, 32, 40, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 110, 120, 125]
#temp_headings = ['-50 To -25', '-25 To -10', '-10 To 0', '0 To 10', '10 To 20', '20 To 25', '25 To 32', '32 To 40', '40 To 50', '50 To 55', '55 To 60', '60 To 65', '65 To 70', '70 To 75', '75 To 80', '80 To 85', '85 To 90', '90 To 95', '95 To 100', '100 To 110', '110 To 120', '120 To 125']

weather_data_TMAX_df = pd.crosstab(weather_data_df['ID'], pd.cut(weather_data_df['TMAX'], temp_bins)).rename_axis(columns=None, index=None)
weather_data_TMAX_df

Unnamed: 0,"(-50, -25]","(-25, -10]","(-10, 0]","(0, 10]","(10, 20]","(20, 25]","(25, 32]","(32, 40]","(40, 50]","(50, 55]",...,"(65, 70]","(70, 75]","(75, 80]","(80, 85]","(85, 90]","(90, 95]","(95, 100]","(100, 110]","(110, 120]","(120, 125]"
CA001011500,0,0,0,0,0,0,1,9,118,36,...,30,34,34,11,6,2,0,0,0,0
CA001012055,0,0,0,0,0,0,2,4,71,16,...,13,12,17,13,16,10,2,2,0,0
CA001012475,0,0,0,0,0,0,0,6,117,57,...,42,11,8,2,0,0,0,0,0,0
CA001012710,0,0,0,0,0,0,0,8,111,48,...,39,37,10,5,0,0,0,0,0,0
CA001014820,0,0,0,0,0,0,2,27,125,22,...,33,38,33,17,4,4,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
USW00096405,0,0,0,0,0,3,13,63,84,49,...,9,4,1,0,0,0,0,0,0,0
USW00096406,2,11,17,40,49,18,42,28,29,20,...,25,23,10,6,2,0,0,0,0,0
USW00096407,0,6,21,30,31,19,43,28,25,27,...,23,16,9,1,0,0,0,0,0,0
USW00096408,0,1,12,29,54,37,31,43,45,23,...,21,11,5,0,0,0,0,0,0,0


In [31]:
# Fix Headings
temp_headings = ['-50 To -25', '-25 To -10', '-10 To 0', '0 To 10', '10 To 20', '20 To 25', '25 To 32', '32 To 40', '40 To 50', '50 To 55', '55 To 60', '60 To 65', '65 To 70', '70 To 75', '75 To 80', '80 To 85', '85 To 90', '90 To 95', '95 To 100', '100 To 110', '110 To 120', '120 To 125']

counter =-1
for h in temp_headings:
    counter += 1 
    weather_data_TMAX_df.rename(columns={weather_data_TMAX_df.columns[counter]: h}, inplace = True)

In [32]:
# Sum all columns and rows TMAX
sums = weather_data_TMAX_df.sum().sum()
print(sums)

2947562


In [33]:
# Displaying the types of data
print("\nTypes of data:\n", weather_data_TMAX_df.dtypes)


Types of data:
 -50 To -25    int64
-25 To -10    int64
-10 To 0      int64
0 To 10       int64
10 To 20      int64
20 To 25      int64
25 To 32      int64
32 To 40      int64
40 To 50      int64
50 To 55      int64
55 To 60      int64
60 To 65      int64
65 To 70      int64
70 To 75      int64
75 To 80      int64
80 To 85      int64
85 To 90      int64
90 To 95      int64
95 To 100     int64
100 To 110    int64
110 To 120    int64
120 To 125    int64
dtype: object


In [34]:
# Writing to parquet
weather_data_TMAX_df.to_parquet('result_files/stp3_us_ca_daily_TMAX_weather_bins.parquet.gzip', compression='gzip', engine="fastparquet")  

In [35]:
# Read TMIN weather paqquet file into a pandas dataframe to verify data integrity is maintained
weather_data_TMAX_df = pd.read_parquet('result_files/stp3_us_ca_daily_TMAX_weather_bins.parquet.gzip', engine="fastparquet") 
weather_data_TMAX_df

Unnamed: 0_level_0,-50 To -25,-25 To -10,-10 To 0,0 To 10,10 To 20,20 To 25,25 To 32,32 To 40,40 To 50,50 To 55,...,65 To 70,70 To 75,75 To 80,80 To 85,85 To 90,90 To 95,95 To 100,100 To 110,110 To 120,120 To 125
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CA001011500,0,0,0,0,0,0,1,9,118,36,...,30,34,34,11,6,2,0,0,0,0
CA001012055,0,0,0,0,0,0,2,4,71,16,...,13,12,17,13,16,10,2,2,0,0
CA001012475,0,0,0,0,0,0,0,6,117,57,...,42,11,8,2,0,0,0,0,0,0
CA001012710,0,0,0,0,0,0,0,8,111,48,...,39,37,10,5,0,0,0,0,0,0
CA001014820,0,0,0,0,0,0,2,27,125,22,...,33,38,33,17,4,4,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
USW00096405,0,0,0,0,0,3,13,63,84,49,...,9,4,1,0,0,0,0,0,0,0
USW00096406,2,11,17,40,49,18,42,28,29,20,...,25,23,10,6,2,0,0,0,0,0
USW00096407,0,6,21,30,31,19,43,28,25,27,...,23,16,9,1,0,0,0,0,0,0
USW00096408,0,1,12,29,54,37,31,43,45,23,...,21,11,5,0,0,0,0,0,0,0


### Repeat for TMIN

In [36]:
### TMIN 
temp_bins = [-50, -25, -10, 0, 10, 20, 25, 32, 40, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 110, 120]
#temp_headings = ['-50 To -25', '-25 To -10', '-10 To 0', '0 To 10', '10 To 20', '20 To 25', '25 To 32', '32 To 40', '40 To 50', '50 To 55', '55 To 60', '60 To 65', '65 To 70', '70 To 75', '75 To 80', '80 To 85', '85 To 90', '90 To 95', '95 To 100', '100 To 110', '110 To 120']

weather_data_TMIN_df = pd.crosstab(weather_data_df['ID'], pd.cut(weather_data_df['TMIN'], temp_bins)).rename_axis(columns=None, index=None)

weather_data_TMIN_df

Unnamed: 0,"(-50, -25]","(-25, -10]","(-10, 0]","(0, 10]","(10, 20]","(20, 25]","(25, 32]","(32, 40]","(40, 55]","(55, 60]","(60, 65]","(65, 70]","(70, 75]","(75, 80]","(80, 85]","(85, 90]","(90, 95]","(95, 100]","(100, 110]","(110, 120]"
CA001011500,0,0,0,0,0,2,12,90,165,48,15,2,1,0,1,1,0,0,0,0
CA001012055,0,0,0,0,0,1,18,36,68,38,18,5,3,5,4,8,4,1,1,0
CA001012475,0,0,0,0,0,0,4,36,298,20,0,0,0,0,0,0,0,0,0,0
CA001012710,0,0,0,0,0,0,7,79,216,44,0,0,0,0,0,0,0,0,0,0
CA001014820,0,0,0,0,2,4,16,123,141,40,21,9,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
USW00096405,0,0,2,11,27,28,70,57,116,0,0,0,0,0,0,0,0,0,0,0
USW00096406,42,40,37,34,43,19,29,24,86,10,0,0,0,0,0,0,0,0,0,0
USW00096407,23,31,25,31,38,17,30,29,73,21,0,0,0,0,0,0,0,0,0,0
USW00096408,2,38,46,54,48,24,26,42,83,1,0,0,0,0,0,0,0,0,0,0


In [39]:
# Sum all columns and rows TMIN
sums = weather_data_TMIN_df.sum().sum()
print(sums)

2947482


In [41]:
weather_data_TMIN_df.columns

Index(['-50 To -25', '-25 To -10', '-10 To 0', '0 To 10', '10 To 20',
       '20 To 25', '25 To 32', '32 To 40', '40 To 50', '50 To 55', '55 To 60',
       '60 To 65', '65 To 70', '70 To 75', '75 To 80', '80 To 85', '85 To 90',
       '90 To 95', '95 To 100', '100 To 110'],
      dtype='object')

In [42]:
#Fix Headings
temp_headings = ['-50 To -25', '-25 To -10', '-10 To 0', '0 To 10', '10 To 20',
                '20 To 25', '25 To 32', '32 To 40', '40 To 50', '50 To 55', '55 To 60',
                '60 To 65', '65 To 70', '70 To 75', '75 To 80', '80 To 85', '85 To 90',
                '90 To 95', '95 To 100', '100 To 110']

counter =-1
for h in temp_headings:
    counter += 1 
    weather_data_TMIN_df.rename(columns={weather_data_TMIN_df.columns[counter]: h}, inplace = True)

In [43]:
# Write TMIN weather data to parquet file
# Writing to parquet
weather_data_TMIN_df.to_parquet('result_files/stp3_us_ca_daily_TMIN_weather_bins.parquet.gzip', compression='gzip', engine="fastparquet")  

# Displaying the types of data
print("\nTypes of data:\n", weather_data_TMIN_df.dtypes)


Types of data:
 -50 To -25    int64
-25 To -10    int64
-10 To 0      int64
0 To 10       int64
10 To 20      int64
20 To 25      int64
25 To 32      int64
32 To 40      int64
40 To 50      int64
50 To 55      int64
55 To 60      int64
60 To 65      int64
65 To 70      int64
70 To 75      int64
75 To 80      int64
80 To 85      int64
85 To 90      int64
90 To 95      int64
95 To 100     int64
100 To 110    int64
dtype: object


In [44]:
# Read TMIN weather paqquet file into a pandas dataframe to verify data integrity is maintained
weather_data_TMIN_df = pd.read_parquet('result_files/stp3_us_ca_daily_TMIN_weather_bins.parquet.gzip', engine="fastparquet") 
weather_data_TMIN_df

Unnamed: 0_level_0,-50 To -25,-25 To -10,-10 To 0,0 To 10,10 To 20,20 To 25,25 To 32,32 To 40,40 To 50,50 To 55,55 To 60,60 To 65,65 To 70,70 To 75,75 To 80,80 To 85,85 To 90,90 To 95,95 To 100,100 To 110
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
CA001011500,0,0,0,0,0,2,12,90,165,48,15,2,1,0,1,1,0,0,0,0
CA001012055,0,0,0,0,0,1,18,36,68,38,18,5,3,5,4,8,4,1,1,0
CA001012475,0,0,0,0,0,0,4,36,298,20,0,0,0,0,0,0,0,0,0,0
CA001012710,0,0,0,0,0,0,7,79,216,44,0,0,0,0,0,0,0,0,0,0
CA001014820,0,0,0,0,2,4,16,123,141,40,21,9,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
USW00096405,0,0,2,11,27,28,70,57,116,0,0,0,0,0,0,0,0,0,0,0
USW00096406,42,40,37,34,43,19,29,24,86,10,0,0,0,0,0,0,0,0,0,0
USW00096407,23,31,25,31,38,17,30,29,73,21,0,0,0,0,0,0,0,0,0,0
USW00096408,2,38,46,54,48,24,26,42,83,1,0,0,0,0,0,0,0,0,0,0


### Repeat for TAVG

In [45]:
### TAVG
temp_bins = [-50, -25, -10, 0, 10, 20, 25, 32, 40, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 110, 120]
#temp_headings = ['-50 To -25', '-25 To -10', '-10 To 0', '0 To 10', '10 To 20', '20 To 25', '25 To 32', '32 To 40', '40 To 50', '40 To 55', '55 To 60', '60 To 65', '65 To 70', '70 To 75', '75 To 80', '80 To 85', '85 To 90', '90 To 95', '95 To 100', '100 To 110', '110 To 120']

weather_data_TAVG_df = pd.crosstab(weather_data_df['ID'], pd.cut(weather_data_df['TAVG'], temp_bins)).rename_axis(columns=None, index=None)

weather_data_TAVG_df

Unnamed: 0,"(-50, -25]","(-25, -10]","(-10, 0]","(0, 10]","(10, 20]","(20, 25]","(25, 32]","(32, 40]","(40, 50]","(50, 55]",...,"(60, 65]","(65, 70]","(70, 75]","(75, 80]","(80, 85]","(85, 90]","(90, 95]","(95, 100]","(100, 110]","(110, 120]"
CA001011500,0,0,0,0,0,0,6,30,133,28,...,50,32,12,4,2,1,0,0,0,0
CA001012055,0,0,0,0,0,0,3,20,74,14,...,19,21,18,9,6,8,4,1,1,0
CA001012475,0,0,0,0,0,0,2,9,154,63,...,34,9,0,0,0,0,0,0,0,0
CA001012710,0,0,0,0,0,0,2,17,143,48,...,64,14,0,0,0,0,0,0,0,0
CA001014820,0,0,0,0,0,2,6,76,96,38,...,46,42,10,8,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
USW00096405,0,0,0,0,9,15,42,67,96,44,...,0,0,0,0,0,0,0,0,0,0
USW00096406,7,38,34,39,43,25,23,25,36,21,...,30,6,2,0,0,0,0,0,0,0
USW00096407,2,27,26,26,41,21,24,32,35,27,...,21,12,1,0,0,0,0,0,0,0
USW00096408,0,8,28,55,62,23,31,34,48,36,...,14,3,0,0,0,0,0,0,0,0


In [46]:
#Fix Headings
temp_headings = ['-50 To -25', '-25 To -10', '-10 To 0', '0 To 10', '10 To 20', '20 To 25', '25 To 32', '32 To 40', '40 To 50', '50 To 55', '55 To 60', '60 To 65', '65 To 70', '70 To 75', '75 To 80', '80 To 85', '85 To 90', '90 To 95', '95 To 100', '100 To 110', '110 To 120']

counter =-1
for h in temp_headings:
    counter += 1 
    weather_data_TAVG_df.rename(columns={weather_data_TAVG_df.columns[counter]: h}, inplace = True)

In [47]:
# Sum all columns and rows TAVG
sums = weather_data_TAVG_df.sum().sum()
print(sums)

2947585


In [48]:
# Write TAVG weather data to parquet file

# Writing to parquet
weather_data_TAVG_df.to_parquet('result_files/stp3_us_ca_daily_TAVG_weather_bins.parquet.gzip', compression='gzip', engine="fastparquet")  

# Displaying the types of data
print("\nTypes of data:\n", weather_data_TAVG_df.dtypes)


Types of data:
 -50 To -25    int64
-25 To -10    int64
-10 To 0      int64
0 To 10       int64
10 To 20      int64
20 To 25      int64
25 To 32      int64
32 To 40      int64
40 To 50      int64
50 To 55      int64
55 To 60      int64
60 To 65      int64
65 To 70      int64
70 To 75      int64
75 To 80      int64
80 To 85      int64
85 To 90      int64
90 To 95      int64
95 To 100     int64
100 To 110    int64
110 To 120    int64
dtype: object


In [49]:
# Read TAVG weather paqquet file into a pandas dataframe to verify data integrity is maintained
weather_data_TAVG_df = pd.read_parquet('result_files/stp3_us_ca_daily_TAVG_weather_bins.parquet.gzip', engine="fastparquet") 
weather_data_TAVG_df

Unnamed: 0_level_0,-50 To -25,-25 To -10,-10 To 0,0 To 10,10 To 20,20 To 25,25 To 32,32 To 40,40 To 50,50 To 55,...,60 To 65,65 To 70,70 To 75,75 To 80,80 To 85,85 To 90,90 To 95,95 To 100,100 To 110,110 To 120
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CA001011500,0,0,0,0,0,0,6,30,133,28,...,50,32,12,4,2,1,0,0,0,0
CA001012055,0,0,0,0,0,0,3,20,74,14,...,19,21,18,9,6,8,4,1,1,0
CA001012475,0,0,0,0,0,0,2,9,154,63,...,34,9,0,0,0,0,0,0,0,0
CA001012710,0,0,0,0,0,0,2,17,143,48,...,64,14,0,0,0,0,0,0,0,0
CA001014820,0,0,0,0,0,2,6,76,96,38,...,46,42,10,8,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
USW00096405,0,0,0,0,9,15,42,67,96,44,...,0,0,0,0,0,0,0,0,0,0
USW00096406,7,38,34,39,43,25,23,25,36,21,...,30,6,2,0,0,0,0,0,0,0
USW00096407,2,27,26,26,41,21,24,32,35,27,...,21,12,1,0,0,0,0,0,0,0
USW00096408,0,8,28,55,62,23,31,34,48,36,...,14,3,0,0,0,0,0,0,0,0


### Compute PRCP Total Days

In [50]:
# Count the number of non-zero values for each ID
PRCP_counts = weather_data_df.groupby('ID')['PRCP'].apply(lambda x: (x != 0).sum()).reset_index()
TMAX_counts = weather_data_df.groupby('ID')['TMAX'].apply(lambda x: (x != 0).sum()).reset_index()

In [51]:
TMAX_PRCP_counts = pd.merge(PRCP_counts, TMAX_counts, on='ID', how='inner')
TMAX_PRCP_counts = TMAX_PRCP_counts[TMAX_PRCP_counts['TMAX'] > 300]
TMAX_PRCP_counts

Unnamed: 0,ID,PRCP,TMAX
0,CA001011500,156,337
2,CA001012475,0,358
3,CA001012710,145,346
4,CA001014820,152,356
5,CA001015630,163,356
...,...,...,...
8668,USW00096405,212,311
8669,USW00096406,169,364
8670,USW00096407,130,318
8671,USW00096408,166,364


In [52]:
TMAX_PRCP_counts.dtypes

ID      object
PRCP     int64
TMAX     int64
dtype: object

In [53]:
TMAX_PRCP_counts.loc[:, 'PRCP'] = (TMAX_PRCP_counts['PRCP'] * 365 / TMAX_PRCP_counts['TMAX']).round(1)

TMAX_PRCP_counts

  TMAX_PRCP_counts.loc[:, 'PRCP'] = (TMAX_PRCP_counts['PRCP'] * 365 / TMAX_PRCP_counts['TMAX']).round(1)


Unnamed: 0,ID,PRCP,TMAX
0,CA001011500,169.0,337
2,CA001012475,0.0,358
3,CA001012710,153.0,346
4,CA001014820,155.8,356
5,CA001015630,167.1,356
...,...,...,...
8668,USW00096405,248.8,311
8669,USW00096406,169.5,364
8670,USW00096407,149.2,318
8671,USW00096408,166.5,364


#### Write and Verify PRCP data

In [54]:
TMAX_PRCP_counts.to_parquet('result_files/stp3_us_ca_daily_PRCP_weather_sums.parquet.gzip', compression='gzip', engine="fastparquet")  


In [55]:
TMAX_PRCP_counts = pd.read_parquet('result_files/stp3_us_ca_daily_PRCP_weather_sums.parquet.gzip', engine="fastparquet") 
TMAX_PRCP_counts

Unnamed: 0_level_0,ID,PRCP,TMAX
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,CA001011500,169.0,337
2,CA001012475,0.0,358
3,CA001012710,153.0,346
4,CA001014820,155.8,356
5,CA001015630,167.1,356
...,...,...,...
8668,USW00096405,248.8,311
8669,USW00096406,169.5,364
8670,USW00096407,149.2,318
8671,USW00096408,166.5,364
