## import library

In [1]:
import pandas as pd

## import data

In [2]:
lon_lat_dir = './lon_lat_data/'
yscyll_filename = 'year_state_county_yield_lon_lat.csv'

weather_dir = './weather_data/'
wdtemplate = r'weather-data-for-index__{padded}.csv'

df_yscyll = pd.read_csv(lon_lat_dir + yscyll_filename)
print(df_yscyll.shape)
print()

w_df = {}
for i in range(0,len(df_yscyll)):
    try:
        padded = str(i).zfill(4)
        w_df[i] = pd.read_csv(weather_dir + wdtemplate.format(padded=padded))
        # Want to have a name for the index of my dataframe
        w_df[i].rename(columns={'Unnamed: 0': 'date'}, 
                       inplace=True)
    # w_df[i] = w_df[i].rename_axis(index='DATE')
    except:
        print(f'{padded} is missing')
print()
print(w_df[4].shape)
print(w_df[4].head())

(8594, 6)


(366, 7)
       date  T2M_MAX  T2M_MIN  PRECTOTCORR  GWETROOT  EVPTRNS  \
0  20000101    13.84     0.48         0.00      0.45     0.01   
1  20000102     7.37    -2.22         0.02      0.45     0.00   
2  20000103     0.50    -6.90        12.31      0.45     0.00   
3  20000104     2.05   -12.32         0.00      0.45     0.00   
4  20000105     5.31    -6.51         0.33      0.46     0.00   

   ALLSKY_SFC_PAR_TOT  
0                46.1  
1                55.3  
2                 9.7  
3                28.4  
4                39.0  


In [3]:
# takes as input a dataframe whose index field is called "date" and
#   holds 8-character dates, and with columns 
#   ['T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'GWETROOT', 'EVPTRNS', 'ALLSKY_SFC_PAR_TOT']
# produces dataframe with same shape, but the values are grouped by WEEK,
#   with a particular aggregation used for each column

def create_weekly_df(df):
    df1 = df.copy()
    # convert index to datetime format    
    df1.index = pd.to_datetime(df['date'], format='%Y%m%d')
    # use 'M' for monthly, use 'W' for weekly
    df1_weekly = df1.resample('W').agg({'T2M_MAX':'mean',
                                       'T2M_MIN':'mean',
                                       'PRECTOTCORR':'sum',
                                       'GWETROOT':'mean',
                                       'EVPTRNS':'mean',
                                       'ALLSKY_SFC_PAR_TOT':'sum'})    

    # convert index back to string format YYYYMM
    df1_weekly.index = df1_weekly.index.strftime('%Y%m%d')
    
    return df1_weekly
    


print(create_weekly_df(w_df[4]).head(50))

            T2M_MAX    T2M_MIN  PRECTOTCORR  GWETROOT   EVPTRNS  \
date                                                              
20000102  10.605000  -0.870000         0.02  0.450000  0.005000   
20000109   5.342857  -5.642857        12.87  0.461429  0.000000   
20000116  10.654286  -2.945714         0.12  0.462857  0.010000   
20000123   8.725714  -3.738571         0.04  0.452857  0.011429   
20000130   0.530000  -6.394286         8.83  0.450000  0.000000   
20000206   7.405714  -5.150000         0.62  0.460000  0.005714   
20000213   8.964286  -3.271429         0.38  0.454286  0.015714   
20000220  11.538571  -3.502857         4.12  0.450000  0.028571   
20000227  16.627143   3.685714        42.49  0.498571  0.105714   
20000305  14.245714   0.445714        42.68  0.514286  0.130000   
20000312  13.904286   0.732857        13.58  0.522857  0.165714   
20000319  11.340000  -1.534286        13.65  0.504286  0.107143   
20000326  18.302857   3.972857        62.72  0.554286  0.33714

In [4]:
import json

df_t0 = w_df[0]
cols_narrow = df_t0.columns.values.tolist()[1:]
print(cols_narrow)

print()

df_t1 = create_weekly_df(df_t0)     # dfw['0001']
# print(df_t1.head())

cols_wide = []
for i in range(0,len(df_t1)):
    row = df_t1.iloc[i]
    # print(row)
    # can't use date, because it has year built in, and weeks start on different numbers...
    week_id = 'week_' + str(i).zfill(2)
    # print(date)
    for c in cols_narrow:
        cols_wide.append(week_id + '__' + c)
        
print(cols_wide)

['T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'GWETROOT', 'EVPTRNS', 'ALLSKY_SFC_PAR_TOT']

['week_00__T2M_MAX', 'week_00__T2M_MIN', 'week_00__PRECTOTCORR', 'week_00__GWETROOT', 'week_00__EVPTRNS', 'week_00__ALLSKY_SFC_PAR_TOT', 'week_01__T2M_MAX', 'week_01__T2M_MIN', 'week_01__PRECTOTCORR', 'week_01__GWETROOT', 'week_01__EVPTRNS', 'week_01__ALLSKY_SFC_PAR_TOT', 'week_02__T2M_MAX', 'week_02__T2M_MIN', 'week_02__PRECTOTCORR', 'week_02__GWETROOT', 'week_02__EVPTRNS', 'week_02__ALLSKY_SFC_PAR_TOT', 'week_03__T2M_MAX', 'week_03__T2M_MIN', 'week_03__PRECTOTCORR', 'week_03__GWETROOT', 'week_03__EVPTRNS', 'week_03__ALLSKY_SFC_PAR_TOT', 'week_04__T2M_MAX', 'week_04__T2M_MIN', 'week_04__PRECTOTCORR', 'week_04__GWETROOT', 'week_04__EVPTRNS', 'week_04__ALLSKY_SFC_PAR_TOT', 'week_05__T2M_MAX', 'week_05__T2M_MIN', 'week_05__PRECTOTCORR', 'week_05__GWETROOT', 'week_05__EVPTRNS', 'week_05__ALLSKY_SFC_PAR_TOT', 'week_06__T2M_MAX', 'week_06__T2M_MIN', 'week_06__PRECTOTCORR', 'week_06__GWETROOT', 'week_06__EVPT

In [7]:
# starts with a df with the weekly aggregates for weather params,
# and produces a long sequence of all the WEEKLY weather values, in order corresponding to cols_wide

print(w_df[0].columns.tolist()[1:])
print(w_df[0].shape)
print(create_weekly_df(w_df[0]).shape)

def create_weather_seq_for_weekly(dfw):
    seq = []
    cols = dfw.columns.tolist()
    for i in range(0,len(dfw)):
        for c in cols:
            seq.append(dfw.iloc[i][c])
    return seq

# sanity check
dfw = create_weekly_df(w_df[0])
print(dfw.head(10))

seqw = create_weather_seq_for_weekly(dfw)
print(json.dumps(seqw, indent=4))

['T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'GWETROOT', 'EVPTRNS', 'ALLSKY_SFC_PAR_TOT']
(366, 7)
(53, 6)
            T2M_MAX   T2M_MIN  PRECTOTCORR  GWETROOT   EVPTRNS  \
date                                                             
20000102  11.880000  0.870000         0.17  0.620000  0.025000   
20000109   6.188571 -2.394286         4.89  0.620000  0.002857   
20000116   9.602857 -2.657143         0.00  0.620000  0.020000   
20000123   7.357143 -3.325714         1.63  0.620000  0.011429   
20000130  -0.447143 -6.998571         7.87  0.611429  0.000000   
20000206   5.167143 -5.711429         0.47  0.618571  0.002857   
20000213  10.338571 -2.730000         0.23  0.610000  0.038571   
20000220   9.318571 -2.881429        32.83  0.625714  0.041429   
20000227  18.052857  5.187143        23.70  0.642857  0.224286   
20000305  15.361429  1.410000        16.38  0.645714  0.225714   

          ALLSKY_SFC_PAR_TOT  
date                          
20000102               108.1  
20000109      

In [10]:
import time

u_df = {}
dfw = {}
seqw = {}


for i in range(0,len(df_yscyll)):
    padded = str(i).zfill(4)
    # print(padded)
    u_df[padded] = pd.read_csv(weather_dir + wdtemplate.format(padded=padded))
    # Want to have a name for the index of my dataframe
    u_df[padded].rename(columns={'Unnamed: 0': 'date'}, 
                   inplace=True)
    
    dfw[padded] = create_weekly_df(u_df[padded])
    # print(dfw.head())

    seqw[i] = create_weather_seq_for_weekly(dfw[padded])
    # print(json.dumps(dictw, indent=4)

    # introducing a small occassional sleep because my python kernel kept complaining about
    # exceeding some I/O threshold
    # if i % 30 == 0:
    #     time.sleep(0.05)
        
    # if i > 4000 and i % 100 == 0:
    #     time.sleep(0.5)
    
    if i % 100 == 0:
        print('Completed processing of index ', i)
    
# sanity check
print(print(json.dumps(seqw, indent=4)))


Completed processing of index  0
Completed processing of index  100
Completed processing of index  200
Completed processing of index  300
Completed processing of index  400
Completed processing of index  500
Completed processing of index  600
Completed processing of index  700
Completed processing of index  800
Completed processing of index  900
Completed processing of index  1000
Completed processing of index  1100
Completed processing of index  1200
Completed processing of index  1300
Completed processing of index  1400
Completed processing of index  1500
Completed processing of index  1600
Completed processing of index  1700
Completed processing of index  1800
Completed processing of index  1900
Completed processing of index  2000
Completed processing of index  2100
Completed processing of index  2200
Completed processing of index  2300
Completed processing of index  2400
Completed processing of index  2500
Completed processing of index  2600
Completed processing of index  2700
Comp

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [11]:
print(len(seqw))

8594


In [14]:
print(dfw['0000'].shape)
print(len(cols_wide))
print(len(df_yscyll))
print(len(seqw[0]))
print()

df_wide_weather_weekly_prelim = pd.DataFrame.from_dict(seqw, orient='index', columns=cols_wide)

print(df_wide_weather_weekly_prelim.shape)
print()
print(df_wide_weather_weekly_prelim.head())

(53, 6)
318
8594
318
8594



ValueError: 318 columns passed, passed data had 324 columns

In [None]:
# print(cols_wide_weekly)
print(df_wide_weather_weekly_prelim.shape)
week_31_cols = ['week_31__T2M_MAX', 'week_31__T2M_MIN', 'week_31__PRECTOTCORR', 'week_31__GWETROOT', 'week_31__EVPTRNS', 'week_31__ALLSKY_SFC_PAR_TOT']

df_wide_weather_weekly = df_wide_weather_weekly_prelim.drop(columns=week_31_cols)

print()
print(df_wide_weather_weekly.shape)
print(df_wide_weather_weekly.head())


In [None]:
sclls_file = 'state_county_lon_lat_soil.csv'

df_scsoil = pd.read_csv(soil_dir + sclls_file).drop(columns=['lon','lat'])
print(df_scsoil.shape)
# print(df_scsoil.head())

# will continue working with df_yscyll because updated DU PAGE county 
#     (and might update other things in future versions...)

df_ysc_y_soil = pd.merge(df_yscyll, df_scsoil, on=['state_name','county_name'],how='left')

df_ysc_y_soil = df_ysc_y_soil.drop(columns=['lon','lat'])

print()
print(df_ysc_y_soil.shape)
print(df_ysc_y_soil.head())


In [None]:
df_ysc_y_soil_weather_weekly = pd.concat([df_ysc_y_soil, df_wide_weather_weekly], axis='columns')

print(df_ysc_y_soil_weather_weekly.shape)
# print(df_ysc_y_soil_weather_weekly.head(10))
print(df_ysc_y_soil_weather_weekly.loc[28:32,:])

In [None]:
ml_tables_dir = './ML_data/'

ml_file = 'ML-table-weekly.csv'

df_ysc_y_soil_weather_monthly.to_csv(ml_tables_dir + ml_file, index=False)

print('Wrote file ', ml_tables_dir + ml_file)