## import library

In [32]:
import pandas as pd

## import data

### yield

In [31]:
yield_dir = './yield_data/'
yield_file = 'wheat_yield_data_final.csv'

yield_df = pd.read_csv(yield_dir + yield_file)
print(yield_df.head())
print("-----")
soil_dir = './soil_data/'
soil_file = 'state_county_lon_lat_soil.csv'
soil_df = pd.read_csv(soil_dir + soil_file)
print(soil_df.head())


   year state_name county_name  yield
0  2000     KANSAS       ALLEN   40.0
1  2000     KANSAS    ANDERSON   44.0
2  2000     KANSAS    ATCHISON   44.0
3  2000     KANSAS      BARBER   37.0
4  2000     KANSAS      BARTON   44.0
-----
  state_name county_name         lon        lat  nutr_ret_high  \
0     KANSAS    CHEYENNE -101.757549  39.795580             10   
1     KANSAS     DECATUR -100.472769  39.794053             10   
2     KANSAS      GRAHAM  -99.898062  39.340620             10   
3     KANSAS      NORTON  -99.910003  39.794470             10   
4     KANSAS     RAWLINS -101.099472  39.790480             10   

   suit_irrig_high_soy  AEZ_1  AEZ_2  AEZ_3  AEZ_4  ...  SQL_1  SQL_2  SQL_3  \
0                10000  False  False  False  False  ...  False  False  False   
1                 8536  False  False  False  False  ...  False  False  False   
2                 7778  False  False  False  False  ...  False  False  False   
3                 7778  False  False  False  Fals

### weather data

In [49]:
lon_lat_dir = './lon_lat_data/'
yscyll_filename = 'year_state_county_yield_lon_lat.csv'

weather_dir = './weather_data/'
wdtemplate = r'weather-data-for-index__{padded}.csv'

df_yscyll = pd.read_csv(lon_lat_dir + yscyll_filename)
print(df_yscyll.shape)
print()

w_df = {}
for i in range(0,len(df_yscyll)):
    try:
        padded = str(i).zfill(4)
        w_df[i] = pd.read_csv(weather_dir + wdtemplate.format(padded=padded))
        # Want to have a name for the index of my dataframe
        w_df[i].rename(columns={'Unnamed: 0': 'date'}, 
                       inplace=True)
    # w_df[i] = w_df[i].rename_axis(index='DATE')
    except:
        print(f'{padded} is missing')
print()
print(w_df[4].shape)
print(w_df[4].head())

(8594, 6)


(366, 7)
       date  T2M_MAX  T2M_MIN  PRECTOTCORR  GWETROOT  EVPTRNS  \
0  20000101    13.84     0.48         0.00      0.45     0.01   
1  20000102     7.37    -2.22         0.02      0.45     0.00   
2  20000103     0.50    -6.90        12.31      0.45     0.00   
3  20000104     2.05   -12.32         0.00      0.45     0.00   
4  20000105     5.31    -6.51         0.33      0.46     0.00   

   ALLSKY_SFC_PAR_TOT  
0                46.1  
1                55.3  
2                 9.7  
3                28.4  
4                39.0  


In [50]:
# takes as input a dataframe whose index field is called "date" and
#   holds 8-character dates, and with columns 
#   ['T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'GWETROOT', 'EVPTRNS', 'ALLSKY_SFC_PAR_TOT']
# produces dataframe with same shape, but the values are grouped by MONTH,
#   with a particular aggregation used for each column

def create_monthly_df(df):
    df1 = df.copy()
    # convert index to datetime format    
    df1.index = pd.to_datetime(df['date'], format='%Y%m%d')
    # use 'M' for monthly, use 'W' for weekly
    df1_monthly = df1.resample('M').agg({'T2M_MAX':'mean',
                                       'T2M_MIN':'mean',
                                       'PRECTOTCORR':'sum',
                                       'GWETROOT':'mean',
                                       'EVPTRNS':'mean',
                                       'ALLSKY_SFC_PAR_TOT':'sum'})    

    # convert index back to string format YYYYMM
    df1_monthly.index = df1_monthly.index.strftime('%Y%m%d')
    
    return df1_monthly
    


print(create_monthly_df(w_df[4]).head(50))

            T2M_MAX    T2M_MIN  PRECTOTCORR  GWETROOT   EVPTRNS  \
date                                                              
20000131   6.553871  -4.556774        22.08  0.456452  0.005161   
20000229  11.777586  -1.560000        48.32  0.467931  0.048276   
20000331  14.182258   0.869032       153.63  0.529032  0.202258   
20000430  19.896333   4.384667        43.77  0.501333  0.613667   
20000531  26.460323  12.685484        79.40  0.485806  1.300645   
20000630  29.561000  16.036667       104.36  0.455333  1.075333   
20000731  33.254516  20.190000       130.35  0.471935  1.356774   
20000831  37.988387  22.292258         6.45  0.418065  0.367742   
20000930  31.740333  14.993000        19.19  0.408000  0.038667   
20001031  22.063871   9.725161       120.16  0.455484  0.132581   
20001130   8.641333  -2.732000        19.08  0.489000  0.040667   
20001231   1.824194  -8.334194        13.98  0.473226  0.001613   

          ALLSKY_SFC_PAR_TOT  
date                          

In [51]:
import json

df_t0 = w_df[0]
cols_narrow = df_t0.columns.values.tolist()[1:]
print(cols_narrow)

print()

df_t1 = create_monthly_df(df_t0)     # dfw['0001']
print(len(df_t1))
# print(df_t1.head())

cols_wide = []
for i in range(0,len(df_t1)):
    row = df_t1.iloc[i]
    # print(row)
    # can't use date, because it has year built in, and weeks start on different numbers...
    month_id = 'month_' + str(i).zfill(2)
    # print(date)
    for c in cols_narrow:
        cols_wide.append(month_id + '__' + c)
        
print(cols_wide)
print(len(cols_wide))

['T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'GWETROOT', 'EVPTRNS', 'ALLSKY_SFC_PAR_TOT']

12
['month_00__T2M_MAX', 'month_00__T2M_MIN', 'month_00__PRECTOTCORR', 'month_00__GWETROOT', 'month_00__EVPTRNS', 'month_00__ALLSKY_SFC_PAR_TOT', 'month_01__T2M_MAX', 'month_01__T2M_MIN', 'month_01__PRECTOTCORR', 'month_01__GWETROOT', 'month_01__EVPTRNS', 'month_01__ALLSKY_SFC_PAR_TOT', 'month_02__T2M_MAX', 'month_02__T2M_MIN', 'month_02__PRECTOTCORR', 'month_02__GWETROOT', 'month_02__EVPTRNS', 'month_02__ALLSKY_SFC_PAR_TOT', 'month_03__T2M_MAX', 'month_03__T2M_MIN', 'month_03__PRECTOTCORR', 'month_03__GWETROOT', 'month_03__EVPTRNS', 'month_03__ALLSKY_SFC_PAR_TOT', 'month_04__T2M_MAX', 'month_04__T2M_MIN', 'month_04__PRECTOTCORR', 'month_04__GWETROOT', 'month_04__EVPTRNS', 'month_04__ALLSKY_SFC_PAR_TOT', 'month_05__T2M_MAX', 'month_05__T2M_MIN', 'month_05__PRECTOTCORR', 'month_05__GWETROOT', 'month_05__EVPTRNS', 'month_05__ALLSKY_SFC_PAR_TOT', 'month_06__T2M_MAX', 'month_06__T2M_MIN', 'month_06__PRECTOT

In [52]:
# starts with a df with the weekly aggregates for weather params,
# and produces a long sequence of all the MONTHLY weather values, in order corresponding to cols_wide

print(w_df[0].columns.tolist()[1:])
print(w_df[0].shape)
print(create_monthly_df(w_df[0]).shape)

def create_weather_seq_for_monthly(dfw):
    seq = []
    cols = dfw.columns.tolist()
    for i in range(0,len(dfw)):
        for c in cols:
            seq.append(dfw.iloc[i][c])
    return seq

# sanity check
dfw = create_monthly_df(w_df[0])
print(dfw.head(10))

seqw = create_weather_seq_for_monthly(dfw)
print(json.dumps(seqw, indent=4))

['T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'GWETROOT', 'EVPTRNS', 'ALLSKY_SFC_PAR_TOT']
(366, 7)
(12, 6)
            T2M_MAX    T2M_MIN  PRECTOTCORR  GWETROOT   EVPTRNS  \
date                                                              
20000131   5.982581  -3.749355        14.74  0.617742  0.009355   
20000229  11.609310  -0.812759        57.20  0.625862  0.098621   
20000331  15.397419   2.484839        99.40  0.659032  0.270968   
20000430  19.538000   5.900333        31.91  0.622333  0.754333   
20000531  26.444839  14.440645        81.56  0.565484  1.129032   
20000630  27.569000  16.717333       192.90  0.571667  1.365000   
20000731  32.229677  20.251613        79.41  0.542258  1.342258   
20000831  38.795484  23.239355         6.28  0.477742  0.137097   
20000930  31.704667  15.760333        50.85  0.472333  0.017667   
20001031  22.730000  10.503548       119.12  0.521613  0.156452   

          ALLSKY_SFC_PAR_TOT  
date                          
20000131              1472.6  
20

In [53]:
import time

u_df = {}   # each entry will hold a df corresponding to a weather .csv file 
dfw = {}    # each entry will hold the df corresponding to monthly aggregation of a weather .csv file
seqw = {}   # each entry will hold the "flattening" of the monthly aggregation df


for i in range(0,len(df_yscyll)):
    padded = str(i).zfill(4)
    # print(padded)
    u_df[padded] = pd.read_csv(weather_dir + wdtemplate.format(padded=padded))
    # Want to have a name for the index of my dataframe
    u_df[padded].rename(columns={'Unnamed: 0': 'date'}, 
                   inplace=True)
    
    dfw[padded] = create_monthly_df(u_df[padded])
    # print(dfw.head())

    seqw[i] = create_weather_seq_for_monthly(dfw[padded])
    # print(json.dumps(dictw, indent=4)
    
    # introducing a small occassional sleep because my python kernel kept complaining about
    # exceeding some I/O threshold
    if i % 30 == 0:
        time.sleep(0.05)
        
    if i > 9000 and i % 100 == 0:
        time.sleep(0.5)
    
    if i % 500 == 0:
        print('Completed processing for index: ', i)
    
    
# sanity check
# print(json.dumps(seqw, indent=4))


Completed processing for index:  0
Completed processing for index:  500
Completed processing for index:  1000
Completed processing for index:  1500
Completed processing for index:  2000
Completed processing for index:  2500
Completed processing for index:  3000
Completed processing for index:  3500
Completed processing for index:  4000
Completed processing for index:  4500
Completed processing for index:  5000
Completed processing for index:  5500
Completed processing for index:  6000
Completed processing for index:  6500
Completed processing for index:  7000
Completed processing for index:  7500
Completed processing for index:  8000
Completed processing for index:  8500


In [54]:
print(len(seqw))
print(json.dumps(seqw, indent=4))

8594


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [55]:
print(dfw['0000'].shape)
print(len(cols_wide))
print(len(df_yscyll))
print(len(seqw[0]))
print()

df_wide_weather_monthly = pd.DataFrame.from_dict(seqw, orient='index', columns=cols_wide)

print(df_wide_weather_monthly.shape)
print()
print(df_wide_weather_monthly.head())

(12, 6)
72
8594
72

(8594, 72)

   month_00__T2M_MAX  month_00__T2M_MIN  month_00__PRECTOTCORR  \
0           5.982581          -3.749355                  14.74   
1           6.086452          -3.769355                  17.07   
2           4.339677          -5.088387                   5.80   
3           9.012903          -3.157419                  16.94   
4           6.553871          -4.556774                  22.08   

   month_00__GWETROOT  month_00__EVPTRNS  month_00__ALLSKY_SFC_PAR_TOT  \
0            0.617742           0.009355                        1472.6   
1            0.624839           0.010645                        1452.0   
2            0.544516           0.000323                        1339.8   
3            0.435806           0.027419                        1454.7   
4            0.456452           0.005161                        1443.3   

   month_01__T2M_MAX  month_01__T2M_MIN  month_01__PRECTOTCORR  \
0          11.609310          -0.812759                  57.

In [56]:
sclls_file = 'state_county_lon_lat_soil.csv'

df_scsoil = pd.read_csv(soil_dir + sclls_file).drop(columns=['lon','lat'])
print(df_scsoil.shape)
# print(df_scsoil.head())

# will continue working with df_yscyll because updated DU PAGE county 
#     (and might update other things in future versions...)

df_ysc_y_soil = pd.merge(df_yscyll, df_scsoil, on=['state_name','county_name'],how='left')

df_ysc_y_soil = df_ysc_y_soil.drop(columns=['lon','lat'])

print()
print(df_ysc_y_soil.shape)
print(df_ysc_y_soil.head())


(557, 45)

(8594, 47)
   year state_name county_name  yield  nutr_ret_high  suit_irrig_high_soy  \
0  2000     KANSAS       ALLEN   40.0           10.0              10000.0   
1  2000     KANSAS    ANDERSON   44.0           10.0              10000.0   
2  2000     KANSAS    ATCHISON   44.0           10.0              10000.0   
3  2000     KANSAS      BARBER   37.0           10.0              10000.0   
4  2000     KANSAS      BARTON   44.0           10.0              10000.0   

   AEZ_1  AEZ_2  AEZ_3  AEZ_4  ...  SQL_1  SQL_2  SQL_3  SQL_4  SQL_5  SQL_6  \
0  False  False  False  False  ...  False  False  False  False  False  False   
1  False  False  False  False  ...  False  False  False  False  False  False   
2  False  False  False  False  ...  False  False  False  False  False  False   
3  False  False  False  False  ...  False  False  False  False  False  False   
4  False  False  False  False  ...  False  False  False  False  False  False   

   SQL_7  SQL_8  SQL_9 SQL_10  
0 

In [57]:
df_ysc_y_soil_weather_monthly = pd.concat([df_ysc_y_soil, df_wide_weather_monthly], axis='columns')

print(df_ysc_y_soil_weather_monthly.shape)
# print(df_ysc_y_soil_weather_monthly.head(10))
print(df_ysc_y_soil_weather_monthly.loc[28:32,:])

(8594, 119)
    year state_name county_name  yield  nutr_ret_high  suit_irrig_high_soy  \
28  2000     KANSAS        FORD   33.0           10.0               8031.0   
29  2000     KANSAS    FRANKLIN   41.0           10.0              10000.0   
30  2000     KANSAS       GEARY   44.0           10.0               8333.0   
31  2000     KANSAS        GOVE   34.0           10.0               3333.0   
32  2000     KANSAS      GRAHAM   36.0           10.0               7778.0   

    AEZ_1  AEZ_2  AEZ_3  AEZ_4  ... month_10__PRECTOTCORR month_10__GWETROOT  \
28  False  False  False  False  ...                  8.01           0.414000   
29  False  False  False  False  ...                 47.95           0.591333   
30  False  False  False  False  ...                 26.55           0.438000   
31  False  False  False  False  ...                 19.33           0.424000   
32  False  False  False  False  ...                 26.59           0.419333   

   month_10__EVPTRNS month_10__ALLSKY_

In [58]:
ml_tables_dir = './ML_data/'

ml_file = 'ML-table-monthly.csv'

df_ysc_y_soil_weather_monthly.to_csv(ml_tables_dir + ml_file, index=False)

print('Wrote file ', ml_tables_dir + ml_file)

Wrote file  ./ML_data/ML-table-monthly.csv
