In [1]:
"""
Compute acreage per cultivar for double peak data
Do this for all cultivars labeled as double peaked, and then, also, filter
the cultivars by those that are potentially double cropped. (i.e. Filter out orchard stuff.)
And then filter those with distance between peaks greater than a threshold.
"""

import csv
import numpy as np
import pandas as pd
# import geopandas as gpd
from IPython.display import Image
# from shapely.geometry import Point, Polygon
from math import factorial
import datetime
import time
import scipy
import os, os.path

from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.linear_model import LinearRegression
from patsy import cr

# from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sb


import sys

# Directories

In [5]:
# Directory the peak table is read from (and under which the output tables are written).
data_dir = (
    "/Users/hn/Documents/01_research_data/remote_sensing"
    "/02_peaks_and_plots/peaks_savitzky/"
)

# Directory holding the parameter CSV files.
param_dir = "/Users/hn/Documents/00_GitHub/Ag/remote_sensing/parameters/"

# Put the project's local python folder on the import path so the helper module resolves.
sys.path.append('/Users/hn/Documents/00_GitHub/Ag/remote_sensing/python/')
import remote_sensing_core as rc
# NOTE(review): this binds the same module under a second alias. Neither `rc` nor `rcp`
# is used in the cells visible here -- confirm whether a different module was intended.
import remote_sensing_core as rcp

# Read data

In [66]:
# Load the parameter table and keep the distinct values of its Crop_Type column;
# these are used below to select the crops with double-cropping potential.
double_crop_potentials = pd.read_csv(
    param_dir + "double_crop_potential_plants.csv"
)
dbl_crop_potentials = double_crop_potentials.Crop_Type.unique()

In [36]:
file_name = "all_poly_and_maxs_savitzky.csv"
all_poly_and_maxs_savitzky = pd.read_csv(data_dir + file_name)

# Row count exactly as read from disk.
n_rows = len(all_poly_and_maxs_savitzky)
print("number of rows is: " + str(n_rows))

# The file carries one extra row at the end; discard it by position.
# (An alternative noted by the author: dropna(subset=['max_count']).)
all_poly_and_maxs_savitzky = all_poly_and_maxs_savitzky.iloc[:-1]

# Row count after the trailing row has been removed.
n_rows = len(all_poly_and_maxs_savitzky)
print("number of rows is: " + str(n_rows))

number of rows is: 20732
number of rows is: 20731


In [None]:
# Cast the year and the peak count to plain integers.
int_columns = {"year": int, "max_count": int}
all_poly_and_maxs_savitzky = all_poly_and_maxs_savitzky.astype(int_columns)

# Grouping keys used for every acreage summary below.
g_vec = ["county", "year", "CropTyp"]

# Acreage per cultivar

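Acreage per cultivar computed from this table cannot be correct: fields with no detected peaks were not saved, so their acreage is missing here. Compute it in R from the shapefile instead.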

In [149]:
# Rows whose peak count is 3 or 4 (despite the "triple" name, 4 is included as well).
triple_peaks_df = all_poly_and_maxs_savitzky.loc[
    all_poly_and_maxs_savitzky["max_count"].isin([3, 4])
]

In [102]:
# Inspect which crop types occur among the rows with 3 or 4 peaks.
triple_peaks_df.CropTyp.unique()

array(['Alfalfa Hay', 'Fallow', 'Timothy', 'Wheat', 'Corn, Field',
       'Wheat Fallow', 'Alfalfa/Grass Hay', 'Pasture', 'Bean, Dry',
       'Grass Hay', 'Mint', 'Potato', 'Unknown', 'Pea, Green',
       'Corn, Sweet', 'Grass Seed, Other'], dtype=object)

#  Filter those with two peaks

This table is probably not useful, so we do not save it.

In [155]:
# Rows with exactly two detected peaks, regardless of crop type.
double_peaks = all_poly_and_maxs_savitzky.loc[
    all_poly_and_maxs_savitzky["max_count"].isin([2])
]

# Total acreage of those rows per (county, year, crop type).
acreage_per_double_peaks = (
    double_peaks
    .groupby(g_vec)["ExctAcr"]
    .sum()
    .reset_index()
)

# Sanity check: only the value 2 should be displayed.
double_peaks["max_count"].unique()

array([2])

# Filter potential crops

In [157]:
# Keep only the rows whose crop type is in the double-crop-potential list.
double_crop_poten_df = all_poly_and_maxs_savitzky.loc[
    all_poly_and_maxs_savitzky["CropTyp"].isin(dbl_crop_potentials)
]

# Acreage per cultivar that has the potential of being double cropped.
acr_per_potential_cults = (
    double_crop_poten_df
    .groupby(g_vec)["ExctAcr"]
    .sum()
    .reset_index()
)

# Report how many potential crops exist overall versus how many occur in this data set.
A = len(double_crop_poten_df["CropTyp"].unique())
print("There are [{}] crops that have potential of being double cropped.".format(len(dbl_crop_potentials)))
print("There are [{}] crops in current (county, year) that have potential of being double cropped.".format(A))

There are [108] crops that have potential of being double cropped.
There are [42] crops in current (county, year) that have potential of being double cropped.


In [153]:
# Inspect which peak counts occur among the double-crop-potential rows.
double_crop_poten_df.max_count.unique()

array([1, 2, 3])

# Filter double peaked out of double potentials

In [158]:
# Among the double-crop-potential rows, keep those with exactly two detected peaks.
dbl_peaks_dbl_poten = double_crop_poten_df.loc[
    double_crop_poten_df["max_count"].isin([2])
]

# Total acreage of those rows per (county, year, crop type).
acr_per_dbl_peaks_doubl_poten = (
    dbl_peaks_dbl_poten
    .groupby(g_vec)["ExctAcr"]
    .sum()
    .reset_index()
)

In [159]:
# Sanity check: only the value 2 should be displayed.
dbl_peaks_dbl_poten.max_count.unique()

array([2])

# save the tables

In [84]:
# Output folder for the acreage tables, created on first use.
# data_dir already ends with a separator, so plain concatenation with "/acreage_tables/"
# produced a doubled "//" in the path. os.path.join avoids that; the empty last
# component keeps the trailing separator that later `out_dir + file_name` code relies on.
out_dir = os.path.join(data_dir, "acreage_tables", "")
os.makedirs(out_dir, exist_ok=True)

In [86]:
# Write the acreage of double-peaked, double-crop-potential rows (no index column).
path_name = out_dir + "acr_per_dbl_peaks_doubl_poten.csv"
acr_per_dbl_peaks_doubl_poten.to_csv(path_name, index=False)

In [89]:
# Write the acreage of all double-crop-potential rows (no index column).
path_name = out_dir + "acr_per_potential_cults.csv"
acr_per_potential_cults.to_csv(path_name, index=False)

# Filter further by the distance between peaks

### Double peaks and double potentials

In [239]:
# Minimum gap between a polygon's peaks, compared against differences of max_Doy below.
min_diff = 60

# Distinct polygon identifiers among the double-peaked, double-crop-potential rows.
dbl_peak_poly_list = dbl_peaks_dbl_poten['geo'].unique()

# Write position into the output buffer; advanced as qualifying rows are copied in.
pointer = 0

# Output buffer: an all-NaN frame with one slot per candidate row and the same columns
# as the input. It is filled by the loop in the next cell.
dbl_potent_dbl_peak_good_dist = pd.DataFrame(
    index=np.arange(len(dbl_peaks_dbl_poten)),
    columns=dbl_peaks_dbl_poten.columns,
)

In [240]:
# Keep only the polygons whose earliest and latest peak DoY are more than `min_diff` apart.
#
# The result is rebuilt from scratch on every run, so this cell can be re-executed safely:
# it no longer writes into a pre-allocated buffer through a running row pointer (which was
# truncated at the end of the cell and never reset). Collecting the qualifying groups and
# concatenating once also keeps the original column dtypes, and a single groupby pass
# replaces the per-polygon boolean scan over the whole frame.
good_dist_parts = []

# sort=False yields the polygons in order of first appearance, i.e. the same order as
# dbl_peaks_dbl_poten['geo'].unique().
for a_poly, curr_df in dbl_peaks_dbl_poten.groupby('geo', sort=False):
    # Order each polygon's rows by the DoY of the peak.
    curr_df = curr_df.sort_values("max_Doy")

    # Span between the first and the last peak of this polygon.
    diff = curr_df["max_Doy"].max() - curr_df["max_Doy"].min()

    if diff > min_diff:
        good_dist_parts.append(curr_df)

if good_dist_parts:
    dbl_potent_dbl_peak_good_dist = pd.concat(good_dist_parts, ignore_index=True)
else:
    # No polygon qualifies: keep an empty frame with the expected columns.
    dbl_potent_dbl_peak_good_dist = dbl_peaks_dbl_poten.iloc[0:0].copy()

# Kept for compatibility with the previous implementation: number of rows retained.
pointer = len(dbl_potent_dbl_peak_good_dist)

In [248]:
# Write the rows that passed the peak-distance filter; the threshold is part of the file name.
# NOTE(review): the file name starts with "acr_per_", but the frame written here holds the
# filtered rows themselves, not acreage summed per group -- confirm this is intended.
path_name = out_dir + "acr_per_potential_dbl_peaked_" + str(min_diff) + "_apart.csv"
dbl_potent_dbl_peak_good_dist.to_csv(path_name, index=False)

In [249]:
# Display the output folder the tables were written to.
out_dir

'/Users/hn/Documents/01_research_data/remote_sensing/02_peaks_and_plots/peaks_savitzky//acreage_tables/'