# create Confusion table

At first, the total area of double-peaked fields was greater than the area of double-cropped fields. After we filtered out the orchards and other irrelevant fields, the area of double-peaked fields dropped below the area of double-cropped fields.

We then ran the code with several parameter settings for Grant 2016 and 2017. Now we want to create a confusion table, using Grant 2016, to see which parameters are the best.

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import sys
from IPython.display import Image
from shapely.geometry import Point, Polygon
from math import factorial
import datetime
import time
import scipy

import os, os.path

from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.linear_model import LinearRegression
from patsy import cr

from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sb

# import core module

In [None]:
import sys
# search path for modules
# look @ https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
sys.path.append('/Users/hn/Documents/00_GitHub/Ag/remote_sensing/python/')
import remote_sensing_core as rc

In [None]:
# Directory that is scanned further down for the per-parameter CSV files
# (those whose name contains "all_polygons").
data_dir_base = "/Users/hn/Documents/01_research_data/remote_sensing/02_peaks_and_plots/Grant_2016/csv/"
# Location of the parameter files; not referenced by the cells visible here.
param_dir = "/Users/hn/Documents/00_GitHub/Ag/remote_sensing/parameters/"

# Grant 2016 Time Series 
This table includes all polygons.

In [None]:
# Load the Grant 2016 time series and strip it down in a single pipeline:
# read -> drop image columns -> de-duplicate -> rename the geometry column.
Grant_2016_TS = (
    pd.read_csv(
        "/Users/hn/Documents/01_research_data/"
        "remote_sensing/02_peaks_and_plots/Grant_2016/Grant_2016_TS.csv"
    )
    # the per-image columns are not needed for the confusion table
    .drop(columns=["system:index", "B2", "B3", "B4", "B8", "doy", "NDVI"])
    # keep a single copy of every fully duplicated row
    .drop_duplicates()
    # rename the ".geo" column to "geo"
    .rename(columns={".geo": "geo"})
)

In [None]:
# Display the number of rows currently in Grant_2016_TS.
Grant_2016_TS.shape[0]

In [None]:
# Display the number of distinct values in the "geo" column.
Grant_2016_TS["geo"].nunique()

### List of files
    - Extract list of files in the input directory
    - Filter the wanted files that contain "all_polygons" in their name.


In [None]:
# Names of the entries in the input directory whose name contains
# "all_polygons"; every other entry is ignored.
file_list = [
    entry_name
    for entry_name in os.listdir(data_dir_base)
    if 'all_polygons' in entry_name
]

# Form confusion table

    - Define "double" as the positive class ("yes"): a field is predicted positive when it has 2 peaks, and it is actually positive when it is double-cropped:
    
    ----------------------------------------------------------------
    |                  |                  |                        |
    |                  | Predicted double |  Predicted NOT double  |
    |                  |      2 peaks     |       !(2 peaks)       |
    ----------------------------------------------------------------
    |                  |                  |                        |
    | Actual double    |        TP        |           FN           |
    |                  |                  |                        |
    ----------------------------------------------------------------
    |                  |                  |                        |
    | Actual NOT double|       FP         |          TN            |
    |                  |                  |                        |
    ----------------------------------------------------------------

In [None]:
# Empty results table with two rows reserved per input file.
pointer = 0  # row cursor, starts at the top of the table

output_columns = [
    'col',
    'predicted_double_peak',
    'predicted_not_double_peak',
    'total_sum',
    'params',
    "ignored_fields_count",
]

all_confusions = pd.DataFrame(
    columns=output_columns,
    index=np.arange(2 * len(file_list)),
    data=None,
)

# show (rows, columns) of the freshly allocated table
all_confusions.shape

In [None]:
# Build one 2x2 confusion table per parameter file and stack the tables in
# all_confusions (two rows per file: actual double / actual not-double).
#
# The last column is named "ignored_fields_count" so that the header matches
# the key used in the per-file dictionary below.
output_columns = ['col', 'predicted_double_peak', 'predicted_not_double_peak',
                  'total_sum', 'params', "ignored_fields_count"]

all_confusions = pd.DataFrame(data=None,
                              index=np.arange(len(file_list)*2),
                              columns=output_columns)
pointer = 0  # position of the next free pair of rows in all_confusions

for file in file_list:
    # The first four "_"-separated pieces of the file name are used as
    # <name>, <value>, <name>, <value> to label the run.
    broken_pieces = file.split("_")
    a_data = pd.read_csv(data_dir_base + file)

    # remove the last row (positional, independent of the index labels)
    a_data = a_data.iloc[:-1]

    # Split by ground truth first, then split each half by the prediction.
    double_cropped, NotDouble_cropped = rc.divide_double_nonDouble_by_notes(a_data)
    double_crop_double_peak, double_crop_NotDouble_peak = rc.divide_double_nonDouble_peaks(double_cropped)
    NotDouble_cropped_double_peak, NotDouble_cropped_NotDouble_peak = rc.divide_double_nonDouble_peaks(NotDouble_cropped)

    ############################################################
    ###
    ###     Form the confusion matrix
    ###
    ############################################################
    params = "{} = {}, {} = {}".format(broken_pieces[0], broken_pieces[1],
                                       broken_pieces[2], broken_pieces[3])
    TP = double_crop_double_peak.shape[0]
    FN = double_crop_NotDouble_peak.shape[0]
    FP = NotDouble_cropped_double_peak.shape[0]
    TN = NotDouble_cropped_NotDouble_peak.shape[0]
    total_size = TP + TN + FP + FN

    # NOTE(review): this subtracts row counts of two different tables; if
    # a_data holds more than one row per field this is not a field count.
    # Confirm against the layout of the input CSVs.
    ignored_fields_count = Grant_2016_TS.shape[0] - a_data.shape[0]

    d = {'col' : ["Actual double-cropped", "actual not-double-cropped"],
         'predicted_double_peak': [TP, FP],
         'predicted_not_double_peak': [FN, TN],
         'total_sum': [total_size, total_size],
         'params': [params, params],
         "ignored_fields_count": [ignored_fields_count, ignored_fields_count]
        }
    # Order the columns explicitly: the rows are copied by position below, so
    # the per-file frame must follow output_columns rather than dict order.
    curr_confusion = pd.DataFrame(data=d, columns=output_columns)

    all_confusions.iloc[pointer:(pointer+2)] = curr_confusion.values
    pointer += 2
    

In [None]:
# all_confusions.sort_values(by=['params'], inplace=True)

In [None]:
# Write the stacked confusion tables to disk, leaving out the row index.
output_file_name = "/Users/hn/Desktop/all_confusions_2016.csv"
all_confusions.to_csv(path_or_buf=output_file_name, index=False)

In [None]:
# Recompute the four confusion counts for one file only (the second entry of
# file_list) and print them for inspection.
file = file_list[1]

# read the file and drop its last row in one step
a_data = pd.read_csv(data_dir_base + file).iloc[:-1]
print(a_data.shape)

# ground-truth split, followed by the prediction split of each half
double_cropped, NotDouble_cropped = rc.divide_double_nonDouble_by_notes(a_data)
(double_crop_double_peak,
 double_crop_NotDouble_peak) = rc.divide_double_nonDouble_peaks(double_cropped)
(NotDouble_cropped_double_peak,
 NotDouble_cropped_NotDouble_peak) = rc.divide_double_nonDouble_peaks(NotDouble_cropped)

TP = len(double_crop_double_peak)
FN = len(double_crop_NotDouble_peak)
FP = len(NotDouble_cropped_double_peak)
TN = len(NotDouble_cropped_NotDouble_peak)

print("TP = ", TP, sep="")
print("FP = ", FP, sep="")
print("FN = ", FN, sep="")
print("TN = ", TN, sep="")

print("Number of double-cropped fields is", TP + FN)

In [None]:
# Confusion table for the file currently held in `file` / `a_data`.
#
# The label is rebuilt from `file` here. Reusing the `params` left behind by
# the loop would describe the last file of file_list, not the file whose
# counts are shown.
broken_pieces = file.split("_")
params = broken_pieces[0] + " = " + broken_pieces[1] + ", " + broken_pieces[2] + " = " + broken_pieces[3]

total_size = TP + TN + FP + FN

d = {'col' : ["Actual double-cropped", "actual not-double-cropped"],
     'predicted_double_peak': [TP, FP],
     'predicted_not_double_peak': [FN, TN],
     'total_sum': [total_size, total_size],
     'params': [params, params]
    }
curr_confusion = pd.DataFrame(data=d)

In [None]:
# Display the confusion table built in the previous cell.
curr_confusion

In [None]:
# Copy of a_data without the peak columns, keeping a single copy of every
# fully duplicated row. a_data itself is left untouched.
peak_columns = ["peak_Doy", "peak_value", "peak_count"]
clean_a_data = a_data.drop(columns=peak_columns).drop_duplicates()

In [None]:
# Display the first two rows of clean_a_data.
clean_a_data.head(2)

In [None]:
# Display (rows, columns) of clean_a_data.
clean_a_data.shape

In [None]:
# Number of distinct "geo" values in clean_a_data.
clean_a_data["geo"].nunique()

# Save each of the four confusion groups to its own CSV, without the index.
for group_frame, csv_path in (
    (double_crop_double_peak, "/Users/hn/Desktop/double_crop_double_peak.csv"),
    (double_crop_NotDouble_peak, "/Users/hn/Desktop/double_crop_NotDouble_peak.csv"),
    (NotDouble_cropped_double_peak, "/Users/hn/Desktop/NotDouble_cropped_double_peak.csv"),
    (NotDouble_cropped_NotDouble_peak, "/Users/hn/Desktop/NotDouble_cropped_NotDouble_peak.csv"),
):
    group_frame.to_csv(csv_path, index=False)

In [None]:
# Rebuild the confusion table for the file currently held in `file`.
#
# The file name is split again here. The `broken_pieces` left behind by the
# loop belongs to the last file of file_list, which would attach the wrong
# parameter label to these counts.
broken_pieces = file.split("_")
params = broken_pieces[0] + " = " + broken_pieces[1] + ", " + broken_pieces[2] + " = " + broken_pieces[3]
TP = double_crop_double_peak.shape[0]
FN = double_crop_NotDouble_peak.shape[0]
FP = NotDouble_cropped_double_peak.shape[0]
TN = NotDouble_cropped_NotDouble_peak.shape[0]
total_size = TP + TN + FP + FN

d = {'col' : ["Actual double-cropped", "actual not-double-cropped"],
     'predicted_double_peak': [TP, FP],
     'predicted_not_double_peak': [FN, TN],
     'total_sum': [total_size, total_size],
     'params': [params, params]
    }

df = pd.DataFrame(data=d)
# display the table
df

In [None]:
# Display the false-positive count (not double-cropped, but predicted double peak).
FP

In [None]:
# a_data.Notes.unique()
# a_data.to_csv("/Users/hn/Desktop/a_data.csv", index = False)