In [1]:
import pandas as pd

# Load the table that the CTGAN model in the next cell is fitted on.
# The path is anchored at the user's home directory ('~').
data = pd.read_csv('~/ctgan/data/annual_KY_WI_3class_stdDev_6ft.csv')
# Preview the first rows (last expression of the cell, so it is displayed).
# NOTE(review): the recorded preview shows five rows with identical feature
# values -- check how many distinct rows the file actually contains.
data.head()

Unnamed: 0,Total Radiation (MJ/m^2),Total Rainfall (mm),Avg Air Temp (C),Avg Min Temp (C),Avg Max Temp (C),Avg Soil Moisture (%),Class
0,3273.52,804.6,17.607333,12.617071,22.598714,0.313481,2
1,3273.52,804.6,17.607333,12.617071,22.598714,0.313481,2
2,3273.52,804.6,17.607333,12.617071,22.598714,0.313481,2
3,3273.52,804.6,17.607333,12.617071,22.598714,0.313481,2
4,3273.52,804.6,17.607333,12.617071,22.598714,0.313481,2


In [2]:
from sdv.tabular import CTGAN

# Build the CTGAN synthesizer and fit it on the table loaded in `data`.
# NOTE(review): no random seed is set anywhere in this notebook, so every
# re-run trains a different generator and produces different samples.
# NOTE(review): both learning rates (2e-2) look far above the library
# defaults -- confirm against the SDV documentation that this is intended.
model = CTGAN(
    epochs=10000,
    batch_size=10,
    generator_lr=2e-2,
    discriminator_lr=2e-2,
)
model.fit(data)

In [3]:
# Draw 1000 synthetic rows from the fitted CTGAN model.
new_data = model.sample(1000)

In [4]:
# Persist the synthetic sample to disk, then preview its first rows.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an unnamed first column; reading this file back with
# pd.read_csv yields an extra "Unnamed: 0" column unless index_col=0 is used.
new_data.to_csv('~/ctgan/data/synthData/mlp_params_1k_snlt_02.csv')
new_data.head()

Unnamed: 0,Total Radiation (MJ/m^2),Total Rainfall (mm),Avg Air Temp (C),Avg Min Temp (C),Avg Max Temp (C),Avg Soil Moisture (%),Class
0,4299.63,917.0,23.790179,1.081506,16.225443,0.121775,2
1,4299.63,917.0,23.790179,1.081506,16.225443,0.121775,2
2,4299.63,917.0,12.552551,1.081506,17.673176,0.07,2
3,4299.63,917.0,12.552551,1.081506,17.673176,0.07,2
4,4299.63,917.0,12.552551,1.081506,17.673176,0.07,2


In [5]:
# Aggregate data vs. target data.
#   targetDf : read from the CSV at targetDataLoc
#   aggDf    : the synthetic sample generated earlier in this session
targetDataLoc = '~/ctgan/data/annual_GA_3class_1varPerYear_stdDev.csv'

# aggDf is another name for the same object as new_data (no copy is made).
# Disabled alternative: load a previously saved aggregate file instead, e.g.
#   aggDataLoc = 'data/synth1_GA_only_063022.csv'
#   aggDf = pd.read_csv(aggDataLoc).drop("Unnamed: 0", axis=1)
aggDf = new_data

# Disabled follow-up: targetDf = targetDf.drop("Unnamed: 0", axis=1)
targetDf = pd.read_csv(targetDataLoc)

In [6]:
############## imports
# general
import statistics
import datetime
#from sklearn.externals import joblib # save and load models
import random
# data manipulation and exploration
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

## machine learning stuff
# preprocessing
from sklearn import preprocessing
# feature selection
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import f_regression
# pipeline
from sklearn.pipeline import Pipeline
# train/testing
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score  
# error calculations
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# models
from sklearn.linear_model import LinearRegression # linear regression
from sklearn.linear_model import BayesianRidge #bayesisan ridge regression
from sklearn.svm import SVR  # support vector machines regression
from sklearn.gaussian_process import GaussianProcessRegressor # import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor # k-nearest neightbors for regression
from sklearn.neural_network import MLPRegressor # neural network for regression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeRegressor # decision tree regressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor  # random forest regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor # adaboost for regression
# saving models
# from sklearn.externals import joblib
import joblib

# import the API
# Directory holding the project's helper module. The path is relative, so it
# is resolved against the current working directory when the import runs.
APILoc = 'API/'

import sys
# Prepend (index 0) so modules found in APILoc win over same-named modules
# located elsewhere on sys.path.
sys.path.insert(0, APILoc)

# Wildcard import: every public name of module `API` lands in the notebook
# namespace and can shadow names imported above.
# NOTE(review): saveMLResults, called later in this notebook, is presumably
# defined in this module -- confirm.
from API import *

In [7]:
# Keep only the columns the machine learning models will use.
# Earlier feature sets also drew on "Julian Day", "Time Since Sown (Days)",
# "Time Since Last Harvest (Days)", "Day Length (hrs)" and "Percent Cover (%)";
# none of those are part of the current selection.

# feature columns
xColumnsToKeep = [
    "Total Radiation (MJ/m^2)",
    "Total Rainfall (mm)",
    "Avg Air Temp (C)",
    "Avg Min Temp (C)",
    "Avg Max Temp (C)",
    "Avg Soil Moisture (%)",
]

# target column
yColumnsToKeep = ["Class"]

# Slice features and targets out of both frames and give each result a fresh
# 0..n-1 index: xDf / yDf come from aggDf, test_xDf / test_yDf from targetDf.
xDf = aggDf[xColumnsToKeep].reset_index(drop=True)
yDf = aggDf[yColumnsToKeep].reset_index(drop=True)
test_xDf = targetDf[xColumnsToKeep].reset_index(drop=True)
test_yDf = targetDf[yColumnsToKeep].reset_index(drop=True)

# Raise pandas' display limits so large frames are shown in full.
pd.set_option('display.max_rows', 2500)
pd.set_option('display.max_columns', 500)

# Feature names as a plain Python list.
xCols = xDf.columns.tolist()

In [8]:
# Hide the warnings, because training the neural network causes lots of them.
# NOTE: this silences every warning category for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Parameter grids for sklearn's GridSearchCV, one per candidate model.
# Every key carries the "model__" prefix, i.e. sklearn's
# "<step>__<parameter>" syntax for addressing a Pipeline step; presumably the
# step is named "model" inside the API helper that runs the search -- confirm.

# random forest
rfParamGrid = {
    'model__n_estimators': [5, 10, 25, 50, 100],  # number of estimators
    'model__max_depth': [5, 10, 15, 20],          # maximum depth of the tree
    'model__criterion': ["gini"],
}

# k-nearest neighbours
knnParamGrid = {
    'model__n_neighbors': [2, 5, 10],
    'model__weights': ['uniform', 'distance'],
    'model__leaf_size': [5, 10, 30, 50],
}

# support vector machine
svrParamGrid = {
    'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'model__C': [0.1, 1.0, 5.0, 10.0],
    'model__gamma': ["scale", "auto"],
    'model__degree': [2, 3, 4, 5],
}

# neural network (multi-layer perceptron)
nnParamGrid = {
    # One tuple per architecture, one element per hidden layer. Single-layer
    # entries need the trailing comma: (3) is just the int 3, not a tuple,
    # which made the grid mix ints and tuples. (3,) builds the same network.
    'model__hidden_layer_sizes': [(3,), (5,), (10,), (3, 3), (5, 5), (7, 7)],
    'model__solver': ['sgd', 'adam'],
    'model__learning_rate': ['constant', 'invscaling', 'adaptive'],
    'model__learning_rate_init': [0.1, 0.01, 0.001],
}

# linear regression: nothing to tune
linRegParamGrid = {}

# Bayesian ridge regression
# NOTE(review): newer scikit-learn releases rename BayesianRidge's `n_iter`
# to `max_iter` -- confirm against the installed version before enabling.
bayesParamGrid = {
    'model__n_iter': [100, 300, 500],
}

# decision tree
dtParamGrid = {
    'model__criterion': ['gini'],
    'model__max_depth': [5, 10, 25, 50, 100],
}

# Candidate models, one tuple each: (unfitted estimator, parameter grid,
# name ending in ".pkl" -- presumably the file the result is pickled to).
# Only the MLP classifier is enabled. Disabled candidates, kept for reference:
#   (RandomForestClassifier(), rfParamGrid, "rfTup.pkl")
#   (KNeighborsRegressor(), knnParamGrid, "knnTup.pkl")
#   (SVR(), svrParamGrid, "svrTup.pkl")
#   (LinearRegression(), linRegParamGrid, "linRegTup.pkl")
#   (BayesianRidge(), bayesParamGrid, "bayesTup.pkl")
#   (DecisionTreeClassifier(), dtParamGrid, "dtTup.pkl")
aModelList = [
    (MLPClassifier(), nnParamGrid, "nnTup.pkl"),
]

# Run settings handed to saveMLResults in the next cell.
N = 10  # NOTE(review): meaning is defined by the API helper -- confirm
workingDir = 'working_dir'
numFeatures = 6  # matches the six feature columns currently kept (was 11)

In [9]:
# Run the grid search / evaluation helper for every entry in aModelList and
# print its report (printResults=True).
# NOTE(review): the recorded report shows Accuracy 1.0 and f1 1.0 together
# with mcc 0.0, which typically happens when only one class is present in the
# labels being scored -- check the class balance of the synthetic sample.
saveMLResults(
    test_xDf,
    test_yDf,
    N,
    xDf,
    yDf,
    aModelList,
    workingDir,
    numFeatures,
    printResults=True,
)

model:  nnTup
Avg MAE:  0.0
Avg R squared:  1.0
Best MAE:  0.0
Best R squared:  1.0
Parameters of the best model:  {'model__hidden_layer_sizes': 3, 'model__learning_rate': 'constant', 'model__learning_rate_init': 0.1, 'model__solver': 'sgd'}
Features selected by best model:  ['Total Radiation (MJ/m^2)', 'Total Rainfall (mm)', 'Avg Air Temp (C)', 'Avg Min Temp (C)', 'Avg Max Temp (C)', 'Avg Soil Moisture (%)']
Accuracy:  1.0
f1 score:  1.0
mcc:  0.0
 
test results on our test data: 
results:
MAE:  0.375
R sq:  -0.04347826086956519
accuracy score for DA/TDA:  0.625
