In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, cm
from matplotlib.patches import Path, PathPatch
import time, datetime
import CustomKernels, CustomModels
from sklearn.model_selection import train_test_split


##Formatting functions
def format_for_learning(xlat, xlon, temp):
    X = np.column_stack((xlat, xlon))
    Y = np.asarray(temp).reshape(len(temp),1)
    return X, Y

def format_grid_for_prediction(ylat, ylon):
    grid = np.column_stack((np.hstack((ylat)),np.hstack((ylon))))
    return grid
    
def format_prediction_to_grid(prediction, nrow, ncol):
    matrix = np.reshape(prediction, (nrow,ncol))  
    return matrix

class Pipeline:
    
    report_path = ""
    data_path = "data/current-version/"
    datafile = ""
    report = False
    verbose = False
    
    def __init__(self, data_path="data/current-version/", data_file=None, verbose=False, report=False):
        if report:
            self.create_report_folder()
        self.data_path = data_path
        if not data_file is None:
            self.load_data(data_file)
        self.verbose = verbose
        self.report = report

    def load_data(self, filename):
        self.datafile = filename
        df = pd.read_csv(self.data_path + self.datafile, ',')
        df = df.drop(['Unnamed: 0'], axis=1)

        self.X_all = df[['Lat', 'Lon']].values
        self.y_all = df[['Temp']].values

    def create_report_folder(self):
        t = datetime.datetime.now()
        self.report_path = 'reports/{0:%Y_%m_%d-%H_%M_%S}'.format(t)
        os.mkdir(self.report_path)

    def export_report(self, df, report_type):
        df.to_csv(self.report_path + '/%s_%s.csv' % (report_type, name))

    def partition_train_test(self, test_size=0.2):
        if self.verbose:
            print("Partition data into train and test samples")
            
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X_all, 
                                                                                self.y_all, test_size=0.2)
        
        if self.verbose:
            print("Number of train samples: " + str(self.X_train.shape[0]))
            print("Number of test samples: " + str(self.X_test.shape[0]))
        
        
    def simple_interpolation(self, model, params=None):
        if not params is None:
            model.set_params(**params)
            
        score_train = model.train(self.X_train, self.y_train, eval_score=True)
        
        if self.verbose:
            print("Train MSE score: " + str(score_train))
            
        self.predictions, score_test = model.predict(self.X_test, self.y_test, eval_score=True)
        
        if self.verbose:
            print("Test MSE score: " + str(score_test))
            
        return score_train, score_test
            

    def simple_optimization(self, model, param_grid, cv=None, n_restart=5):
        if self.report:
            bestc, bests, CV_res = model.optimize(self.X_train, self.y_train, cv=cv, 
                                    n_splits=n_restart, info=True, **param_grid)
            CV_res.to_csv(self.report_path + '/CV_%s.csv' % model.get_name())
        else:
            bestc, bests = model.optimize(self.X_train, self.y_train, cv=cv, 
                                    n_splits=n_restart, info=True, **param_grid)
            
        return bestc, bests
            
    def benchmark(self, model_list, param_grid_list, optim=False, cv=None, n_restart=5):
        if self.report:
            save = pd.DataFrame(columns=['Name', 'Train Score', 'Test Score', 'Parameters', 'Database'])
            
        for i in range(0, len(model_list)):
            model = model_list[i]
            
            if optim:
                params = param_grid_list[i]
                print(params)
                model_parameters, score_train = self.simple_optimization(model, params, cv=cv, n_restart=n_restart)
                self.predictions, score_test = model.predict(self.X_test, self.y_test, eval_score=True)
                model_name = model.to_string()
                model_parameters = str(model_parameters)
                
            else:
                if not param_grid_list is None:
                    params = param_grid_list[i]
                else:
                    params = None
                score_train, score_test = self.simple_interpolation(model, params)
                model_name, model_parameters = model.to_string()
                
            if self.report:
                save.loc[i] = [model_name, score_train, score_test, model_parameters, datafile]
        
        if self.report:
            save.to_csv(self.report_path + '/Benchmark.csv')


In [2]:
datafile = "Temp-2019_01_04-15_47.csv"
pipe = Pipeline(data_file=datafile, verbose=True, report=True)
pipe.partition_train_test(test_size=0.2)

Partition data into train and test samples
Number of train samples: 754
Number of test samples: 189


In [3]:
model_list = [
    CustomModels.NearestNeighbor(),
    CustomModels.InverseDistanceWeighting(),
    CustomModels.RandomForest(),
]
param_grid_list = [
    {
        
    },
    {
        'radius' : 10
    },
    {
        'n_estimators' : 1000,
        'max_depth' : 10
    }
]
pipe.benchmark(model_list, param_grid_list, optim=False, cv=None, n_restart=5)

Train MSE score: -0.0
Test MSE score: -8.38585714286
Train MSE score: -0.0
Test MSE score: -8.38585714286
Train MSE score: -1.09400521058
Test MSE score: -6.72630882237


In [4]:
model_list = [
    CustomModels.NearestNeighbor(),
    CustomModels.InverseDistanceWeighting(),
    CustomModels.GaussianProcess(),
    CustomModels.GeographicallyWeightedRegressor(),
    CustomModels.RegressionTree(),
    CustomModels.RandomForest(),
    CustomModels.ExtraTrees(),
    CustomModels.SupportVectorRegression()
]
param_grid_list = [
    {
        
    },
    {
        'radius' : [10, 100, 1000]
    },
    {
        
    },
    {
        
    },
    {
        'max_depth' : [9, 10, 11]
    },
    {
        'n_estimators' : [1000, 5000],
        'max_depth' : [9, 10, 11]
    },
    {
        'n_estimators' : [1000, 5000],
        'max_depth' : [9, 10, 11]
    },
    {
        'gamma' : [0.004, 0.02, 0.1],
        'C' : [1.0, 10, 1e2],
        'epsilon' : [0.0001, 0.001, 0.01, 0.1]
    },
    {
        'max_degree' : [3, 4, 5],
        'penalty' : [1.0, 3.0, 9.0]
    }
]
pipe.benchmark(model_list, param_grid_list, optim=True, cv='ShuffleSplit', n_restart=2)

{}
{'radius': [10, 100, 1000]}
{}
{}
{'max_depth': [9, 10, 11]}
{'n_estimators': [1000, 5000], 'max_depth': [9, 10, 11]}
{'n_estimators': [1000, 5000], 'max_depth': [9, 10, 11]}
{'epsilon': [0.0001, 0.001, 0.01, 0.1], 'C': [1.0, 10, 100.0], 'gamma': [0.004, 0.02, 0.1]}


In [5]:
##Read cities list
datafile = "Temp-2019_01_04-15_47.csv"
X_learn, y_learn = load_data(datafile)

##Create report folder
report_path = create_report_folder()


NameError: name 'load_data' is not defined

In [None]:
##Map boundaries
lon_min = -15.56
lat_min = 24.65
lon_max = 49.88
lat_max = 79.17

##Create map
map = Basemap(llcrnrlon=lon_min,llcrnrlat=lat_min,urcrnrlon=lon_max,urcrnrlat=lat_max, resolution = 'l', epsg=4668)

##Interpolation resolution
nx = 100
ny = 100

glons, glats = map.makegrid(nx, ny)
gx, gy = map(glons, glats)

##Format data for interpolation
#X_learn, y_learn = format_for_learning(lat, lon, temps)
grid = format_grid_for_prediction(glats, glons)

##Partition data between train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_learn, y_learn, test_size=0.2)
print("Number of train samples: " + str(X_train.shape[0]))
print("Number of test samples: " + str(X_test.shape[0]))

tic = time.time()

##Choose the interpolation model
model_list = [
    CustomModels.NearestNeighbor(),
    CustomModels.InverseDistanceWeighting(),
    CustomModels.GaussianProcess(),
    CustomModels.GeographicallyWeightedRegressor(),
    CustomModels.RegressionTree(),
    CustomModels.RandomForest(),
    CustomModels.ExtraTrees(),
    CustomModels.SupportVectorRegression()
]

#CV_Reports = {}
#test_model = CustomModels.SupportVectorRegression()
#param_grid = {
#    'gamma' : [0.004, 0.02, 0.1],
#    'C' : [1.0, 10, 1e2],
#    'epsilon' : [0.0001, 0.001, 0.01, 0.1]
#}
#bestc, bests, CV_res = test_model.optimize(X_train, y_train, cv='ShuffleSplit', 
#                                             n_splits = 5, info=True, **param_grid)
#CV_Reports[test_model.get_name()] = CV_res

save = pd.DataFrame(columns=['Name', 'Train Score', 'Test Score', 'Parameters', 'Database'])
for i in range(0, len(model_list)):
    model = model_list[i]
    model_name, model_parameters = model.to_string()
    score_train = model.train(X_train, y_train, eval_score=True)
    print("Train MSE score: " + str(score_train))
    preds, score_test = model.predict(X_test, y_test, eval_score=True)
    print("Test MSE score: " + str(score_test))
    preds = model.predict(grid)
    save.loc[i] = [model_name, score_train, score_test, model_parameters, datafile]
    
print(save)
toc = time.time()
print("Time: " + str(1000*(toc-tic)) + "ms")

##Format the predictions for plotting
predict = format_prediction_to_grid(preds, nx, ny)


In [None]:
fig, ax = plt.subplots(figsize=(24,24))

map.drawmapboundary(fill_color='white')
#map.fillcontinents(color='coral',lake_color='white')
map.drawcoastlines()

lon = X_learn[:,1]
lat = X_learn[:,0]
temps = y_learn[:,0]
x, y = map(lon, lat)
cities_out_bounds = []

##Create annotations for temperature and only keep cities in bound
for i in range(0,len(x)):
    if lon[i] > lon_min and lon[i] < lon_max and lat[i] > lat_min and lat[i] < lat_max:
        plt.text(x[i], y[i], "{0:.1f}".format(temps[i]),fontsize=10,fontweight='bold', ha='center',va='center',color='k')

##Plot contours
clevs = [-24,-22,-20,-18,-16,-14,-12,-10,-8,-6,-4,-2,0,2,4,6,8,10,12,14,16,18,20,22]
cs = map.contourf(gx,gy,predict,clevs,cmap='Spectral_r')

##Display colorbar
cbar = map.colorbar(cs,location='bottom',pad="5%")
cbar.set_label('degrees Celsius')

##Getting the limits of the map:
x0,x1 = ax.get_xlim()
y0,y1 = ax.get_ylim()
map_edges = np.array([[x0,y0],[x1,y0],[x1,y1],[x0,y1]])

##Getting all polygons used to draw the coastlines of the map
polys = [p.boundary for p in map.landpolygons]

##Combining with map edges
polys = [map_edges]+polys[:]

##Creating a PathPatch
codes = [
    [Path.MOVETO] + [Path.LINETO for p in p[1:]]
    for p in polys
]
polys_lin = [v for p in polys for v in p]
codes_lin = [c for cs in codes for c in cs]
path = Path(polys_lin, codes_lin)
patch = PathPatch(path,facecolor='cyan',lw=0)

##Masking the data:
ax.add_patch(patch)

plt.show()

In [None]:
#for name, report in CV_Reports.iteritems():
#    report.to_csv(report_path + '/CV_%s.csv' % name)
save.to_csv(report_path + '/Benchmark.csv')

In [None]:
tic = time.time()
toc = time.time()
print("Time: " + str(1000*(toc-tic)) + "ms")

In [None]:
obs[['Lat', 'Lon']].values