In [1]:
import warnings
warnings.filterwarnings('ignore',category=RuntimeWarning)
import xarray as xr
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import glob,os,sys
from tqdm import tqdm
import proplot as plot
import json,pickle
import dask.array as da
import gc
from sklearn.decomposition import PCA
sys.path.insert(1, '/work/FAC/FGSE/IDYST/tbeucler/default/freddy0218/TCGphy/2020_TC_CRF/dev/freddy0218/')
from tools import derive_var,read_and_proc,preproc_noensemble
from tools.mlr import mlr,proc_mlrfcst,maria_IO
from tools.preprocess import do_eof,preproc_maria
#import feature_select
import pysnooper
from dask.distributed import Client
# Start a Dask distributed client backed by 2 worker processes with 1 thread each.
# NOTE(review): re-running this cell creates a new client each time (the recorded
# output warns that the default dashboard port was already taken).
client = Client(processes=True, threads_per_worker=1,n_workers=2)
%matplotlib inline
# Global proplot style overrides: white figure background, ultralight axis/tick
# label weights, dashed minor grid lines, normal-weight titles and 0.5-wide lines.
plot.rc.update({'figure.facecolor':'w','axes.labelweight':'ultralight',
                'tick.labelweight':'ultralight','gridminor.linestyle':'--','title.weight':'normal','linewidth':0.5})

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43781 instead
  plot.rc.update({'figure.facecolor':'w','axes.labelweight':'ultralight',


In [2]:
############################################################################################################################################
# PCAdict  : fitted PCA (EOF) objects, read from the control-experiment pickle
# flatvar  : one pre-processed (smoothed) dictionary per experiment, read from disk
# vardicts : per-variable data split out of flatvar by maria_IO.input_output(...).readvar_separate
# *following EOF conventions, the EOFs were derived from anomalies.
############################################################################################################################################
# Basic settings
folderpath='/work/FAC/FGSE/IDYST/tbeucler/default/freddy0218/TCGphy/2020_TC_CRF/dev/freddy0218/pca/output/uvwheat/'
expname = ['ctl','ncrf_36h','ncrf_60h','lwcrf']
varname = ['u','v','w']
nummem = [54,26,50,5,75,12,10]
#################################################################################
# 24-hr smoothing model
# NOTE(review): depickle reads pickle files; only load files from a trusted location.
PCAdict = read_and_proc.depickle(folderpath+'PCA/ctl_PCA_dict1')
# Read one pre-processed dictionary per experiment, in the order given by expname
flatvar = [
    read_and_proc.depickle(folderpath+'preproc/'+str(experiment)+'_'+'smooth_preproc_dict1b_g')
    for experiment in expname
]
vardicts = maria_IO.input_output(PCAdict,folderpath,varname,nummem).readvar_separate(
    listdict=flatvar,
    varname=['u','v','w','qv','theta','hdia','rad'],
)
# The raw dictionaries are no longer needed once vardicts exists; release the memory.
del flatvar
gc.collect()
#################################################################################

235

In [4]:
from sklearn.feature_selection import SelectorMixin
from sklearn.metrics import r2_score
from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone
from sklearn.model_selection import cross_val_score
import numpy as np
import gc
import importlib
# Re-import the project modules so that edits made to them on disk are picked up
# without restarting the kernel.
importlib.reload(proc_mlrfcst)
importlib.reload(mlr)

class forwardfeatureadder(BaseEstimator,SelectorMixin,MetaEstimatorMixin):
    """Greedy forward feature selector.

    Starting from a fixed set of input columns (``startfeatures``), ``fit`` adds
    ``n_features_to_select`` further columns one at a time. At every step each
    remaining column is tried in turn and the one giving the highest score is kept.

    Parameters:
    estimator: Regression model (cloned once inside ``fit``)
    n_features_to_select: number of features to add on top of ``startfeatures``
    cv: number of cross-validation folds; only used by ``get_best_new_feature``
    n_jobs: parallelisation setting forwarded to ``cross_val_score``
    startfeatures: iterable of column indices that are always part of the model
                   (``None`` is treated as "no pre-selected columns")
    PCAdict, Afdict, numcomp, LT: forwarded unchanged to ``proc_mlrfcst.retrieve_cartesian``
    optigoal: ``'surface'`` scores the reconstructed forecast against ``Xsurf``/``Ysurf``;
              any other value scores it against the fields from ``get_real_winds``
    Xsurf, Ysurf: reference arrays for ``optigoal='surface'``; they are concatenated
                  along axis 0 and used as the ``y_true`` argument of ``r2_score``

    Attributes set by ``fit``:
    support_: boolean mask over the columns of X (start features + added features)
    new_feature: index of the feature added LAST (``None`` if nothing was added)
    r2: list with the best score reached at each selection step
    """
    def __init__(self,estimator,n_features_to_select=None,cv=5,n_jobs=None,startfeatures=None,PCAdict=None,Afdict=None,numcomp=None,LT=None,optigoal='surface',Xsurf=None,Ysurf=None):
        # Parameters are stored untouched (no validation here) so that the
        # BaseEstimator get_params/clone machinery keeps working.
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.cv = cv
        self.n_jobs = n_jobs
        self.startfeatures = startfeatures
        self.PCAdict = PCAdict
        self.Afdict = Afdict
        self.numcomp = numcomp
        self.LT = LT
        self.optigoal = optigoal
        self.Xsurf=Xsurf
        self.Ysurf=Ysurf

    def get_real_winds(self):
        """Return the four arrays produced by ``retrieve_cartesian(...).windrates_real``.

        No forecast is involved (``forecastPC=None``); these are the reference
        fields the forecasts are compared with.
        """
        temp1,temp2,temp3,temp4 = proc_mlrfcst.retrieve_cartesian(PCA_dict=self.PCAdict,Af_dict=self.Afdict,numcomp=self.numcomp,LT=self.LT,
                            forecastPC=None).windrates_real(LT=self.LT)
        return temp1,temp2,temp3,temp4

    def convert_forecast_winds(self,yforecast=None):
        """Reconstruct physical fields from forecast principal components.

        Returns four arrays when the full reconstruction succeeds, otherwise the
        two arrays of the ``target='surface'`` reconstruction.
        """
        try:
            # NOTE(review): `i` is not defined in this method, so it is resolved from
            # module/notebook globals, and `self.LT[i]` needs LT to be indexable.
            # When LT is a plain integer this attempt always raises and the surface
            # reconstruction below is what actually runs. Confirm the intended index
            # before relying on the four-field path.
            temp1,temp2,temp3,temp4 = proc_mlrfcst.retrieve_cartesian(PCA_dict=self.PCAdict,Af_dict=self.Afdict,numcomp=self.numcomp,LT=self.LT,
                                                                      forecastPC=yforecast).output_reshapeRECON(forecast_eig=yforecast[int(self.LT[i]-1)])
            return temp1,temp2,temp3,temp4
        except Exception:
            # Deliberate best-effort fallback. `Exception` instead of a bare except
            # so that KeyboardInterrupt / SystemExit are no longer swallowed.
            temp1,temp2 = proc_mlrfcst.retrieve_cartesian(PCA_dict=self.PCAdict,Af_dict=self.Afdict,numcomp=self.numcomp,LT=self.LT,
                                                                      forecastPC=yforecast,target='surface').output_reshapeRECON(forecast_eig=yforecast)
            return temp1,temp2

    def fit(self, X,y=None):
        """Learn features to select from X.
        X (n_samples,n_features): Training vectors (must support ``X[:, bool_mask]``)
        y: Target values, passed on to the scoring routine
        """
        # Define basic settings
        n_features = X.shape[1]
        current_mask = np.zeros(shape=n_features,dtype=bool)
        # startfeatures=None (the constructor default) means "start from nothing"
        for index in (self.startfeatures if self.startfeatures is not None else ()):
            current_mask[index] = True
        n_iteractions = self.n_features_to_select

        # Do forward selection
        r2 = []
        new_feature_idx = None  # stays None when n_features_to_select == 0
        clone_estimator = clone(self.estimator)
        for _ in range(n_iteractions):
            new_feature_idx,r2t = self.get_best_new_feature_R2based(clone_estimator,X,y,current_mask)
            #new_feature_idx = self.get_best_new_feature(clone_estimator,X,y,current_mask)
            r2.append(r2t)
            current_mask[new_feature_idx] = True

        self.support_ = current_mask
        self.new_feature = new_feature_idx
        self.r2 = r2
        return self

    def get_best_new_feature(self,estimator,X,y,current_mask):
        """Return the unselected column whose addition gives the best mean CV score.

        Uses ``cross_val_score`` with the estimator's default scorer.
        """
        candidate_feature_indices = np.flatnonzero(~current_mask)
        if candidate_feature_indices.size == 0:
            raise ValueError("No candidate features left to add: every column is already selected.")
        scores={}
        for feature_idx in candidate_feature_indices:
            candidate_mask = current_mask.copy()
            candidate_mask[feature_idx] = True

            # Add a new feature
            X_new = X[:,candidate_mask]
            # Improvement
            scores[feature_idx] = cross_val_score(estimator,X_new,y,cv=self.cv,scoring=None,n_jobs=self.n_jobs).mean()
        return max(scores,key=scores.get)

    @staticmethod
    def _surface_slice(field):
        """View each row of ``field`` as (39,360,167), keep index 0 of the 39-long axis, flatten again."""
        # presumably (vertical level, azimuth, radius) with level 0 at the surface - TODO confirm
        nsample = field.shape[0]
        return field.reshape(nsample,39,360,167)[:,0,:,:].reshape(nsample,360*167)

    def get_best_new_feature_R2based(self,estimator,X,y,current_mask):
        """Return ``(best_feature_index, best_r2)`` among the unselected columns.

        For every candidate column the model is fitted and evaluated on the same
        data (no cross-validation), the forecast is converted back to physical
        fields and compared with the reference through ``r2_score``.

        Raises ValueError when no candidate is left, or when ``optigoal='surface'``
        and ``Xsurf``/``Ysurf`` were not supplied.
        """
        candidate_feature_indices = np.flatnonzero(~current_mask)
        if candidate_feature_indices.size == 0:
            raise ValueError("No candidate features left to add: every column is already selected.")

        # The reference does not depend on the candidate: build it once.
        surface_mode = self.optigoal=='surface'
        if surface_mode:
            if self.Xsurf is None or self.Ysurf is None:
                raise ValueError("optigoal='surface' requires both Xsurf and Ysurf reference arrays.")
            reference = np.concatenate((self.Xsurf,self.Ysurf),axis=0)
        else:
            # Previously read from notebook globals (reteMP1..reteMP4) that are not
            # guaranteed to exist; derive the reference from this instance instead.
            reference = np.concatenate(self.get_real_winds(),axis=0)

        # Lead times 1..45 handed to mlr.model_fitpredict (loop invariant)
        LDTME = np.linspace(0,44,45)+1

        scores={}
        for feature_idx in candidate_feature_indices:
            candidate_mask = current_mask.copy()
            candidate_mask[feature_idx] = True
            # Add a new feature
            X_new = X[:,candidate_mask]
            # Fit on the candidate inputs and predict the same samples
            y_forecast = mlr.model_fitpredict(X_new,y,estimator,LDTME).modelfit(singleLT=True)[0].predict(X_new)
            # Forecast winds
            if surface_mode:
                teMP1,teMP2 = self.convert_forecast_winds(y_forecast)
                teMP1s,teMP2s = self._surface_slice(teMP1),self._surface_slice(teMP2)
                # Drop the full-size fields before scoring to limit peak memory
                del teMP1,teMP2
                gc.collect()

                scores[feature_idx] = r2_score(reference,np.concatenate((teMP1s,teMP2s),axis=0))
            else:
                teMP1,teMP2,teMP3,teMP4 = self.convert_forecast_winds(y_forecast)
                scores[feature_idx] = r2_score(reference,np.concatenate((teMP1,teMP2,teMP3,teMP4),axis=0))
        best_feature = max(scores,key=scores.get)
        return best_feature,scores[best_feature]

    def _get_support_mask(self):
        """Boolean mask of the selected columns (required by SelectorMixin)."""
        return self.support_

In [5]:
from tools.mlr import mlr,proc_mlrfcst
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

path = '/work/FAC/FGSE/IDYST/tbeucler/default/freddy0218/TCGphy/2020_TC_CRF/dev/freddy0218/pca/output/complex/heatcomp_separate/'
path2 = '/work/FAC/FGSE/IDYST/tbeucler/default/freddy0218/TCGphy/2020_TC_CRF/dev/freddy0218/pca/output/complex/Q95_complex/'
# Pickled regression input (index 0) and output (index 1) dictionaries under path/inputoutput/
file2 = [path+'inputoutput/'+stem for stem in ('inputQ95_dict_smoo6_withtheta','outputQ95_dict_smoo6_withtheta')]
mlr_inputdict = read_and_proc.depickle(file2[0])
mlr_output = read_and_proc.depickle(file2[1])

In [None]:
# Initiate model
nummem = [54,26,50,75]
linreg = LinearRegression()
mlrIN,mlrOUT = mlr.SimpleIOhandler(LT=24,auxIN=None).transform(mlr_inputdict['dtthuvw'],mlr_output)

# Reference fields: only the first two outputs of get_real_winds are kept.
reteMP1,reteMP2,_,_  = forwardfeatureadder(
    estimator=linreg,n_features_to_select=1,cv=3,n_jobs=2,
    startfeatures=list(range(241-36,241)),
    PCAdict=PCAdict,Afdict=vardicts,numcomp=nummem,LT=24,optigoal='surface',
).get_real_winds()
# View every row as (39,360,167), keep index 0 of the 39-long axis and flatten again.
reteMP1s = reteMP1.reshape(reteMP1.shape[0],39,360,167)[:,0,:,:].reshape(reteMP1.shape[0],360*167)
reteMP2s = reteMP2.reshape(reteMP2.shape[0],39,360,167)[:,0,:,:].reshape(reteMP2.shape[0],360*167)
del reteMP1,reteMP2
gc.collect()

# Train model
def generator():
    """Endless generator that yields None on every step."""
    while True:
        yield

i=0
model = []
reducedX = []
new_feature = []
# Columns 217..226 are always part of the model; every pass below appends one more.
holdmem = list(range(217,227))
for i in range(10):
    # Each pass adds exactly one feature on top of everything gathered in holdmem so far.
    # (An earlier variant instead asked for i+1 features on top of columns 205..240, cv=3.)
    seq_temp = forwardfeatureadder(
        estimator=linreg,n_features_to_select=1,cv=5,n_jobs=2,startfeatures=holdmem,
        PCAdict=PCAdict,Afdict=vardicts,numcomp=nummem,LT=24,optigoal='surface',
        Xsurf=reteMP1s,Ysurf=reteMP2s,
    ).fit(np.asarray(mlrIN),mlrOUT)
    holdmem.append(seq_temp.new_feature)
    new_feature.append(seq_temp.new_feature)
    # Keep only the selected columns, then refit a plain linear regression on them
    Xn = seq_temp.transform(np.asarray(mlrIN))
    ridge_reduced = LinearRegression().fit(Xn,mlrOUT)
    model.append(ridge_reduced)
    reducedX.append(Xn)
    # Optional early exit (disabled): stop as soon as
    # r2_score(mlrOUT,ridge_reduced.predict(Xn)) exceeds 0.9.

[33m[2mSource path:... [22m/tmp/ipykernel_2198616/2362939780.py[0m
[32m[2mStarting var:.. [22mself = REPR FAILED[0m
[32m[2mStarting var:.. [22mestimator = LinearRegression()[0m
[32m[2mStarting var:.. [22mn_features_to_select = 1[0m
[32m[2mStarting var:.. [22mcv = 3[0m
[32m[2mStarting var:.. [22mn_jobs = 2[0m
[32m[2mStarting var:.. [22mstartfeatures = [205, 206, 207, 208, 209, 210, 211, 212, 213, 21...231, 232, 233, 234, 235, 236, 237, 238, 239, 240][0m
[32m[2mStarting var:.. [22mPCAdict = {'u': PCA(), 'v': PCA(), 'w': PCA(), 'qv': PCA()... PCA(), 'hdia': PCA(), 'rad': PCA(), 'ir': PCA()}[0m
[32m[2mStarting var:.. [22mAfdict = {'ctl': [array([[  0.16094698,   0.38626236,   0...61916432e-05, -1.60627460e-05]], dtype=float32)]}[0m
[32m[2mStarting var:.. [22mnumcomp = [54, 26, 50, 75][0m
[32m[2mStarting var:.. [22mLT = 24[0m
[32m[2mStarting var:.. [22moptigoal = 'surface'[0m
[32m[2mStarting var:.. [22mXsurf = None[0m
[32m[2mStarting var

In [26]:
# Display the feature indices gathered so far: the ten initial columns followed
# by the indices appended, one per pass, by the selection loop.
holdmem

[217,
 218,
 219,
 220,
 221,
 222,
 223,
 224,
 225,
 226,
 25,
 114,
 3,
 185,
 178,
 193,
 28,
 195,
 47,
 106]

In [19]:
# Columns 217..226 plus one extra column (176)
listt = [*range(217,227),176]

In [20]:
# Flag every index from listt in current_mask.
# NOTE(review): current_mask is not created by any notebook-level cell in view
# (it only exists as a local inside forwardfeatureadder.fit), so this cell relies
# on leftover kernel state - presumably a boolean numpy mask; confirm or define it here.
for index in listt:
    current_mask[index] = True

In [13]:
def comprehension_loop():
    """Apply ``task`` to every item of ``pics`` sequentially and return the results as a list."""
    outputs = []
    for image in pics:
        outputs.append(task(image))
    return outputs

%time test2 = comprehension_loop()

CPU times: user 1.46 s, sys: 4.18 ms, total: 1.47 s
Wall time: 1.48 s


In [11]:
from joblib import Parallel, delayed
def joblib_loop():
    """Apply ``task`` to every item of ``pics`` through joblib with 2 jobs and return the list of results."""
    runner = Parallel(n_jobs=2)
    return runner(delayed(task)(item) for item in pics)

%time 
test = joblib_loop()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs


In [15]:
# Element-wise comparison of the first result of the joblib run (test) with the
# first result of the sequential run (test2).
test[0]==test2[0]

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])