In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
#https://github.com/mwv/zca
from zca.zca import zca
import pandas as pd
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt
import scipy
import numpy as np
import scipy.stats as st
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import GridSearchCV

In [None]:
class ss_yj_Transformer(BaseEstimator, TransformerMixin):
    """Robust-scale, power-transform and ZCA-whiten a DataFrame (and optionally a target).

    Feature pipeline applied to ``X``:
        1. subtract the per-column median and divide by the per-column median
           absolute deviation (``scale='normal'``),
        2. ``PowerTransformer(standardize=True)``,
        3. ZCA whitening.

    Optional target pipeline applied to ``y``:
        ``StandardScaler`` -> ``PowerTransformer(standardize=True)`` -> ZCA.

    ``X`` (and ``y`` when given) must be pandas DataFrames: column labels and the
    row index are read from them and carried over to the returned frames.

    Parameters
    ----------
    feature_names : list, optional
        Stored on the instance as-is; ``None`` becomes an empty list.
    additional_param : str, optional
        Stored on the instance; not used by the transformer itself.

    Attributes
    ----------
    medians : pandas.Series
        Per-column medians of the ``X`` seen in ``fit``.
    mads : pandas.DataFrame
        One-row frame with the per-column MAD of the ``X`` seen in ``fit``.
    mad_transformed : pandas.DataFrame
        The MAD-scaled training data the power transformer was fitted on.
    """

    def __init__(self, feature_names=None, additional_param=""):
        # Sub-transformers for the feature matrix.
        self.ss_X = StandardScaler()
        self.pt_X = PowerTransformer(standardize=True)
        self.zca_X = zca.ZCA()
        # Sub-transformers for the optional target matrix.
        self.ss_y = StandardScaler()
        self.pt_y = PowerTransformer(standardize=True)
        self.zca_y = zca.ZCA()
        # A fresh list per instance: the former ``[]`` default was shared
        # between every instance created without an explicit argument.
        self.feature_names = [] if feature_names is None else feature_names
        self.additional_param = additional_param
        self.medians = pd.DataFrame()
        self.mads = []
        self.mad_transformed = pd.DataFrame()

    def _mad_scale(self, X):
        """Centre ``X`` on the fitted medians and divide by the fitted MADs."""
        # NOTE(review): a column with MAD == 0 divides by zero here, exactly as
        # the original inline expression did - confirm the inputs rule that out.
        return (X - self.medians) / np.asarray(self.mads).ravel()

    def fit(self, X, y=None):
        """Learn medians, MADs, power-transform parameters and whitening matrices.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : pandas.DataFrame, optional
            Target matrix; when given, the target pipeline is fitted as well.

        Returns
        -------
        self
        """
        # Still fitted so the attribute stays usable, but the feature pipeline
        # below is MAD-based and does not go through ``ss_X``.
        self.ss_X.fit(X)
        self.medians = X.median()
        self.mads = pd.DataFrame(
            scipy.stats.median_abs_deviation(X, scale='normal'), index=X.columns
        ).T
        self.mad_transformed = self._mad_scale(X)
        self.pt_X.fit(self.mad_transformed)
        # Fit the whitening on the same representation ``transform`` produces
        # (MAD-scaled -> power-transformed); it used to be fitted on a
        # StandardScaler-based input the rest of the pipeline never sees.
        self.zca_X.fit(self.pt_X.transform(self.mad_transformed))
        if y is not None:
            self.ss_y.fit(y)
            y_power = self.pt_y.fit_transform(self.ss_y.transform(y))
            self.zca_y.fit(y_power)
        return self

    def transform(self, X, y=None):
        """Apply the fitted pipelines to ``X`` (and ``y``) without refitting.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix with the columns seen in ``fit``.
        y : pandas.DataFrame, optional
            Target matrix with the columns seen in ``fit``.

        Returns
        -------
        pandas.DataFrame, or a tuple of two DataFrames when ``y`` is given.
            Whitened data carrying the input's columns and row index.
        """
        # Use the *fitted* state on the data actually passed in. The previous
        # version called ``fit_transform`` on the stored training data, so it
        # ignored ``X`` and silently re-fitted every stage on each call.
        whitened_X = self.zca_X.transform(self.pt_X.transform(self._mad_scale(X)))
        pt_X_ = pd.DataFrame(whitened_X, columns=X.columns, index=X.index)
        if y is None:
            return pt_X_
        whitened_y = self.zca_y.transform(self.pt_y.transform(self.ss_y.transform(y)))
        pt_y_ = pd.DataFrame(whitened_y, columns=y.columns, index=y.index)
        return pt_X_, pt_y_

    def inverse_transform(self, X, y=None):
        """Map whitened data back to the original feature (and target) scale.

        Parameters
        ----------
        X : pandas.DataFrame
            Whitened features, as returned by ``transform``.
        y : pandas.DataFrame, optional
            Whitened targets, as returned by ``transform``.

        Returns
        -------
        pandas.DataFrame, or a tuple of two DataFrames when ``y`` is given.
            Data on the original scale, carrying the input's columns and row index.
        """
        # Undo the stages in reverse order: ZCA -> power transform -> MAD scaling.
        unwhitened_X = self.pt_X.inverse_transform(self.zca_X.inverse_transform(X))
        target_X = (
            pd.DataFrame(unwhitened_X, columns=X.columns, index=X.index)
            * np.asarray(self.mads).ravel()
            + np.asarray(self.medians)
        )
        if y is None:
            return target_X
        unwhitened_y = self.ss_y.inverse_transform(
            self.pt_y.inverse_transform(self.zca_y.inverse_transform(y))
        )
        target_y = pd.DataFrame(unwhitened_y, index=y.index, columns=y.columns)
        return target_X, target_y
        

In [None]:
# Name of the column kept out of the feature matrix; the cells below remove it
# from X and use its values as the row index.
exclude = 'States'

In [None]:
# Candidate KDE bandwidths: 10 evenly spaced values between 1.0 and 1.2,
# used as the GridSearchCV parameter grid further down.
bandwidths = np.linspace(1.0, 1.2, 10)

In [None]:
# Show the candidate bandwidth grid.
bandwidths

In [None]:
# Load the raw table; the `exclude` column holds the row labels.
all_data = pd.read_csv('../data/raw/states.csv')

# Keep every column except the label column. `drop` preserves the file's
# column order; the former set-based selection raises TypeError on
# pandas >= 2.0 and produced an arbitrary column order from run to run.
X = all_data.drop(columns=[exclude])
display(X.describe())
# Label the rows with the excluded column so later cells can select by name.
X.index = all_data[exclude]
display(X)

In [None]:
# Fit the transformer on the feature matrix (fit returns the fitted instance)
# and compute the whitened representation.
scaler = ss_yj_Transformer().fit(X)
X_ = scaler.transform(X)
X_.index = all_data[exclude]

# Summary statistics followed by per-column extremes of the whitened data.
display(X_.describe(), X_.min(), X_.max())

# One box per whitened column, drawn on axes that fill the whole figure.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_axes([0, 0, 1, 1])
bp = ax.boxplot(X_)

In [None]:
# Cross-validated search over the candidate bandwidths for a Gaussian KDE.
# NOTE(review): the KDE is fitted on normal-CDF values, which lie in [0, 1],
# while the candidate bandwidths are >= 1.0 - confirm the grid is intended.
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': bandwidths},
                    )

# Per-column results are collected in lists and concatenated once after the
# loop instead of growing a DataFrame with pd.concat on every iteration.
kde_columns = []
raw_columns = []

for i in X.columns:
    print(i)
    # Whitened values of this column in ascending order, mapped through the
    # standard normal CDF.
    og = X_[[i]].sort_values(kind="quicksort", by=i, ascending=True)
    og_pct = pd.DataFrame(st.norm.cdf(og), index=og.index)
    plt.plot(og_pct)
    plt.xticks(rotation=90, size=8)

    grid.fit(og_pct)
    best_bandwidth = grid.best_params_['bandwidth']
    print("bandwidth: " + str(best_bandwidth))
    kde = KernelDensity(bandwidth=best_bandwidth, kernel='gaussian')
    kde.fit(og_pct)

    # Evaluate the density once (it used to be scored twice) and turn it into
    # a normalised cumulative sum over the sorted sample.
    density = np.exp(kde.score_samples(og_pct))
    # Despite its name, logprobX holds cumulative shares in (0, 1], not log-probabilities.
    logprobX = pd.DataFrame(np.cumsum(density) / np.sum(density))
    # Average each value with its predecessor (0 before the first one).
    # np.nan replaces np.NaN, an alias that no longer exists in NumPy 2.0.
    logprobX = (logprobX.shift(1).replace([np.inf, -np.inf, np.nan], 0) + logprobX) / 2
    logprobX.index = og.index

    kde_columns.append(logprobX)
    raw_columns.append(og_pct)
    plt.plot(logprobX)
    plt.show()

    # Normal scores of the smallest and largest KDE percentiles, next to the
    # actual extremes of the whitened column.
    normal_scores = st.norm.ppf(logprobX)
    display(normal_scores[0])
    display(normal_scores[len(logprobX)-1])
    display(X_[[i]].min())
    display(X_[[i]].max())

# Column-wise concatenation aligns the differently ordered rows on their index.
kde_pct = pd.concat(kde_columns, axis=1) if kde_columns else pd.DataFrame()
raw_pct = pd.concat(raw_columns, axis=1) if raw_columns else pd.DataFrame()



In [None]:
# Give both percentile tables the feature names of X as column labels.
for pct_frame in (raw_pct, kde_pct):
    pct_frame.columns = X.columns

In [None]:
# Box plot of the KDE-smoothed percentiles, one box per column of kde_pct.
fig = plt.figure(figsize =(10, 7))
ax = fig.add_axes([0, 0, 1, 1])

# The axes fill the whole figure; `bp` keeps the dict returned by boxplot.
bp = ax.boxplot(kde_pct)

In [None]:
# Box plot of the raw normal-CDF percentiles, one box per column of raw_pct.
fig = plt.figure(figsize =(10, 7))
ax = fig.add_axes([0, 0, 1, 1])

# The axes fill the whole figure; `bp` keeps the dict returned by boxplot.
bp = ax.boxplot(raw_pct)

In [None]:
# Draw 10000 uniform samples per feature, rescaled to each column's
# [min, max] range of KDE percentiles. The column count follows X instead of
# a hard-coded 10, so the cell keeps working when the feature set changes.
# (Swap kde_pct for raw_pct to sample within the raw percentile range instead.)
random_ = pd.DataFrame(np.random.rand(10000, len(X.columns)), columns=X.columns)*(kde_pct.max()-kde_pct.min())+kde_pct.min()
# Percentiles -> normal scores -> original feature scale, then summarise.
random_set = scaler.inverse_transform(pd.DataFrame(st.norm.ppf(random_),columns=X_.columns)).describe()


In [None]:
# random_set already holds the .describe() summary of the back-transformed
# samples; describing it again would report statistics of those statistics,
# so the summary is shown directly.
random_set

In [None]:
# Build 1000 synthetic rows by drawing, independently for every column, one
# observed percentile from a randomly chosen row of raw_pct.
n_draws = 1000
n_cols = raw_pct.shape[1]

# indices[r, c] is the source row used for column c of synthetic row r.
indices = np.random.choice(raw_pct.shape[0], size=(n_draws, n_cols))

# One vectorised lookup replaces the former nested loops, which grew the frame
# with pd.concat on every iteration, gave every row the index label 0 and read
# values through the deprecated positional `series[0]` fallback.
dataset = pd.DataFrame(
    raw_pct.to_numpy()[indices, np.arange(n_cols)],
    columns=raw_pct.columns,
)

# Percentiles -> normal scores -> original feature scale, then summarise.
scaler.inverse_transform(pd.DataFrame(st.norm.ppf(dataset), columns=dataset.columns)).describe()

In [None]:
# Back-transform the uniform samples directly and summarise the result.
# NOTE(review): random_ is built from percentile ranges of kde_pct, and the
# other cells apply st.norm.ppf before calling inverse_transform - confirm that
# skipping the ppf step here is intentional.
scaler.inverse_transform(random_).describe()

In [None]:
# Per-column minimum, then maximum, of the whitened data.
display(X_.min(), X_.max())

In [None]:
# Stand-alone ZCA whitening fitted directly on the untransformed features
# (no MAD scaling or power transform), labelled with the excluded column.
testzca = zca.ZCA()
testzca.fit(X)
testzca_X = pd.DataFrame(testzca.transform(X),index=all_data[exclude],columns=X.columns)

In [None]:
#here I impute new values into the "random noise", aka whitened matrix
test1 = testzca_X.loc[['New Mexico']]
test2 = testzca_X.loc[['New Mexico']]
test3 = testzca_X.loc[['New Mexico']]

In [None]:
# Overwrite the whitened Crime value of each probe with the first quartile,
# zero and the third quartile of the whitened Crime column.
crime_whitened = testzca_X['Crime']
test1['Crime'] = crime_whitened.quantile(0.25)
test2['Crime'] = 0
test3['Crime'] = crime_whitened.quantile(0.75)

In [None]:
# test1..test3 are rows of testzca_X, i.e. whitened by `testzca` alone, so they
# are mapped back with that same transformer. Inverting them with `scaler`
# (MAD scaling + power transform + its own ZCA) would apply the inverse of a
# different fitted pipeline.
for probe in (test1, test2, test3):
    display(pd.DataFrame(testzca.inverse_transform(probe.to_numpy()),
                         index=probe.index, columns=probe.columns))

In [None]:
#here I impute new values into the "random noise", aka whitened matrix
test4 = X_.loc[['Alaska']]
test5 = X_.loc[['Alaska']]
test6 = X_.loc[['Alaska']]

In [None]:
# Overwrite the whitened Crime value of each probe with the column minimum,
# zero and the column maximum respectively.
crime_overrides = (
    (test4, X_['Crime'].min()),
    (test5, 0),
    (test6, X_['Crime'].max()),
)
for probe_frame, crime_value in crime_overrides:
    probe_frame['Crime'] = crime_value

In [None]:
# Map each modified Alaska probe back to the original feature scale.
for probe in (test4, test5, test6):
    display(scaler.inverse_transform(probe))

In [None]:
# Back-transform the all-zero point of the whitened space.
origin_whitened = pd.DataFrame(np.zeros((1, X.shape[1])), columns=X.columns)
scaler.inverse_transform(origin_whitened)

In [None]:
# Round trip: summary statistics of the whitened data mapped back through
# scaler.inverse_transform.
scaler.inverse_transform(X_).describe()

In [None]:
# Histogram of every whitened column.
X_.hist()

In [None]:
# Same round-trip summary as a few cells earlier: whitened data mapped back
# through scaler.inverse_transform, then described.
scaler.inverse_transform(X_).describe()