<a href="https://colab.research.google.com/github/yahyagec/dimensionality-reduction/blob/master/union.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install the third-party packages for this notebook (IPython shell magics).
!pip install dcor
!pip install lightgbm
!pip install xgboost
!pip install catboost
# tsfresh installation is disabled.
#!pip install tsfresh
# NOTE(review): keras is installed here but not imported in any visible cell.
!pip install keras

In [0]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# `sklearn.cross_validation` was removed from scikit-learn; `train_test_split`
# lives in `sklearn.model_selection`, which this cell already imports from.
from sklearn.model_selection import train_test_split as tts
import dcor
from sklearn.cluster import KMeans as km
from scipy import stats
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from itertools import groupby
import math
import io
from sklearn.random_projection import SparseRandomProjection
from multimethod import multimethod
from sklearn.model_selection import RandomizedSearchCV as rscv

class numerics():
    """Holder for the dtype names that count as "numeric" columns.

    Args:
        num: list of dtype names passed to ``DataFrame.select_dtypes``.
            When omitted, the signed integer and float dtypes of 16 to 64
            bits are used.
    """

    DEFAULT_DTYPES = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')

    def __init__(self, num=None):
        # Build a fresh list per instance so callers mutating ``num`` cannot
        # leak changes into later instances (mutable-default pitfall).
        self.num = list(self.DEFAULT_DTYPES) if num is None else num
    
# Names used by AlgType that the import cell above does not provide.
from enum import Enum

from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor


class AlgType(Enum):
    """Boosted-tree estimators selectable for importance-based reduction.

    Each member's ``value`` is a default-constructed estimator instance.
    The instance is shared by every user of the member, so treat it as a
    template (clone it or hand it to a search object) rather than fitting
    it in place.
    """

    # Regressors and classifiers from the three boosting libraries.
    xgboost_regressor = XGBRegressor()
    xgboost_classifier = XGBClassifier()
    lightgbm_regressor = LGBMRegressor()
    lightgbm_classifier = LGBMClassifier()
    catboost_regressor = CatBoostRegressor()
    catboost_classifier = CatBoostClassifier()
    
class DimensionalityReduction():
    """Feature-selection / dimensionality-reduction helpers for one dataset.

    The instance stores the feature frame (``values``), the target column
    (``target``) and optional metadata. Every reducer accepts an explicit
    ``values`` argument and falls back to ``self.values`` when it is omitted.
    Unless stated otherwise a reducer returns the triple
    ``(reduced_frame, features_eliminated, features_chosen)``.

    Args:
        raw_data: DataFrame holding the features and the target column.
        target_name: label of the target column inside ``raw_data``.
        meta_data: optional DataFrame; the methods here read its
            ``VARIABLE``, ``TYPE`` and ``DEFAULT_VALUES`` columns.
        test: optional DataFrame used as the comparison population by
            ``reduct_psi`` and required by ``reduct_sparse_rp``.

    Raises:
        TypeError: if ``target_name`` is not a string.
    """

    # Dtypes treated as numeric by the numeric-only reducers; same list as
    # the default of the module-level ``numerics`` helper.
    _NUMERIC_DTYPES = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')

    def __init__(self, raw_data, target_name, meta_data=None, test=None):
        # ``assert(cond, msg)`` asserts a non-empty tuple and never fails,
        # so validate explicitly.
        if not isinstance(target_name, str):
            raise TypeError('target_name must be string!')
        self.raw_data = raw_data
        self.values = raw_data.drop([target_name], axis=1)
        self.target = raw_data[target_name]
        self.meta_data = meta_data
        self.test = test

#    def get_data(self, target_name):
#        assert(isinstance(target_name, str), 'target_name must be string!')
#        values = self.raw_data.drop([target_name], axis=1)
#        target = self.raw_data[target_name]
#        return values, target

    def _numeric(self, values):
        """Return only the numeric-dtype columns of ``values``."""
        return values.select_dtypes(include=list(self._NUMERIC_DTYPES))

    def encoder(self, values=None, meta_data=None):
        """One-hot encode the columns that ``meta_data`` marks as CATEGORIC.

        The encoded frame also replaces ``self.values``.

        Args:
            values: feature frame; defaults to ``self.values``.
            meta_data: frame with ``TYPE`` and ``VARIABLE`` columns;
                defaults to ``self.meta_data``.

        Returns:
            DataFrame in which each categorical column present in ``values``
            is replaced by indicator columns (``pandas.get_dummies``).

        Raises:
            ValueError: if no metadata is available.
        """
        values = self.values if values is None else values
        meta_data = self.meta_data if meta_data is None else meta_data
        if meta_data is None:
            raise ValueError('meta_data is required to locate categorical variables')
        is_categoric = meta_data['TYPE'] == 'CATEGORIC'
        categorics = [name for name in meta_data.loc[is_categoric, 'VARIABLE']
                      if name in values.columns]
        if categorics:
            # A DataFrame result keeps the other reducers (which rely on
            # ``.columns`` / ``select_dtypes``) usable after encoding.
            values = pd.get_dummies(values, columns=categorics)
        self.values = values
        return values

    def default_ratio(self, values=None, meta_data=None, thresh=0.95, default=(0,)):
        """Keep columns whose share of non-default entries exceeds ``thresh``.

        Args:
            values: feature frame; defaults to ``self.values``.
            meta_data: optional frame with ``VARIABLE`` and ``DEFAULT_VALUES``
                columns; defaults to ``self.meta_data``. Columns without an
                entry fall back to ``default``.
            thresh: minimum fraction of non-default entries needed to keep a
                column (strictly greater than).
            default: default values used when the metadata has none.

        Returns:
            ``(reduced_frame, features_eliminated, features_chosen)``.
        """
        values = self.values if values is None else values
        meta_data = self.meta_data if meta_data is None else meta_data
        ratios = {}
        for column in values.columns:
            defaults = list(default)
            if meta_data is not None:
                listed = meta_data.loc[meta_data['VARIABLE'] == column, 'DEFAULT_VALUES'].tolist()
                if listed:
                    defaults = listed
            ratios[column] = 1 - values[column].isin(defaults).mean()
        features_chosen = pd.Index(
            [column for column in values.columns if ratios[column] > thresh])
        features_eliminated = self.choose_eliminated(values, features_chosen)
        return values[features_chosen], features_eliminated, features_chosen

    def normalize(self, values=None, thresh=0.1):
        """Z-score the numeric columns and drop rows dominated by outliers.

        A cell is an outlier when its absolute z-score exceeds 3. A row is
        dropped when its fraction of outlier cells is greater than ``thresh``.

        Args:
            values: feature frame; defaults to ``self.values``.
            thresh: maximum tolerated fraction of outlier cells per row.

        Returns:
            DataFrame of z-scores with the original index and the numeric
            column labels, without the dropped rows.
        """
        values = self.values if values is None else values
        numeric = self._numeric(values)
        if numeric.empty:
            return numeric.copy()
        values_norm = pd.DataFrame(np.asarray(stats.zscore(numeric)),
                                   index=numeric.index, columns=numeric.columns)
        outlier_share = (values_norm.abs() > 3).sum(axis=1) / values_norm.shape[1]
        return values_norm.loc[outlier_share <= thresh]

    def reduct_low_var(self, values=None, threshold=None):
        """Keep numeric columns whose variance is at least ``threshold``.

        Args:
            values: feature frame; defaults to ``self.values``.
            threshold: minimum variance (inclusive). Required.

        Returns:
            ``(reduced_frame, features_eliminated, features_chosen)``.

        Raises:
            ValueError: if ``threshold`` is not given.
        """
        if threshold is None:
            raise ValueError('threshold is required')
        values = self._numeric(self.values if values is None else values)
        variances = values.var(skipna=True)
        features_chosen = variances.index[variances >= threshold]
        features_eliminated = self.choose_eliminated(values, features_chosen)
        return values[features_chosen], features_eliminated, features_chosen

    def _reduct_by_dependence(self, values, target, n_features, measure):
        """Shared body of ``reduct_dcor`` and ``reduct_dcor_sqr``.

        Scores each numeric column against the target with ``measure``,
        keeps the ``n_features`` best, clusters their scores with k-means
        and picks one representative column per cluster.
        """
        values = self._numeric(self.values if values is None else values)
        target = self.target if target is None else target
        target_array = np.asarray(target, dtype=float)

        def column(position):
            return values.iloc[:, position].to_numpy(dtype=float)

        scores = np.array([measure(column(i), target_array)
                           for i in range(values.shape[1])], dtype=float)

        if n_features > len(scores):
            # Fewer columns than requested: keep everything.
            chosen_positions = list(range(len(scores)))
        elif n_features <= 0:
            chosen_positions = []
        else:
            # Column positions of the best-scoring features, best first.
            top = np.argsort(-scores, kind='stable')[:n_features]
            # NOTE(review): clustering n_features points into n_features
            # clusters usually yields singleton clusters, which makes this
            # a plain top-n selection -- confirm that is intended.
            labels = km(n_clusters=n_features).fit(scores[top].reshape(-1, 1)).labels_
            chosen_positions = []
            for label in range(n_features):
                members = top[labels == label]
                if len(members) == 0:
                    continue
                block = values.iloc[:, members].to_numpy(dtype=float)
                best_position, best_strength = members[0], -1.0
                for member in members:
                    strength = abs(measure(block, column(member)))
                    # ``>=`` lets a later member win ties, as the original
                    # comparison did.
                    if strength >= best_strength:
                        best_position, best_strength = member, strength
                chosen_positions.append(int(best_position))

        features_chosen = values.columns[np.asarray(chosen_positions, dtype=int)]
        features_eliminated = self.choose_eliminated(values, features_chosen)
        return values[features_chosen], features_eliminated, features_chosen

    def reduct_dcor(self, values=None, n_features=100, target=None):
        """Select features by distance correlation with the target.

        Author's note (translated from Turkish): the format of this
        reducer still needs thought; it simplifies things but may be too
        radical.

        Args:
            values: feature frame; defaults to ``self.values``.
            n_features: maximum number of features to keep.
            target: target vector; defaults to ``self.target``.

        Returns:
            ``(reduced_frame, features_eliminated, features_chosen)``.
        """
        return self._reduct_by_dependence(values, target, n_features,
                                          dcor.distance_correlation)

    def reduct_dcor_sqr(self, values=None, n_features=100, target=None):
        """Like ``reduct_dcor`` but scored with ``dcor.u_distance_correlation_sqr``.

        Args:
            values: feature frame; defaults to ``self.values``.
            n_features: maximum number of features to keep.
            target: target vector; defaults to ``self.target``.

        Returns:
            ``(reduced_frame, features_eliminated, features_chosen)``.
        """
        return self._reduct_by_dependence(values, target, n_features,
                                          dcor.u_distance_correlation_sqr)

    @staticmethod
    def choose_eliminated(values, features_chosen):
        """Return the column labels of ``values`` not in ``features_chosen``."""
        return values.drop(features_chosen, axis=1).columns

    def assign_zero(self, values=None, row_thresh=0.95):
        """Drop sparse rows, zero-fill the rest and drop all-zero columns.

        Args:
            values: feature frame; defaults to ``self.values``.
            row_thresh: a row is kept only if it has at least
                ``int(n_columns * row_thresh)`` non-missing entries.

        Returns:
            ``(reduced_frame, features_eliminated, features_chosen)``.
        """
        values = self._numeric(self.values if values is None else values)
        values_new = values.dropna(thresh=int(values.shape[1] * row_thresh)).fillna(0)
        # Test for "every entry is zero" rather than "sum is zero", which
        # would also drop columns whose values merely cancel out.
        all_zero = (values_new == 0).all(axis=0)
        values_new = values_new.loc[:, ~all_zero]
        features_eliminated = self.choose_eliminated(values, values_new.columns)
        return values_new, features_eliminated, values_new.columns

    def reduct_boostedtree(self, values=None, target=None, algtype=None, nb_features=100,
                           param_distributions=None):
        """Rank features with a tuned boosted-tree model and keep the best.

        Args:
            values: feature frame; defaults to ``self.values``.
            target: target vector; defaults to ``self.target``.
            algtype: an ``AlgType`` member, the name of one, or an
                estimator instance. Required.
            nb_features: maximum number of features to keep.
            param_distributions: search space for ``RandomizedSearchCV``;
                defaults to a small space over ``n_estimators``,
                ``max_depth`` and ``learning_rate``.

        Returns:
            ``(reduced_frame, features_eliminated, features_chosen, score,
            feature_weights)`` where ``score`` is the search's best
            cross-validated score and ``feature_weights`` is a Series of
            importances indexed by column label.

        Raises:
            ValueError: if ``algtype`` is not given.
        """
        values = self.values if values is None else values
        target = self.target if target is None else target
        if algtype is None:
            raise ValueError('algtype is required')
        if isinstance(algtype, str):
            algtype = AlgType[algtype]
        model = getattr(algtype, 'value', algtype)
        if param_distributions is None:
            # The original search space held SVM parameters (C, gamma,
            # kernel, class_weight), which these estimators do not accept.
            param_distributions = {
                'n_estimators': stats.randint(50, 500),
                'max_depth': stats.randint(3, 10),
                'learning_rate': stats.uniform(0.01, 0.29),
            }
        # The search clones ``model``, so the shared enum instance stays
        # unfitted; the refit best estimator provides the importances.
        search = rscv(model, param_distributions)
        search.fit(values, target)
        score = search.best_score_
        feature_weights = pd.Series(search.best_estimator_.feature_importances_,
                                    index=values.columns)
        ranked = feature_weights[feature_weights > 0].sort_values(ascending=False)
        features_chosen = ranked.index[:nb_features]
        features_eliminated = self.choose_eliminated(values, features_chosen)
        return values[features_chosen], features_eliminated, features_chosen, score, feature_weights

    #def l2_regularization(values, target, degree = 10, alpha = 10):
    #    model = Pipeline([('poly', PolynomialFeatures(degree=degree)), ('l2', Ridge(alpha=alpha))])
    #    model = model.fit(values, target)
    #    model.named_steps['l2'].coef_
    #    output_features = model.get_feature_names(input_features = values.columns)
    #    return output_features  # i have no idea what is going to pop up because of this code

    @staticmethod
    def calculate_psi(expected, actual, buckettype='bins', buckets=10, axis=0):
        '''Calculate the PSI (population stability index) across all variables.

        Args:
           expected: array of original values (1-D for a single variable,
              2-D for several)
           actual: array of new values, laid out like ``expected``
           buckettype: 'bins' for equal-width buckets over the expected
              range, 'quantiles' for quantile buckets of the expected values
           buckets: number of buckets
           axis: for 2-D input, 0 when each column is a variable, 1 when
              each row is a variable
        Returns:
           psi_values: a single PSI value for 1-D input, otherwise an
              ndarray with one PSI value per variable
        Raises:
           ValueError: for an unknown ``buckettype`` or ``axis``
        Author:
           Matthew Burke
           github.com/mwburke
           worksofchart.com
        '''
        if buckettype not in ('bins', 'quantiles'):
            raise ValueError("buckettype must be 'bins' or 'quantiles'")
        if axis not in (0, 1):
            raise ValueError('axis must be 0 or 1')
        expected = np.asarray(expected)
        actual = np.asarray(actual)

        def psi(expected_array, actual_array):
            '''Calculate the PSI for a single variable.'''
            if buckettype == 'bins':
                breakpoints = np.linspace(np.min(expected_array),
                                          np.max(expected_array), buckets + 1)
            else:
                breakpoints = np.percentile(expected_array,
                                            np.linspace(0, 100, buckets + 1))
            # Values of ``actual_array`` outside the expected range fall
            # outside every bucket and are not counted by np.histogram.
            expected_percents = np.histogram(expected_array, breakpoints)[0] / len(expected_array)
            actual_percents = np.histogram(actual_array, breakpoints)[0] / len(actual_array)
            # Substitute a small share for empty buckets so the log is defined.
            expected_percents = np.where(expected_percents == 0, 0.0001, expected_percents)
            actual_percents = np.where(actual_percents == 0, 0.0001, actual_percents)
            return np.sum((expected_percents - actual_percents)
                          * np.log(expected_percents / actual_percents))

        if expected.ndim == 1:
            return psi(expected, actual)

        # One value per variable: columns when axis == 0, rows when axis == 1.
        n_variables = expected.shape[1 - axis]
        psi_values = np.empty(n_variables)
        for i in range(n_variables):
            if axis == 0:
                psi_values[i] = psi(expected[:, i], actual[:, i])
            else:
                psi_values[i] = psi(expected[i, :], actual[i, :])
        return psi_values

    @staticmethod
    def calculate_all_psi(expected, acual, buckets=10, ax=0):
        """Compute the PSI of every column of ``expected`` against ``acual``.

        Args:
            expected: 2-D table of original values, one variable per column.
            acual: 2-D table of new values with the same column layout
                (parameter name kept as-is for existing callers).
            buckets: number of equal-width buckets per variable.
            ax: forwarded as ``axis`` to ``calculate_psi``; it has no effect
                because each column is passed as a 1-D array.

        Returns:
            DataFrame with columns ``VARIABLE_INDEX`` (column position) and
            ``PSI_SCORE``.
        """
        inputs_expected = np.asarray(expected)
        inputs_actual = np.asarray(acual)
        rows = [
            [i, DimensionalityReduction.calculate_psi(
                inputs_expected[:, i], inputs_actual[:, i],
                buckettype='bins', buckets=buckets, axis=ax)]
            for i in range(inputs_expected.shape[1])
        ]
        return pd.DataFrame(rows, columns=['VARIABLE_INDEX', 'PSI_SCORE'])

    def reduct_psi(self, values=None, test=None, thresh=0.8):
        """Keep the columns whose PSI against ``test`` is at most ``thresh``.

        Args:
            values: feature frame; defaults to ``self.values``.
            test: comparison population; defaults to ``self.test``. A
                DataFrame is aligned to the columns of ``values`` by label.
            thresh: maximum PSI (inclusive) for a column to be kept.

        Returns:
            ``(reduced_frame, features_eliminated, features_chosen)``.

        Raises:
            ValueError: if no test data is available.
        """
        values = self.values if values is None else values
        test = self.test if test is None else test
        if test is None:
            raise ValueError('test data is required!')
        if isinstance(test, pd.DataFrame):
            test = test[values.columns]
        psi_all = self.calculate_all_psi(values, test)
        #thresh = max(np.percentile(psi_all['PSI_SCORE'], 10), ((max(psi_all['PSI_SCORE'])-min(psi_all['PSI_SCORE']))/thresh)+min(psi_all['PSI_SCORE']))
        stable = psi_all.loc[psi_all['PSI_SCORE'] <= thresh, 'VARIABLE_INDEX']
        # VARIABLE_INDEX holds column positions; translate them to labels.
        features_chosen = values.columns[stable.to_numpy(dtype=int)]
        features_eliminated = self.choose_eliminated(values, features_chosen)
        return values[features_chosen], features_eliminated, features_chosen

 #   def rmsle(y, y_pred):
 #       assert len(y) == len(y_pred)
 #       terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
 #       return ((sum(terms_to_sum) * (1.0/len(y))) ** 0.5)

    def reduct_sparse_rp(self, values=None, test=None, eps=0.1):
        """Project ``values`` with a sparse random projection.

        Author's note (translated from Turkish): the columns are lost,
        naturally -- the projected components no longer correspond to the
        input columns.

        Args:
            values: feature frame; defaults to ``self.values``.
            test: test data; defaults to ``self.test``. It must be present
                but is not used by the projection itself.
            eps: ``eps`` parameter of ``SparseRandomProjection``.

        Returns:
            The projected data as returned by ``fit_transform``.

        Raises:
            ValueError: if no test data is available (the original returned
                the message string in place of data).
        """
        values = self.values if values is None else values
        test = self.test if test is None else test
        if test is None:
            raise ValueError('test data is required!')
        # NOTE(review): ``test`` is only checked for presence -- confirm
        # whether it should be transformed with the same projection.
        transformer = SparseRandomProjection(eps=eps)
        return transformer.fit_transform(values)

    def solid(self, values=None, target=None, test=None, threshold=0, model_type=None):
        """Combine the importances of the three boosted-tree models.

        Each model's importances are divided by its search score and summed;
        columns whose combined weight exceeds ``threshold`` are kept.

        Args:
            values: feature frame; defaults to ``self.values``.
            target: target vector; defaults to ``self.target``.
            test: unused; kept for interface compatibility.
            threshold: minimum combined weight (exclusive).
            model_type: 1 for the regressors, 2 for the classifiers.

        Returns:
            ``(reduced_frame, features_eliminated, features_chosen)``.

        Raises:
            ValueError: if ``model_type`` is neither 1 nor 2.
        """
        families = {
            1: ('xgboost_regressor', 'lightgbm_regressor', 'catboost_regressor'),
            2: ('xgboost_classifier', 'lightgbm_classifier', 'catboost_classifier'),
        }
        if model_type not in families:
            raise ValueError('model_type must be 1 (regression) or 2 (classification)')
        values = self.values if values is None else values
        target = self.target if target is None else target

        combined = pd.Series(0.0, index=values.columns)
        for algorithm in families[model_type]:
            result = self.reduct_boostedtree(values=values, target=target,
                                             algtype=algorithm, nb_features=100)
            score, weights = result[-2], result[-1]
            # NOTE(review): dividing by the score weights better-scoring
            # models less, and misbehaves for scores <= 0 -- confirm.
            combined = combined + np.asarray(weights) / score

        features_chosen = combined.index[combined > threshold]
        features_eliminated = self.choose_eliminated(values, features_chosen)
        return values[features_chosen], features_eliminated, features_chosen
        

In [0]:
# Colab-only setup: install google-drive-ocamlfuse, authenticate the user and
# create the gspread client `gc` used by the export cells further down.
# Install a Drive FUSE wrapper.
# https://github.com/astrada/google-drive-ocamlfuse
!apt-get update -qq 2>&1 > /dev/null
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
!pip install --upgrade -q gspread
import gspread
# Generate auth tokens for Colab
from google.colab import auth
auth.authenticate_user()

# Generate creds for the Drive FUSE library.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
gc = gspread.authorize(creds)
import getpass
# Work around misordering of STREAM and STDIN in Jupyter.
# https://github.com/jupyter/notebook/issues/3159
# The first command's output is filtered for the line containing the auth URL;
# the code the user types is then piped into a second headless invocation.
# NOTE(review): the client secret and the verification code are interpolated
# into shell command lines -- confirm this is acceptable for this environment.
prompt = !google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass(prompt[0] + '\n\nEnter verification code: ')
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
# Create a directory and mount Google Drive using that directory.
!mkdir -p drive
!google-drive-ocamlfuse drive

# Load the two CSV files from the mounted Drive folder into DataFrames.
test = pd.read_csv('drive/ColabNotebooks/test.csv')
train = pd.read_csv('drive/ColabNotebooks/train.csv')

# Create a file in Drive.
!echo "This newly created file will appear in your Drive file list." > drive/created.txt

In [0]:
# Number of negative entries in the first column of `res`. `.count()` on the
# boolean mask would count every non-missing row; `.sum()` counts the True ones.
print((res.iloc[:,0]<0).sum())

In [0]:
# Experiment: apply the low-variance, XGBoost and squared-dcor reducers one
# after another to the training split, then fit an XGBRegressor on what is
# left and report its RMSLE on the held-out split.
# NOTE(review): get_data, assign_zero, reduct_low_var, reduct_xgbregressor,
# reduct_dcor_sqr and rmsle are called as module-level functions, but this
# file defines them only as methods or commented-out code (or not at all);
# presumably they come from an earlier kernel session -- TODO confirm.
values, target = get_data(train)
values = assign_zero(values)
#col = values.columns
#values = reduct_sparse_rp(values, 0.3)
#values = pd.DataFrame(values)
#values.columns = col[:values.shape[1]]
values_train, values_test, target_train, target_test = tts(values, target, test_size=0.2)

a = reduct_low_var(values_train, 0.001)
values_train = values_train.drop(a, axis=1)
print("reductedby_low_var completed: ", a)
#b = reduct_dcor(values_train, target_train)
#reductedby_dcor = values_train.drop(b ,axis=1)
#print("reductedby_dcor completed: ", b)
#c = reduct_dcor_sqr(values_train, target_train)
#reductedby_dcor_sqr = values_train.drop(c, axis=1)
#print("reductedby_dcor_sqr completed: ", c)
#d = reduct_psi(values_train, test[values_train.columns])
#reductedby_psi = values_train.drop(d, axis=1)
#print("reductedby_psi completed: ", d)
e = reduct_xgbregressor(values_train, target_train)
values_train = values_train.drop(e, axis=1)
print("reductedby_xgbregressor completed: ", e)
#b = reduct_dcor(values_train, target_train)
#reductedby_dcor = values_train.drop(b ,axis=1)
#print("reductedby_dcor completed: ", b)
c = reduct_dcor_sqr(values_train, target_train)
values_train = values_train.drop(c, axis=1)
print("reductedby_dcor_sqr completed: ", c)
#f = choose_eliminated(values_train, l2_regularization(values_train, target_train))
#reductedby_l2 = values_train.drop(f, axis=1)
#print("reductedby_l2 completed: ", f)

#a = 0
# `res` starts as an empty frame with the single column 'values_train'.
res = pd.DataFrame(columns = ['values_train'])

#for i in [reductedby_low_var, reductedby_dcor, reductedby_dcor_sqr, reductedby_psi, reductedby_xgbregressor]:
#    print("exporting: ", res.columns[i])
#    file_name = str(res.columns[a] + 'sparse')
#    sh = gc.create(file_name)
#    worksheet = gc.open(file_name).sheet1
#    cell_list = worksheet.range(1, 1, reducedby_low_var.shape[1], 1)
#    a = 0
#    for cell in cell_list:
#        cell.value = i[a]
#        a+=1
#    worksheet.update_cells(cell_list)

rmsl = []    
a=0    
for i in [values_train]:
    model = XGBRegressor()
    model.fit(i, target_train)
    # NOTE(review): `res` has no rows at this point; verify that assigning
    # through `.iloc[:, a]` actually stores the predictions.
    res.iloc[:,a] = np.array(model.predict(values_test[i.columns]))
    rmsl.append(rmsle(target_test.tolist(), res.iloc[:,a].tolist()))
    a += 1
    
print(rmsl)

In [0]:
# Experiment: build the low-variance, PSI and XGBoost reductions of the
# training split, export each reduction's column names to its own Google
# Sheet, then score an XGBRegressor per reduction with RMSLE.
# NOTE(review): get_data, assign_zero, normalize, reduct_*, and rmsle are used
# as module-level functions that this file does not define -- TODO confirm
# they exist in the kernel. If assign_zero/normalize drop rows, target_train
# must be re-aligned to the surviving rows before fitting.
values, target = get_data(train)
values_train, values_test, target_train, target_test = tts(values, target, test_size=0.2)
values_train = assign_zero(values_train)
values_train = normalize(values_train, 0.2)
values_test = values_test[values_train.columns]

a = reduct_low_var(values_train, 0.001)
reductedby_low_var = values_train.drop(a, axis=1)
print("reductedby_low_var completed: ", a)
#b = reduct_dcor(values_train, target_train)
#reductedby_dcor = values_train.drop(b ,axis=1)
#print("reductedby_dcor completed: ", b)
#c = reduct_dcor_sqr(values_train, target_train)
#reductedby_dcor_sqr = values_train.drop(c, axis=1)
#print("reductedby_dcor_sqr completed: ", c)
d = reduct_psi(values_train, test[values_train.columns])
reductedby_psi = values_train.drop(d, axis=1)
print("reductedby_psi completed: ", d)
e = reduct_xgbregressor(values_train, target_train)
reductedby_xgbregressor = values_train.drop(e, axis=1)
print("reductedby_xgbregressor completed: ", e)
#f = choose_eliminated(values_train, l2_regularization(values_train, target_train))
#reductedby_l2 = values_train.drop(f, axis=1)
#print("reductedby_l2 completed: ", f)

# Names and frames are kept in lockstep. The dcor-based reductions are
# disabled above, so they are left out here as real comments: the former
# inline ''' ... ''' markers were string literals that merged into the next
# column name or broke the syntax.
reduction_names = ['reductedby_low_var', 'reductedby_psi', 'reductedby_xgbregressor']
reductions = [reductedby_low_var, reductedby_psi, reductedby_xgbregressor]
res = pd.DataFrame(columns=reduction_names)

for name, frame in zip(reduction_names, reductions):
    print("exporting: ", name)
    sh = gc.create(name)
    worksheet = gc.open(name).sheet1
    # One cell per remaining column, written down the first sheet column.
    cell_list = worksheet.range(1, 1, frame.shape[1], 1)
    for cell, column_name in zip(cell_list, frame.columns):
        cell.value = column_name
    worksheet.update_cells(cell_list)

# Scores go into `rmsl` so the `rmsle` function is not overwritten by a list.
rmsl = []
for name, frame in zip(reduction_names, reductions):
    print("modeling: ", name)
    model = XGBRegressor()
    model.fit(frame, target_train)
    res[name] = model.predict(values_test[frame.columns])
    rmsl.append(rmsle(target_test, res[name]))

print(rmsl)

In [0]:
# Experiment: build all five reductions of the training split, export each
# reduction's column names to its own Google Sheet, and export a summary
# table `fin` (rows: original columns, columns: reducers) holding 0 where a
# column is present in the reduced frame and 1 otherwise.
# NOTE(review): get_data, assign_zero, normalize and the reduct_* functions
# are used as module-level functions that this file does not define --
# TODO confirm they exist in the kernel.
values, target = get_data(train)
values_train, values_test, target_train, target_test = tts(values, target, test_size=0.2)
values_train = assign_zero(values_train)
values_train = normalize(values_train, 0.2)
values_test = values_test[values_train.columns]

a = reduct_low_var(values_train, 0.001)
reductedby_low_var = values_train.drop(a, axis=1)
print("reductedby_low_var completed: ", a)
b = reduct_dcor(values_train, target_train)
reductedby_dcor = values_train.drop(b ,axis=1)
print("reductedby_dcor completed: ", b)
c = reduct_dcor_sqr(values_train, target_train)
reductedby_dcor_sqr = values_train.drop(c, axis=1)
print("reductedby_dcor_sqr completed: ", c)
d = reduct_psi(values_train, test[values_train.columns])
reductedby_psi = values_train.drop(d, axis=1)
print("reductedby_psi completed: ", d)
e = reduct_xgbregressor(values_train, target_train)
reductedby_xgbregressor = values_train.drop(e, axis=1)
print("reductedby_xgbregressor completed: ", e)
#f = choose_eliminated(values_train, l2_regularization(values_train, target_train))
#reductedby_l2 = values_train.drop(f, axis=1)
#print("reductedby_l2 completed: ", f)

# Names and frames in lockstep; the names double as sheet titles.
lis = ['reductedby_low_var', 'reductedby_dcor', 'reductedby_dcor_sqr', 'reductedby_psi', 'reductedby_xgbregressor']
lll = [reductedby_low_var, reductedby_dcor, reductedby_dcor_sqr, reductedby_psi, reductedby_xgbregressor]

for name, frame in zip(lis, lll):
    print("exporting: ", name)
    sh = gc.create(name)
    worksheet = gc.open(name).sheet1
    cell_list = worksheet.range(1, 1, frame.shape[1], 1)
    for cell, column_name in zip(cell_list, frame.columns):
        cell.value = column_name
    worksheet.update_cells(cell_list)

fin = pd.DataFrame(np.ones([len(values.columns),len(lll)]))
fin.index = values.columns
fin.columns = lis

# Label-based assignment: the frames carry column labels, not positions.
for name, frame in zip(lis, lll):
    fin.loc[frame.columns, name] = 0

sh = gc.create('fin')
worksheet = gc.open('fin').sheet1
for z in range(len(lll)):
    # One sheet column per reducer, one sheet row per original feature.
    cell_list = worksheet.range(1, z+1, fin.shape[0], z+1)
    for row, cell in enumerate(cell_list):
        cell.value = fin.iloc[row,z]
    # Push each sheet column once, after all of its cells are filled.
    worksheet.update_cells(cell_list)

In [0]:
import numpy as np
import matplotlib.pyplot as plt
# The cell below instantiates XGBRegressor; xgboost exports no name `XGB`.
from xgboost import XGBRegressor
from sklearn.svm import SVC
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw training and cross-validation score curves for an estimator.

    The scores come from ``sklearn.model_selection.learning_curve``; each
    curve is drawn as the mean over the CV splits with a shaded band of one
    standard deviation on either side.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``learning_curve``.

    title : string
        Title for the chart.

    X : array-like
        Training data handed to ``learning_curve``.

    y : array-like
        Target handed to ``learning_curve``.

    ylim : tuple (ymin, ymax), optional
        Limits of the y axis; left to matplotlib when None.

    cv : optional
        Cross-validation strategy, forwarded unchanged to ``learning_curve``.

    n_jobs : integer, optional
        Number of parallel jobs, forwarded to ``learning_curve`` (default 1).

    train_sizes : array-like, optional
        Training-set sizes to evaluate, forwarded to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the figure drawn on it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Aggregate over the CV splits (axis 1) for each training size.
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    cv_mean, cv_std = np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)
    plt.grid()

    curves = (
        (fit_mean, fit_std, "r", "Training score"),
        (cv_mean, cv_std, "g", "Cross-validation score"),
    )
    # Shaded bands first, then the lines, in the same order for both curves.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt


# Plot learning curves for an XGBRegressor and for an SVC on the current
# training split.
title = "Learning Curves"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

estimator = XGBRegressor()
plot_learning_curve(estimator, title, values_train, target_train, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

# Raw string: "\g" is not a valid escape sequence in a normal string literal.
# The resulting text is unchanged.
title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
# SVC is more expensive so we do a lower number of CV iterations:
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
# NOTE(review): SVC is a classifier while the first curve uses a regressor on
# the same target -- confirm which kind of target this is.
estimator = SVC(gamma=0.001)
plot_learning_curve(estimator, title, values_train, target_train, (0.7, 1.01), cv=cv, n_jobs=4)

plt.show()

In [0]:
# Export the column names of three reductions to one Google Sheet each, then
# export the summary table `fin` (0 where a column is present in the reduced
# frame, 1 otherwise) to a sheet named 'fin'.
lis =  ['reductedby_low_var', 'reductedby_dcor', 'reductedby_dcor_sqr']
lll = [reductedby_low_var, reductedby_dcor, reductedby_dcor_sqr]

for name, frame in zip(lis, lll):
    sh = gc.create(name)
    worksheet = gc.open(name).sheet1
    cell_list = worksheet.range(1, 1, frame.shape[1], 1)
    for cell, column_name in zip(cell_list, frame.columns):
        cell.value = column_name
    worksheet.update_cells(cell_list)

fin = pd.DataFrame(np.ones([len(values.columns),len(lll)]))
fin.index = values.columns
fin.columns = lis

for name, frame in zip(lis, lll):
    fin.loc[frame.columns, [name]] = 0

sh = gc.create('fin')
worksheet = gc.open('fin').sheet1
for z in range(len(lll)):
    cell_list = worksheet.range(1, z+1, fin.shape[0], z+1)
    for row, cell in enumerate(cell_list):
        cell.value = fin.iloc[row,z]
    # Send each sheet column once it is fully populated; calling this inside
    # the cell loop re-sent the whole range once per cell.
    worksheet.update_cells(cell_list)

In [0]:
# Write the summary table `fin` to 'fin.csv' and hand it to the Colab
# download helper.
from google.colab import files

fin.to_csv('fin.csv')
files.download('fin.csv')


In [0]:
# Rebuild the summary table `fin`: the low-variance, PSI and XGBoost columns
# are recomputed here, while the two dcor columns are copied from `reds`.
# NOTE(review): `reds` is only assigned in a later cell (read from an uploaded
# CSV), so that cell must have been run first. get_data, assign_zero,
# normalize and the reduct_* functions are not defined at module level in
# this file -- TODO confirm they exist in the kernel.
from google.colab import files

values, target = get_data(train)
values_train, values_test, target_train, target_test = tts(values, target, test_size=0.2)
values_train = assign_zero(values_train)
values_train = normalize(values_train, 0.2)
values_test = values_test[values_train.columns]

a = reduct_low_var(values_train, 0.001)
reductedby_low_var = values_train.drop(a, axis=1)
# The next four lines are a bare string literal used to disable the dcor
# reducers; it is evaluated and discarded.
'''b = reduct_dcor(values_train, target_train)
reductedby_dcor = values_train.drop(b ,axis=1)
c = reduct_dcor_sqr(values_train, target_train)
reductedby_dcor_sqr = values_train.drop(c, axis=1)'''
d = reduct_psi(values_train, test[values_train.columns], thresh=0.25)
reductedby_psi = values_train.drop(d, axis=1)
e = reduct_xgbregressor(values_train, target_train)
reductedby_xgbregressor = values_train.drop(e, axis=1)

#f = choose_eliminated(values_train, l2_regularization(values_train, target_train))
#reductedby_l2 = values_train.drop(f, axis=1)
#print("reductedby_l2 completed: ", f)

# `lis` names all five summary columns; `lll` holds only the three frames
# computed in this cell, matching the first three names.
lis = ['reductedby_low_var', 'reductedby_psi', 'reductedby_xgbregressor', 'reductedby_dcor', 'reductedby_dcor_sqr']
lll = [reductedby_low_var, reductedby_psi, reductedby_xgbregressor]

fin = pd.DataFrame(np.ones([len(values.columns),5]))
fin.index = values.columns
fin.columns = lis
# NOTE(review): copied positionally -- assumes `reds` has one row per column
# of `values`, in the same order; verify against the uploaded file.
fin['reductedby_dcor'] = np.array(reds['reductedby_dcor'])
fin['reductedby_dcor_sqr'] = np.array(reds['reductedby_dcor_sqr'])

a = 0
for l in lll:
    # Mark with 0 every original column that is present in the reduced frame.
    fin.loc[l.columns, [lis[a]]] = 0
    a+=1
    
fin.to_csv('fin.csv')
files.download('fin.csv')

In [0]:
# Open the Colab upload dialog; the result is stored in `uploaded` and is
# indexed by file name in the next cell.
from google.colab import files

uploaded = files.upload()

In [0]:
# Parse the uploaded 'dimensionality reduction.csv' (decoded as UTF-8) into
# the DataFrame `reds`.
import pandas as pd
import io

reds = pd.read_csv(io.StringIO(uploaded['dimensionality reduction.csv'].decode('utf-8')))
#reds.index = reds.iloc[:,0]

#a = [i for i in reds.index if reds.loc[i,'reductedby_low_var']==0]


In [0]:
# Inspect a single training column by its label.
print(values_train['cde9c35e8'])

In [0]:
# Fit an XGBRegressor on the columns listed in `a` and print its RMSLE on the
# held-out split.
# NOTE(review): `a` is reassigned in many cells; this cell presumably expects
# the list of column labels built in the cell that filters `fin` -- confirm.
# `rmsle` is not defined at module level in this file (only commented out).
#from xgboost import XGBRegressor

#values, target = get_data(train)
#values_train, values_test, target_train, target_test = tts(values, target, test_size=0.2)
#values_train = assign_zero(values_train)
#values_train = normalize(values_train, 0.2)
#values_test = values_test[values_train.columns]

model = XGBRegressor()

model.fit(values_train[a], target_train)
res = model.predict(values_test[a])
rmsle_ = rmsle(target_test.tolist(), res)

print(rmsle_)

In [0]:
# Collect and print the index labels of `fin` whose 'reductedby_dcor' entry
# is 0.
a = [i for i in fin.index if fin.loc[i,'reductedby_dcor']==0]
print(a)

In [0]:
# Fit one XGBRegressor per feature subset and collect the RMSLE of each on
# the held-out split.
rmsl = []    
# NOTE(review): the column names below are in a different order than the
# frames in the loop (psi/xgbregressor vs dcor/dcor_sqr), so predictions are
# stored under mismatched names -- confirm and align.
res = pd.DataFrame(columns = ['reductedby_low_var',  'reductedby_psi', 'reductedby_xgbregressor', 'reductedby_dcor', 'reductedby_dcor_sqr', 'values_train', 'aaa'])
a=0
# Rebuild the two dcor subsets from the 0 entries of `fin`.
reductedby_dcor = values_train[[i for i in fin.index if fin.loc[i,'reductedby_dcor']==0]]
reductedby_dcor_sqr = values_train[[i for i in fin.index if fin.loc[i,'reductedby_dcor_sqr']==0]]
# NOTE(review): this new split replaces values_test, target_train and
# target_test, while the reduced frames above come from an earlier split;
# their rows may no longer correspond to target_train -- verify.
aaa, values_test, target_train, target_test = tts(values, target, test_size=0.2)
for i in [reductedby_low_var, reductedby_dcor, reductedby_dcor_sqr, reductedby_psi, reductedby_xgbregressor, values_train, aaa]:   
    model = XGBRegressor()
    model.fit(i, target_train)
    # NOTE(review): `res` has no rows yet; verify that assigning through
    # `.iloc[:, a]` actually stores the predictions.
    res.iloc[:,a] = model.predict(values_test[i.columns])
    rmsl.append(rmsle(target_test.tolist(), res.iloc[:,a].tolist()))
    a += 1
    
print(rmsl)

In [0]:
# Grid search over a correlation percentile cut-off (index j) and a number of
# k-means clusters (4 * 2**i): features are scored against the target with
# dcor.u_distance_correlation_sqr, one feature is picked per cluster, and an
# XGBRegressor on the picked features is scored with RMSLE. The best score and
# its (i, j) are tracked in rmsl / n_opt_i / n_opt_j.
# NOTE(review): get_data, choose_eliminated and rmsle are not defined at
# module level in this file -- TODO confirm they exist in the kernel.
rmsl = 5
n_opt_j = -1
n_opt_i = -1

values, target = get_data(train)
values_train, values_test, target_train, target_test = tts(values, target, test_size=0.2)
values_train = values_train.fillna(0)

# corr[k] = [column position k, score of column k against the target].
corr = np.zeros([len(values_train.columns),2])
    
for i in range(len(values_train.columns)):
    corr[i,:] = np.array([i, dcor.u_distance_correlation_sqr(values_train.iloc[:,i], target_train)])
    
for j in range(5):
    # Percentiles 10, 14, 18, 22, 26 of the scores (NaNs ignored).
    thresh = np.nanpercentile(corr[:,1], 10+(4*j))
    corr_new = np.array([i for i in corr if i[1] >= thresh])   
    for i in range(7):
        n_features = 4*(2**i)
        if n_features > len(corr_new):
            # NOTE(review): corr_new[:,0] is a float array; indexing an Index
            # with floats is expected to fail -- likely needs .astype(int).
            features_chosen = values_train.columns[corr_new[:,0]]
            features_eliminated = choose_eliminated(values_train, features_chosen)
            re = values_train.drop(features_eliminated, axis = 1)
           
        else:     
            kmeans = km(n_clusters = n_features).fit(corr_new[:,1].reshape(-1,1))
            labels = kmeans.labels_

            # x[c] lists the row positions within corr_new that fell in cluster c.
            x=[]
            for l in range(n_features):
                x.append([m for m,d in enumerate(labels) if d==l])
            # -1 marks a cluster for which no feature has been chosen.
            features_chosen = np.zeros(n_features)
            features_chosen = features_chosen-1
            maxx = np.zeros(n_features)
            for l in range(len(labels)):
                # NOTE(review): x[...] holds row positions of corr_new but is
                # used here as column positions of values_train; the two only
                # coincide when no rows were filtered out -- verify.
                a = abs(dcor.u_distance_correlation_sqr(values_train.iloc[:,x[labels[l]]], values_train.iloc[:,int(corr_new[l,0])]))
                if a >= maxx[labels[l]]:
                    maxx[labels[l]] = a
                    features_chosen[labels[l]] = corr_new[l,0]
            features_chosen = np.array([int(z) for z in features_chosen if z>=0])
            features_chosen = values.columns[features_chosen]
            features_eliminated = choose_eliminated(values_train, features_chosen)   
            re = values_train.drop(features_eliminated, axis = 1)
            
        model = XGBRegressor()
        model.fit(re, target_train)
        pred = model.predict(values_test[re.columns])
        score = rmsle(target_test.tolist(), pred.tolist())
        # Lower is better: remember the smallest score seen so far.
        if rmsl > score:
            rmsl = score
            n_opt_i = i
            n_opt_j = j
        print(rmsl, n_opt_i, n_opt_j)    

In [0]:
# Import scikit-learn's explained-variance metric under the short alias `evs`.
from sklearn.metrics import explained_variance_score as evs
# NOTE(review): called without arguments; the metric presumably needs the true
# and predicted values, so this line looks like an unfinished scratch call - confirm.
evs()

In [0]:
from sklearn.feature_selection import RFECV as rfe
from xgboost import XGBRegressor
from sklearn.model_selection import ShuffleSplit

# Recursive feature elimination with cross-validation around an XGBoost
# regressor, using 10 random 80/20 splits with a fixed seed.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
# scikit-learn scorer names follow "greater is better", so the squared-log-error
# scorer is registered as 'neg_mean_squared_log_error'; the un-prefixed name is
# not a valid scoring string.
selector = rfe(XGBRegressor(), cv=cv, scoring='neg_mean_squared_log_error')
# NOTE(review): values_train is replaced by the transformer's output here.
values_train = selector.fit_transform(values_train, target_train)

In [0]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split as tts

# Split the training data into features/target and hold out 20% of the rows.
values, target = get_data(train)
values_train, values_test, target_train, target_test = tts(values, target, test_size=0.2)
#values_train = values_train.fillna(0)

#corr = np.zeros([len(values_train.columns),2])
    
#for i in range(len(values_train.columns)):
#    corr[i,:] = np.array([i, dcor.u_distance_correlation_sqr(values_train.iloc[:,i], target_train)])
   

In [0]:
# Find where the second column of corr attains its maximum.
a = np.where(corr[:,1] == max(corr[:,1]))
# NOTE(review): `KMeans` is not imported under that name in the visible part of
# the file (the alias used elsewhere is `km`), and a data column is passed as the
# only constructor argument - this line presumably fails; confirm intent.
kmeans = KMeans(values_train['target'])
# NOTE(review): `a` is the tuple returned by np.where; indexing values_train with
# it directly looks unintended - verify.
values_train[a]

In [0]:
# Keep the features whose score clears both the 10th percentile and 10% of the
# observed score range. nanmin/nanmax match the NaN-tolerant nanpercentile;
# the builtin min/max give order-dependent results when a NaN is present.
scores = corr[:, 1]
score_floor = np.nanmin(scores)
thresh = max(np.nanpercentile(scores, 10), (np.nanmax(scores) - score_floor) * 0.1 + score_floor)
# Boolean mask keeps the result two-dimensional even when no row passes.
corr_new = corr[scores >= thresh]
# corr stores column positions as floats; iloc requires integer positions.
values_train_new = values_train.iloc[:, corr_new[:, 0].astype(int)]

In [0]:

# Number of score clusters; one representative feature is kept per cluster.
n_clusters = 8

# Same cut-off rule as the preceding cell: 10th percentile or 10% of the score
# range, whichever is larger (NaN-tolerant throughout).
scores = corr[:, 1]
score_floor = np.nanmin(scores)
thresh = max(np.nanpercentile(scores, 10), (np.nanmax(scores) - score_floor) * 0.1 + score_floor)
corr_new = corr[scores >= thresh]
# Column positions (in values_train) of the features that survived the cut.
kept_positions = corr_new[:, 0].astype(int)

# Cluster the surviving features by their score.
kmeans = km(n_clusters=n_clusters).fit(corr_new[:, 1].reshape(-1, 1))
labels = kmeans.labels_

# x[c] lists the rows of corr_new assigned to cluster c.
x = []
for l in range(n_clusters):
    x.append([i for i, d in enumerate(labels) if d == l])

# -1 marks a cluster without a chosen representative.
features_chosen = np.full(n_clusters, -1.0)
maxx = np.zeros(n_clusters)
for l in range(len(labels)):
    # labels index rows of corr_new (not corr), and those rows must be mapped
    # back to column positions before slicing. values_train is used because
    # corr was computed on it.
    cluster_columns = kept_positions[x[labels[l]]]
    a = abs(dcor.u_distance_correlation_sqr(values_train.iloc[:, cluster_columns], values_train.iloc[:, kept_positions[l]]))
    if a >= maxx[labels[l]]:
        maxx[labels[l]] = a
        features_chosen[labels[l]] = kept_positions[l]

features_chosen = np.array([int(i) for i in features_chosen if i >= 0], dtype=int)
features_chosen = values_train.columns[features_chosen]
features_eliminated = choose_eliminated(values_train, features_chosen)
re = values_train.drop(features_eliminated, axis=1)


In [0]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the reduced feature frame `re` with polynomial terms up to degree 5.
re_new = PolynomialFeatures(degree=5).fit_transform(re)

In [0]:
# One row per polynomial feature: [column position, squared distance
# correlation between that column and the training target].
n_poly_features = re_new.shape[1]
corr_re_new = np.zeros((n_poly_features, 2))

for col_idx in range(n_poly_features):
    corr_re_new[col_idx, 0] = col_idx
    corr_re_new[col_idx, 1] = dcor.u_distance_correlation_sqr(re_new[:, col_idx], target_train)

In [0]:
# Cut-off for the polynomial features: 3rd percentile or 3% of the score range,
# whichever is larger. nanmin/nanmax match the NaN-tolerant nanpercentile;
# the builtin min/max are order-dependent when a score is NaN.
poly_scores = corr_re_new[:, 1]
poly_floor = np.nanmin(poly_scores)
thresh = max(np.nanpercentile(poly_scores, 3), (np.nanmax(poly_scores) - poly_floor) * 0.03 + poly_floor)
# Boolean mask keeps the result two-dimensional even when no row passes.
corr_re_new_new = corr_re_new[poly_scores >= thresh]
corr_re_new_new.shape

In [0]:
import math

# Columns that survived the score threshold, resolved to names once so that
# train and test are subset identically. Selecting them from values_train by
# name avoids applying the positional index a second time to a frame that was
# already subset.
selected_columns = values_train.columns[corr_new[:, 0].astype(int)]

model = XGBRegressor()
# Train on the natural log of the target.
target_train_new = target_train.apply(math.log)
model.fit(values_train[selected_columns], target_train_new)
# predict() is sliced on its input columns, not on its output.
pred = model.predict(values_test[selected_columns])
# Undo the log transform before scoring.
pred = [math.exp(log_pred) for log_pred in pred]
score = rmsle(target_test.tolist(), pred)
print(score)

In [0]:
# Predict on the `test` frame, restricted to the columns of the reduced frame `re`.
pred = model.predict(test[re.columns])

In [0]:
# Raise e to each prediction, replacing pred with the results.
pred = [math.pow(math.e, log_pred) for log_pred in pred]

In [0]:
# Single-column frame of predictions, indexed by the ID column of `test`.
submission = pd.DataFrame({'target': np.array(pred)})
submission.index = test['ID']

In [0]:
from google.colab import files

# Write the submission to disk, then hand the file to the Colab download helper.
output_name = 'submission.csv'
submission.to_csv(output_name)
files.download(output_name)

In [0]:
# Show the current value of `score`.
print(score)

In [0]:
# Imports are stated here so the cell does not depend on another cell having
# imported `math` and `tts`; train_test_split lives in sklearn.model_selection.
import math
from sklearn.model_selection import train_test_split as tts
from catboost import CatBoostRegressor as cbr

# Number of random 80/20 splits the hold-out score is averaged over.
n_repeats = 50

# Earlier LightGBM, XGBoost and ElasticNet variants of this loop, and a 50/50
# blend of the first two, were commented out here and have been removed.
score = 0
for t in range(n_repeats):
    values, target = get_data(train)
    values = values[bins]
    values_train, values_test, target_train, target_test = tts(values, target, test_size=0.2)
    values_train = assign_zero(values_train)

    model = cbr()
    # Train on log1p(target); predictions are mapped back with expm1.
    ress = target_train.apply(math.log1p)
    model.fit(values_train, ress)
    pred4 = model.predict(values_test[values_train.columns])
    pred4 = np.expm1(pred4).tolist()

    # `score` accumulates the per-split RMSLE; the mean is printed below.
    score += rmsle(target_test.tolist(), pred4)

print(score / n_repeats)
  

In [0]:
# Refit on every row. `ress` only covers the last training split, so its length
# does not match `values`; the transformed target is rebuilt from the full
# `target` instead.
# NOTE(review): log1p is assumed because the preceding cell trains on
# log1p(target) - confirm if this cell is run after a different one.
model.fit(values[values_train.columns], target.apply(math.log1p))

In [0]:
# Show the current value of `x`.
# NOTE(review): presumably the per-cluster index lists built by the clustering cells - confirm.
print(x)

In [0]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split as tts
import math
from xgboost import XGBRegressor as xgb


# Sweep the eps argument of reduct_sparse_rp over 0.250 ... 0.549 and remember
# the value that gives the lowest hold-out RMSLE.
minscore = 10
opteps = 0
for i in range(300):
    eps = 0.25 + i*0.001
    values, target = get_data(train)
    values = values.fillna(0)
    col = values.columns
    values = reduct_sparse_rp(values, eps)
    values = pd.DataFrame(values)
    # The projected columns simply reuse the first original column names.
    values.columns = col[:values.shape[1]]
    values_train, values_test, target_train, target_test = tts(values, target, test_size=0.2)

    # Train on the log of the target and undo it on the predictions.
    ress = target_train.apply(lambda x: math.log(x))

    model = xgb()
    model.fit(values_train, ress)
    pred = model.predict(values_test)
    pred = [math.e**p for p in pred]
    score = rmsle(target_test.tolist(),pred)
    if score < minscore:
        minscore = score
        opteps = eps

In [0]:
# Display every column of `test` except the first one.
print(test.iloc[:,1:])

In [0]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split as tts
import math
from xgboost import XGBRegressor as xgb

# Project train and test together so both go through the same projection.
values, target = get_data(train)
values_ = values.fillna(0)
test_ = test
test_ = test_.fillna(0)
col = values.columns
len_val = values.shape[0]
# The first column of `test` is left out of the stacked frame.
temp = pd.concat([values_, test_.iloc[:,1:]], axis=0)

# Sweep eps over 0.25 ... 0.544; each value is scored as the RMSLE summed over
# five random 80/20 splits.
minscore = 10
opteps = 0
for i in range(50):
    eps = 0.25 + i*0.006
    temp_ = reduct_sparse_rp(temp, eps)
    temp_ = pd.DataFrame(temp_)
    values_ = temp_.iloc[:len_val,:]
    test_ = temp_.iloc[len_val:,:]
    me=0
    for t in range(5):
        values_train, values_test, target_train, target_test = tts(values_, target, test_size=0.2)
        ress = target_train.apply(lambda x: math.log(x))
        model = xgb()
        model.fit(values_train, ress)
        pred = model.predict(values_test)
        pred = [math.e**p for p in pred]
        # Snap each prediction to the first feature value of its own row that
        # lies within 10% of it. Assigning to the loop variable does not leave
        # a Python for-loop, so the first match is taken explicitly here.
        test_matrix = values_test.to_numpy()
        for row_idx, row in enumerate(test_matrix):
            near = np.flatnonzero((pred[row_idx] >= 0.9*row) & (pred[row_idx] <= 1.1*row))
            if near.size:
                pred[row_idx] = row[near[0]]
        score = rmsle(target_test.tolist(), pred)
        me += score
    # `me` is a sum over five splits; minscore stores the mean.
    if me < minscore*5:
        minscore = me/5
        opteps = eps
         
    print(me,eps)

In [0]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split as tts
import math
from xgboost import XGBRegressor as xgb

# Project train and test together so both go through the same projection.
values, target = get_data(train)
values_ = values.fillna(0)
test_ = test
test_ = test_.fillna(0)
col = values.columns
len_val = values.shape[0]
# The first column of `test` is left out of the stacked frame.
temp = pd.concat([values_, test_.iloc[:,1:]], axis=0)

temp_ = reduct_sparse_rp(temp, 0.256)
temp_ = pd.DataFrame(temp_)
values_ = temp_.iloc[:len_val,:]
test_ = temp_.iloc[len_val:,:]
# Fit on the log of the full target and undo the log on the predictions.
ress = target.apply(lambda x: math.log(x))
model = xgb()
model.fit(values_, ress)
pred = model.predict(test_)
pred = [math.e**p for p in pred]

# Snap each prediction to the first feature value of its own row that lies
# within 5% of it. Assigning to the loop variable does not leave a Python
# for-loop, so the first match is taken explicitly here.
test_matrix = test_.to_numpy()
for row_idx, row in enumerate(test_matrix):
    near = np.flatnonzero((pred[row_idx] >= 0.95*row) & (pred[row_idx] <= 1.05*row))
    if near.size:
        pred[row_idx] = row[near[0]]

# NOTE(review): bare literal kept as in the original cell; presumably a score noted by hand.
1.428

In [0]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split as tts
import math
from lightgbm import LGBMRegressor as lgbm
from sklearn.metrics import r2_score as r2

# Restrict the features to the columns listed in `a`, split 80/20 and align the
# test columns with whatever assign_zero leaves in the training frame.
values, target = get_data(train)
values = values[a]
values_train, values_test, target_train, target_test = tts(values, target, test_size=0.2)
values_train = assign_zero(values_train)
values_test = values_test[values_train.columns]

# One row per feature: [column position, squared distance correlation with the target].
corr = np.zeros([len(values_train.columns),2])
    
for i in range(len(values_train.columns)):
    corr[i,:] = np.array([i, dcor.u_distance_correlation_sqr(values_train.iloc[:,i], target_train)])

In [0]:
#thresh = max(np.nanpercentile(corr_re_new[:,1], 3), ((max(corr_re_new[:,1])-min(corr_re_new[:,1]))*0.03)+min(corr_re_new[:,1]))
# Fixed cut-off on the score column of corr.
thresh = 0.08
# Boolean mask instead of rebuilding the array row by row: the result stays
# two-dimensional even when no feature passes, so corr[:,0] remains valid.
corr = corr[corr[:,1] >= thresh]
print(corr.shape)

In [0]:
# Column positions (stored as floats in corr) of the features kept by the threshold.
feat = corr[:, 0].astype(int)
feature_columns = values_train.columns[feat]

# Standardisation statistics come from the training split only and are reused
# for the test split, so both splits live on the same scale.
train_features = values_train[feature_columns]
feature_mean = train_features.mean()
feature_std = train_features.std()

target_train_log = target_train.apply(lambda x: math.log(x))
target_log_mean = target_train_log.mean()
target_log_std = target_train_log.std()

# Standardised training features plus the standardised log-target.
classy = (train_features - feature_mean) / feature_std
classy['target_train'] = (target_train_log - target_log_mean) / target_log_std

# NOTE(review): the original loop here was unfinished (it appended nothing to an
# undefined `p3`); presumably it was meant to collect the rows whose standardised
# log-target exceeds 3 - confirm.
p3 = classy.index[classy['target_train'] > 3].tolist()

classy_test = (values_test[feature_columns] - feature_mean) / feature_std


#kmeans = km(n_clusters = n_features).fit(target_train.apply(lambda x: math.log(x)).reshape(-1,1))  #n_features should be determined based on a performance metrics
#labels = kmeans.labels_

target_test_log = target_test.apply(lambda x: math.log(x))
target_test_log = (target_test_log - target_log_mean) / target_log_std

model = lgbm()
model.fit(classy.drop(['target_train'], axis=1), classy['target_train'])
pred = model.predict(classy_test)
#pred = [math.e**i for i in pred]
score = r2(target_test_log,pred)

In [0]:
# Display the current training feature frame.
print(values_train)

In [0]:
# Display the 'f190486d6' column of the hold-out features.
values_test['f190486d6']

In [0]:
import pandas as pd
import io

# Parse the uploaded CSV from its raw bytes and index it by its first column.
csv_text = uploaded['dimensionality reduction.csv'].decode('utf-8')
reds = pd.read_csv(io.StringIO(csv_text))
reds.index = reds.iloc[:,0]

# Collect the index labels whose three "reductedby_*" flags are all zero,
# then discard the last collected label.
flag_columns = ('reductedby_low_var', 'reductedby_xgbregressor', 'reductedby_psi')
a = []
for label in reds.index:
    if all(reds.loc[label, flag] == 0 for flag in flag_columns):
        a.append(label)
a = a[:-1]

In [0]:
#values_train_new.iloc[4,:]=np.array([int(i) for i in values_train.iloc[4,:]==target_train.iloc[4]])
#print(sum(values_train_new.iloc[4,:]))


In [0]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression as lr

values, target = get_data(train)
values_train, values_test, target_train, target_test = tts(values, target, test_size=0.2)
values_train = assign_zero(values_train)
values_test = values_test[values_train.columns]

dec = []
nm = []
# Indicator matrix: 1 where a feature value equals that row's target, else 0.
# It is built as a separate frame; a plain assignment would only alias
# values_train, and filling it row by row would overwrite the raw features.
# Rows are matched by position, as in the original loop.
matches = values_train.to_numpy() == target_train.to_numpy().reshape(-1, 1)
values_train_new = pd.DataFrame(matches.astype(int), index=values_train.index, columns=values_train.columns)


In [0]:
# Greedy loop over the indicator matrix: repeatedly take the column that equals
# the target in the most still-uncovered rows, fit a logistic regression that
# predicts that column's indicator from the remaining features, and drop the
# rows the column covers.
# NOTE(review): this reading of the intent is inferred from how `inn` is used
# below (as a column of `perm` and a name appended to `nm`) - confirm.
perm = values_train_new
temp = values_train_new
# Per-column hit counts. The loop selects a column, so the sum runs over rows.
freq = temp.sum(axis=0)
while freq.sum()>0:
    inn = freq.idxmax()
    tar_train = perm[inn]
    part_train = values_train.drop(inn, axis=1)
    part_test = values_test.drop(inn, axis=1)
    # Keep only the rows this column does not cover. Taking `.index` of the
    # comparison result would name every row, not just the matching ones.
    temp = temp.loc[temp[inn] != 1]
    freq = temp.sum(axis=0)
    model = lr()
    model.fit(part_train, tar_train)
    dec.append(model.predict_proba(part_test))
    nm.append(inn)

In [0]:
import pandas as pd
import numpy as np
from keras.models import Sequential
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split as tts
import math
from keras.layers import Dense, LSTM, Dropout
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Features are every column from the third onward, the target is the second
# column; both are scaled to [-1, 1] with separate scalers.
values = train.iloc[:,2:]
col = values.columns
scaler = MinMaxScaler(feature_range=(-1, 1))
values = pd.DataFrame(scaler.fit_transform(values))
values.columns = col
target = train.iloc[:,1]
scaler2 = MinMaxScaler(feature_range=(-1, 1))
target = scaler2.fit_transform(target.values.reshape(-1, 1))

# Feature subset kept from an earlier experiment:
#col = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1', '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9', 'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992', 'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7' ,'1931ccfdd', '703885424' ,'70feb1494', '491b9ee45', '23310aa6f', 'e176a204a', '6619d81fc', '1db387535']
values_train, values_test, target_train, target_test = tts(values, target, test_size=0.2)

# Keep the training rows that are at least 95% populated. target_train is a
# plain array, so the same row mask is applied to it explicitly; dropping rows
# from the features alone would leave the two with different lengths.
min_populated = int(values_train.shape[1]*0.95)
keep_rows = (values_train.notnull().sum(axis=1) >= min_populated).to_numpy()
values_train = values_train[keep_rows]
target_train = target_train[keep_rows]
values_train = values_train.fillna(0)
# NOTE(review): this drops columns whose *scaled* values sum to exactly zero -
# presumably meant for all-zero columns before scaling; confirm.
values_train = values_train.drop([i for i in values_train.columns if values_train[i].sum()==0], axis = 1)
# The test features get the same column subset and the same NaN fill as training.
values_test = values_test[values_train.columns].fillna(0)

# Fully connected network: one input layer, nine Dense(128)+Dropout(0.5)
# pairs, one linear output unit.
model = Sequential()
model.add(Dense(128, input_shape=(values_train.shape[1],), activation='relu'))
for _ in range(9):
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
model.add(Dense(1))
# NOTE(review): the target is scaled to [-1, 1] while the loss is logarithmic - verify this pairing.
model.compile(loss='mean_squared_logarithmic_error', optimizer='Adam', metrics=['accuracy'])
model_info = model.fit(values_train, target_train, epochs = 100, validation_split=0.2)
# NOTE(review): plot_model_history is defined in a later cell of this file.
plot_model_history(model_info)

pred = model.predict(values_test)
pred = scaler2.inverse_transform(pred)

# target_test is still in the scaled space; bring it back to the original units
# so it is compared with the un-scaled predictions, and flatten both to plain
# lists like the other rmsle calls in this file.
actual = scaler2.inverse_transform(target_test)
score = rmsle(actual.ravel().tolist(), pred.ravel().tolist())
print(score)

In [0]:
import matplotlib.pyplot as plt
plot_model_history(model_info)

pred = model.predict(values_test)
pred = scaler2.inverse_transform(pred)

# target_test is still scaled by scaler2; undo the scaling so truth and
# predictions are compared in the same units, and pass flat lists to rmsle.
actual = scaler2.inverse_transform(target_test)
score = rmsle(actual.ravel().tolist(), pred.ravel().tolist())
print(score)

In [0]:
# Show values_train as a 3-D array of shape (rows, 1, columns).
np.array(values_train).reshape(values_train.shape[0], 1, values_train.shape[1])

In [0]:
def plot_model_history(model_history):
    """Plot accuracy and loss per epoch, side by side.

    Parameters
    ----------
    model_history : object with a ``history`` dict mapping metric names to
        per-epoch lists (as returned by ``model.fit``). It must contain
        ``'loss'`` and either ``'acc'`` or ``'accuracy'``; the matching
        ``'val_*'`` curves are drawn when present.

    Raises
    ------
    KeyError
        If a required training curve is missing from the history.
    """
    history = model_history.history
    # The accuracy curve is stored as 'acc' or 'accuracy' depending on the
    # Keras version; accept either spelling.
    acc_key = 'acc' if 'acc' in history else 'accuracy'
    panels = (
        (acc_key, 'Model Accuracy', 'Accuracy'),
        ('loss', 'Model Loss', 'Loss'),
    )

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    for ax, (key, title, ylabel) in zip(axs, panels):
        train_curve = history[key]
        n_epochs = len(train_curve)
        ax.plot(range(1, n_epochs + 1), train_curve)
        legend = ['train']
        val_curve = history.get('val_' + key)
        if val_curve is not None:
            ax.plot(range(1, len(val_curve) + 1), val_curve)
            legend.append('val')
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Epoch')
        # About ten ticks across the run. The step belongs to np.arange, not to
        # set_xticks, whose second positional argument is not a step size.
        ax.set_xticks(np.arange(1, n_epochs + 1, max(1, n_epochs // 10)))
        ax.legend(legend, loc='best')
    plt.show()