# Importing libraries and loading data

In [None]:
import tensorflow as tf
import xgboost as xgb
import xgboost
import tensorflow
import numpy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras import regularizers
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import LSTM
from tensorflow.python.keras import initializers
from keras.wrappers.scikit_learn import KerasRegressor
from datetime import datetime as dt
from utils import *
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from features_engineering import TIME_SERIES_FEATURES_ENGINEERING
import sklearn
start=dt.now()

# Process weathertypes, remove empty windspeed and store (ignore if it's already done)

In [None]:
#If you want to generate the one_hot file uncomment the last line of this cell
def update_one_hot_data(combined_path,weather_combined_path):
    """One-hot encode ``weather_condition`` and store the result as a TSV.

    Reads the tab-separated file at ``combined_path``, adds one 0/1 column
    per distinct weather type found in ``weather_condition`` (types are
    separated by '.'), keeps only the rows whose ``wind_speed`` is numeric
    according to ``str.isnumeric``, drops the original ``weather_condition``
    column, writes the frame to ``weather_combined_path`` and finally runs
    ``TIME_SERIES_FEATURES_ENGINEERING`` on the written file.

    Nothing is done (apart from printing a message) when
    ``check_if_the_tsv_hot_file_already_has_processed_columns`` reports that
    the target file was already processed.

    Args:
        combined_path (str): Path of the raw tab-separated input file.
        weather_combined_path (str): Path of the one-hot encoded output file.
    """
    if check_if_the_tsv_hot_file_already_has_processed_columns(weather_combined_path):
        print("File already processed.")
        return

    def weather_types_of(condition):
        # Empty fragments are discarded *before* stripping, as in the source data handling.
        return [fragment.strip() for fragment in condition.split('.') if fragment != '']

    frame = pd.read_csv(combined_path, sep='\t', header=0)

    # Collect every weather type occurring in any distinct combination.
    all_types = [
        weather_type
        for combination in frame.weather_condition.unique()
        for weather_type in weather_types_of(combination)
    ]
    distinct_types = set(all_types)

    # One indicator column per weather type, initialised to 0.
    for weather_type in distinct_types:
        frame[weather_type] = 0

    # Flag the weather types present on each record.
    for row_label, condition in frame["weather_condition"].items():
        for weather_type in weather_types_of(condition):
            frame.at[row_label, weather_type] = 1

    # Remove records whose wind speed is empty / not numeric.
    has_numeric_wind = frame.wind_speed.apply(lambda value: str(value).isnumeric())
    frame = frame[has_numeric_wind]

    # The raw column is no longer needed; store the result to avoid reruns.
    frame = frame.drop(columns="weather_condition")
    frame.to_csv(weather_combined_path, sep='\t', index=False, header=True)
    TIME_SERIES_FEATURES_ENGINEERING(weather_combined_path)
        
# Raw input file and the one-hot encoded output file used by update_one_hot_data.
combined_path = ".././data/combined_data.tsv"
weather_combined_path=".././data/combined_one_hot_data.tsv"
# Uncomment the next line to (re)generate the one-hot encoded file.
#update_one_hot_data(combined_path,weather_combined_path)

# Loading data with one_hot weather types

In [None]:
def select_data_from_lat_csv(data,latitude):
    """Return the rows of ``data`` whose ``latitude`` column equals ``latitude``."""
    at_requested_latitude = data['latitude'] == latitude
    return data[at_requested_latitude]
def obtain_latitudes_list(latitude_data):
    """Return the distinct values of ``latitude_data`` as given by its ``unique()`` method."""
    distinct_latitudes = latitude_data.unique()
    return distinct_latitudes
# Load the one-hot encoded dataset (tab-separated, first row is the header).
weather_combined_path=".././data/combined_one_hot_data.tsv"
df = pd.read_csv(weather_combined_path,
                 sep='\t',
                 header=0)
print(df.shape)
lat_list=obtain_latitudes_list(df.latitude)
# Index into lat_list; only referenced by the commented-out single-latitude filter below.
pole=0
#df=select_data_from_lat_csv(df,lat_list[pole])
#print("Selecting data for bikes station on latitude "+str(lat_list[pole]))
#print(lat_list)
# Second shape print; identical to the first while the filter above stays commented out.
print(df.shape)

#choose one station to test on
#df=df[df['device_name']=='CB1143']
#print(df)

# Remove outliers (bike_count)

In [None]:
def remove_outlier(df_in, col_name):
    """Removes all outliers on a specific column from a given dataframe.

    A row is kept when its value lies within the Tukey fences
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``. The fences are inclusive: values lying
    exactly on a fence are not outliers, and a column whose interquartile
    range is 0 (e.g. a constant column) keeps its rows instead of being
    emptied. Rows whose value is NaN are dropped because the comparisons
    evaluate to False for them.

    Args:
        df_in (pandas.DataFrame): Input pandas dataframe containing outliers
        col_name (str): Column name on which to search outliers

    Returns:
        pandas.DataFrame: DataFrame without outliers
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # Interquartile range
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    return df_in.loc[(values >= fence_low) & (values <= fence_high)]
# Remember the row count so the number of removed rows can be reported.
start_size=df.shape[0]
df = remove_outlier(df, "bike_count")
print(df.shape)
# NOTE(review): the percentage divides by start_size, so an empty input frame would raise ZeroDivisionError.
print("We have removed "+str(start_size-df.shape[0])+" outliers tuples (which is "+str((start_size-df.shape[0])/start_size*100)+" % of total).")

# Correlation of variables


In [None]:
import matplotlib.pyplot as plt



# 30x30 inch figure that hosts the correlation heatmap.
fig, ax = plt.subplots(figsize=(30,30))    

# Pairwise correlation of the DataFrame columns, rounded to two decimals.
correlation_matrix = df.corr().round(2)
#print(correlation_matrix)
# NOTE(review): fmt=".001f" appears to be equivalent to ".1f" (precision "001" == 1) - confirm the intended precision.
sns.heatmap(correlation_matrix, annot=True, fmt=".001f",ax=ax)

# Feature selection

In [None]:
def remove_features(df):
    """Return ``df`` without the columns excluded from modelling.

    The excluded columns are the device name, two timestamp columns, the
    average bike speed, wind / barometer / visibility readings and a fixed
    set of one-hot weather columns. ``DataFrame.drop`` is called with its
    default error handling, so every listed column must exist in ``df``.
    """
    excluded_columns = [
        'device_name',
        'timestamp_until',
        'bike_avg_speed',
        'weather_timestamp',
        'wind_direction',
        'wind_speed',
        'barometer',
        'visibility',
        'Ice fog',
        'Thundershowers',
        'Sprinkles',
        'Broken clouds',
        'Rain showers',
        'Snow flurries',
        'Light fog',
        'Sleet',
        'Cloudy',
        'Quite cool',
    ]
    return df.drop(columns=excluded_columns)
def remove__weather_features(df):
    """Return ``df`` without the temperature, humidity and remaining one-hot weather columns.

    ``DataFrame.drop`` is called with its default error handling, so every
    listed column must exist in ``df``.
    """
    weather_columns = [
        'temperature',
        'humidity',
        'Scattered showers',
        'Low clouds',
        'Snow',
        'Snow showers',
        'Thunderstorms',
        'Partly sunny',
        'Light freezing rain',
        'Sunny',
        'Light rain',
        'Freezing rain',
        'Light snow',
        'Passing clouds',
        'Fog',
        'Cool',
        'Partly cloudy',
        'Haze',
        'Hail',
        'Scattered clouds',
        'Drizzle',
        'Clear',
        'Rain',
        'Chilly',
    ]
    return df.drop(columns=weather_columns)

# Drop the columns listed in remove_features; the weather columns are kept
# unless the commented-out call below is enabled as well.
df=remove_features(df)
#df=remove__weather_features(df)
#print(df.head())
print(df.shape)

# Build models

In [None]:
def build_normal_model(input_dim=28):
    """Build and compile the feed-forward regression network.

    Architecture: Dense(28, relu, L2=0.0001, 'normal' initializer) ->
    Dense(14, relu) -> Dense(1, relu). The ReLU on the output layer means the
    network cannot emit negative predictions. The model is compiled with MSE
    loss, an Adam optimizer (learning rate 0.001) and the 'mse' / 'mae'
    metrics, and its summary is printed.

    Args:
        input_dim (int): Number of input features. Defaults to 28, the value
            that was previously hard-coded.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()

    model.add(Dense(28, input_dim=input_dim,kernel_regularizer=regularizers.l2(0.0001),kernel_initializer='normal', activation='relu'))
    model.add(Dense(14,activation='relu'))
    model.add(Dense(1, activation='relu'))
    optimizer = tf.keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(loss='mse', optimizer=optimizer, metrics=['mse','mae'])
    model.summary()
    return model

from sklearn.ensemble import RandomForestRegressor
def build_Random_forest_model():
    """Return an unfitted ``RandomForestRegressor`` with 8 trees of maximum depth 7."""
    # Earlier configuration: RandomForestRegressor(max_depth=2, random_state=0,n_estimators=100)
    forest_settings = {"max_depth": 7, "n_estimators": 8}
    return RandomForestRegressor(**forest_settings)
def build_SVM_REGRESSION_model():
    """Return an unfitted ``SVR`` created with the library's default parameters."""
    return SVR()
def build_XGBOOST_reg_model():
    """Return an unfitted ``XGBRegressor`` with the notebook's fixed hyper-parameters."""
    booster_settings = dict(
        objective='reg:squarederror',
        colsample_bytree=0.3,
        learning_rate=0.01,
        max_depth=7,
        alpha=0,
        n_estimators=400,
    )
    return xgb.XGBRegressor(**booster_settings)
# Choose the model to train by leaving exactly one of the builder calls active.
normal_model = build_normal_model()
#normal_model = build_Random_forest_model()
#normal_model=build_SVM_REGRESSION_model()
# normal_model=build_XGBOOST_reg_model()
# `model` is the name read by the cells that follow.
model = normal_model

# Defining model execution functions

In [None]:
def build_normal_model_grid_search(input_nodes,hidden_layer_nodes,init_kernel,optimizer,loss_item,activation_item,regularizer_parameter):
    """Build and compile a three-layer dense network for one grid-search candidate.

    Args:
        input_nodes: Units of the first dense layer (which expects 28 input features).
        hidden_layer_nodes: Units of the hidden dense layer.
        init_kernel: Kernel initializer of the first layer.
        optimizer: Optimizer passed to ``compile``.
        loss_item: Loss passed to ``compile``.
        activation_item: Activation used by all three layers, including the single-unit output.
        regularizer_parameter: L2 factor applied to the first layer's kernel.

    Returns:
        The compiled ``Sequential`` model (its summary is printed as a side effect).
    """
    first_layer = Dense(
        input_nodes,
        input_dim=28,
        kernel_regularizer=regularizers.l2(regularizer_parameter),
        kernel_initializer=init_kernel,
        activation=activation_item,
    )
    hidden_layer = Dense(hidden_layer_nodes, activation=activation_item)
    output_layer = Dense(1, activation=activation_item)

    network = Sequential()
    for layer in (first_layer, hidden_layer, output_layer):
        network.add(layer)

    network.compile(loss=loss_item, optimizer=optimizer, metrics=['mse', 'mae'])
    network.summary()
    return network
def gridSearch_neural_network(X_train, y_train,baseline_model):
    """Grid-search the Adam learning rate and L2 factor of the dense network.

    Every candidate is built with ``build_normal_model_grid_search``, trained
    for 100 epochs on ``(X_train, y_train)`` and scored by the *training* MSE
    of the last epoch. Each improvement is printed as it is found, and a
    final report is printed at the end. Nothing is returned.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        baseline_model: Unused on input; kept for interface compatibility
            (the name is rebound to each candidate model).
    """
    import itertools

    input_layer_nodes = [28]
    hidden_layer_nodes = [14]
    init = ['normal']
    losses_f_list = ["mean_squared_error"]
    activation_list = ['relu']
    # The same candidate values are tried for the learning rate and for the L2 factor.
    learning_rate_combinations_list = [0.0001,0.0003,0.001,0.003,0.01,0.03]

    # Best configuration found so far.
    num_inodes = input_layer_nodes[0]
    num_hnodes = hidden_layer_nodes[0]
    init_kernel_name = init[0]
    optimizer_alg = None
    loss_function_f = losses_f_list[0]
    activation_f = activation_list[0]
    b_learning_rate = learning_rate_combinations_list[0]
    b_regularization_rate_parameter = learning_rate_combinations_list[0]
    # inf instead of a magic 1000 so the first finite score always becomes the best.
    best_performance = float("inf")

    historical_performance_list = []
    counter = 0

    search_space = itertools.product(
        input_layer_nodes,
        hidden_layer_nodes,
        init,
        losses_f_list,
        activation_list,
        learning_rate_combinations_list,
        learning_rate_combinations_list,
    )
    for inodes, hnodes, init_k, loss_item, activation_item, learning_rate, reg_parameter in search_space:
        # A fresh optimizer per candidate: optimizer state must not leak between models.
        opt = tf.keras.optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
        baseline_model = build_normal_model_grid_search(inodes,hnodes,init_k,opt,loss_item,activation_item,reg_parameter)
        history = baseline_model.fit(X_train, y_train,epochs=100,verbose=0)
        counter += 1

        # Keras versions differ in how they name the 'mse' metric in the history.
        mse_key = 'mean_squared_error' if 'mean_squared_error' in history.history else 'mse'
        mse_per_epoch = history.history[mse_key]
        final_mse = mse_per_epoch[-1]

        if best_performance > final_mse:
            best_performance = final_mse
            num_inodes = inodes
            num_hnodes = hnodes
            init_kernel_name = init_k
            optimizer_alg = opt
            loss_function_f = loss_item
            activation_f = activation_item
            b_learning_rate = learning_rate
            b_regularization_rate_parameter = reg_parameter
            print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
            print(mse_per_epoch)
            print("We have a new BEST PERFORMANCE: "+str(best_performance))
            print("num input nodes "+str(num_inodes))
            print("num idden layer nodes: "+str(num_hnodes))
            print("init kernel name: "+str(init_kernel_name))
            print("optimizer algorith: "+str(optimizer_alg))
            print("loss function: "+str(loss_function_f))
            print("activation function: "+str(activation_f))
            print("b_learning_rate: "+str(b_learning_rate))
            print("b_regularization_rate_parameter: "+str(b_regularization_rate_parameter))
            print("COUNTER= "+str(counter))
            historical_performance_list.append([counter,best_performance])
            print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

    print("*****************************************************")
    print("FINALLY OUR BEST PERFORMANCE: "+str(best_performance))
    print(num_inodes)
    print(num_hnodes)
    print(init_kernel_name)
    print(optimizer_alg)
    print(loss_function_f)
    print(activation_f)
    # The two hyper-parameters actually searched; previously missing from the final report.
    print("b_learning_rate: "+str(b_learning_rate))
    print("b_regularization_rate_parameter: "+str(b_regularization_rate_parameter))
    print("***************************************************************************")
    print("historical_performance_list:")
    print(historical_performance_list)
    print("***************************************************************************")
def build_Random_fores_model_grid_search(estimator,depth):
    """Return an unfitted ``RandomForestRegressor`` with ``estimator`` trees and maximum depth ``depth``."""
    candidate_forest = RandomForestRegressor(n_estimators=estimator, max_depth=depth)
    return candidate_forest
    
def gridSearch_Random_forest(x_in, y_in,baseline_model):
    """Grid-search ``n_estimators`` and ``max_depth`` of a random forest.

    The data is split by position: the first 80 % of the rows are used for
    training and all remaining rows for testing. Each parameter combination
    is fitted three times (the forests are built without a fixed random
    state) and scored by the MSE on the test part. Improvements and a final
    report are printed; nothing is returned.

    Args:
        x_in: Feature rows, sliceable by position and exposing ``shape``.
        y_in: Targets aligned with ``x_in``.
        baseline_model: Unused; kept for interface compatibility.
    """
    import itertools

    test_size = 0.2
    total = x_in.shape[0]
    train_idx = round((1-test_size) * total)

    x_train = x_in[0:train_idx]
    y_train = y_in[0:train_idx]
    # The test part starts right after the training part and runs to the end;
    # the previous `train_idx+1:total-1` slice silently discarded two rows.
    x_test = x_in[train_idx:]
    y_test = y_in[train_idx:]

    n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
    # Integer depths 1..32 (numpy.linspace produced floats, which are not valid tree depths).
    max_depths = list(range(1, 33))

    # inf instead of a magic 1000 so the first finite score always becomes the best.
    best_performance = float("inf")
    b_estimator = n_estimators[0]
    b_depth = max_depths[0]

    counter = 0
    historical_performance_list = []
    for estimator, depth in itertools.product(n_estimators, max_depths):
        for _ in range(3):
            grid_search_model = build_Random_fores_model_grid_search(estimator, depth)
            grid_search_model.fit(x_train, y_train)
            predictions = grid_search_model.predict(x_test)
            mse = mean_squared_error(predictions, y_test)

            counter += 1
            if best_performance > mse:
                best_performance = mse
                b_estimator = estimator
                b_depth = depth
                print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
                print(mse)
                print("We have a new BEST PERFORMANCE: "+str(best_performance))
                print("b_estimator: "+str(b_estimator))
                print("b_depth: "+str(b_depth))
                print("COUNTER= "+str(counter))
                historical_performance_list.append([counter,best_performance])
                print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
    print("*****************************************************")
    print("FINALLY OUR BEST PERFORMANCE: "+str(best_performance))
    print(b_estimator)
    print(b_depth)
    print("We had  "+str(counter)+" iterations.")
    print("***************************************************************************")
    print("historical_performance_list:")
    print(historical_performance_list)
    print("***************************************************************************")
def build_XGBOOST_reg_grid_search_model(learning_r,max_depth,alpha,estimator):
    """Return an unfitted ``XGBRegressor`` for one grid-search candidate.

    ``objective`` and ``colsample_bytree`` are fixed; the learning rate,
    maximum depth, ``alpha`` and number of estimators come from the arguments.
    """
    candidate_settings = dict(
        objective='reg:squarederror',
        colsample_bytree=0.3,
        learning_rate=learning_r,
        max_depth=max_depth,
        alpha=alpha,
        n_estimators=estimator,
    )
    return xgb.XGBRegressor(**candidate_settings)
def gridSearch_xgBoost(x_in, y_in,baseline_model):
    """Grid-search learning rate, depth, ``alpha`` and estimator count of XGBoost.

    The data is split by position: the first 80 % of the rows are used for
    training and all remaining rows for testing. Each parameter combination
    is fitted twice and scored by the MSE on the test part. Improvements and
    a final report (including the best parameters) are printed; nothing is
    returned.

    Args:
        x_in: Feature rows, sliceable by position and exposing ``shape``.
        y_in: Targets aligned with ``x_in``.
        baseline_model: Unused; kept for interface compatibility.
    """
    import itertools

    test_size = 0.2
    total = x_in.shape[0]
    train_idx = round((1-test_size) * total)

    x_train = x_in[0:train_idx]
    y_train = y_in[0:train_idx]
    # The test part starts right after the training part and runs to the end;
    # the previous `train_idx+1:total-1` slice silently discarded two rows.
    x_test = x_in[train_idx:]
    y_test = y_in[train_idx:]

    learning_rate_list = [0.01,0.03,0.1,0.3]
    max_depths_list = [2,5,7,8]
    alpha_list = [0,7,9,10,14]
    n_estimators_list = [100,200,300,400]

    # inf instead of a magic 1000 so the first finite score always becomes the best.
    best_performance = float("inf")
    b_learning_r = learning_rate_list[0]
    b_max_depth = max_depths_list[0]
    b_alpha = alpha_list[0]
    b_estimator = n_estimators_list[0]

    counter = 0
    historical_performance_list = []
    search_space = itertools.product(learning_rate_list, max_depths_list, alpha_list, n_estimators_list)
    for learning_r, max_depth, alpha, estimator in search_space:
        for _ in range(2):
            grid_search_model = build_XGBOOST_reg_grid_search_model(learning_r,max_depth,alpha,estimator)
            grid_search_model.fit(x_train, y_train)
            predictions = grid_search_model.predict(x_test)
            mse = mean_squared_error(predictions, y_test)

            counter += 1
            if best_performance > mse:
                best_performance = mse
                b_learning_r = learning_r
                b_max_depth = max_depth
                b_alpha = alpha
                b_estimator = estimator

                print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
                print(mse)
                print("We have a new BEST PERFORMANCE: "+str(best_performance))
                print("b_learning_r: "+str(b_learning_r))
                print("b_max_depth: "+str(b_max_depth))
                print("b_alpha: "+str(b_alpha))
                print("b_estimator: "+str(b_estimator))
                print("COUNTER= "+str(counter))
                historical_performance_list.append([counter,best_performance])
                print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
    print("*****************************************************")
    print("FINALLY OUR BEST PERFORMANCE: "+str(best_performance))
    # Winning parameters; previously only visible in the intermediate output.
    print("b_learning_r: "+str(b_learning_r))
    print("b_max_depth: "+str(b_max_depth))
    print("b_alpha: "+str(b_alpha))
    print("b_estimator: "+str(b_estimator))
    print("We had  "+str(counter)+" iterations.")
    print("***************************************************************************")
    print("historical_performance_list:")
    print(historical_performance_list)
    print("***************************************************************************")
                            
            
    
def save_model(dataFrame,model_name,min_value,max_value):
    """Pickle the rows ``dataFrame[min_value:max_value]`` under ``model_data_prediction/``.

    The file is named ``<model_name>_predictions.bin``; when at least one of
    the bounds is given, ``_from_<min_value>_to_<max_value>`` is inserted
    before the extension. The output directory is created when missing.

    Args:
        dataFrame (pandas.DataFrame): Frame holding the values to store.
        model_name (str): Prefix of the output file name.
        min_value: Start of the row slice, or None for the beginning.
        max_value: End (exclusive) of the row slice, or None for the end.
    """
    import os

    output_directory = 'model_data_prediction'
    file_name_string = output_directory+'/'+model_name+"_predictions"
    if min_value is not None or max_value is not None:
        file_name_string += "_from_"+str(min_value)+"_to_"+str(max_value)
    file_name_string += ".bin"
    print("this is the filename string")
    print("Saving model on "+file_name_string)

    # to_pickle does not create missing directories.
    os.makedirs(output_directory, exist_ok=True)
    dataFrame[min_value:max_value].to_pickle(file_name_string)
def execute_Sequential_model(model,df):
    """Train, evaluate and save the Keras network on a positional 80/20 split.

    Steps: index and sort ``df`` by ``timestamp_from``, min-max scale the
    features and the ``bike_count`` target, fit ``model`` for 100 epochs on
    the first 80 % of the rows, save it, plot the training curves, evaluate
    on the held-out rows (printing MSE / MAE in original units, pickling the
    predictions via ``save_model`` and plotting them against the ground
    truth). Afterwards the 22-jun dataset is one-hot encoded, cleaned and
    scaled; its evaluation call is currently commented out.

    Args:
        model: Compiled Keras model to train.
        df (pandas.DataFrame): Data with ``timestamp_from`` and ``bike_count``
            columns. It is modified in place (``timestamp_from`` becomes the index).

    Returns:
        The trained ``model``.
    """
    # NOTE: set_index(inplace=True) alters the caller's DataFrame.
    df.set_index('timestamp_from', inplace=True)
    df = df.sort_values(by=['timestamp_from'])

    #set bike_count as Y
    df_y = df.bike_count
    df_x = df.drop(columns="bike_count")
    #Normalization
    # NOTE(review): both scalers are fitted on all rows, i.e. including the rows later used for testing.
    x = df_x.values
    y = df_y.values.reshape(-1, 1)

    x_scaler = preprocessing.MinMaxScaler()
    x_normalized = x_scaler.fit_transform(x)

    y_scaler = preprocessing.MinMaxScaler()
    y_normalized = y_scaler.fit_transform(y)

    x_in = pd.DataFrame(x_normalized)
    y_in = pd.DataFrame(y_normalized)

    #Prepare train & test dataset
    test_size = 0.2
    total = x_in.shape[0]
    train_idx = round((1-test_size) * total)

    x_train = x_in[0:train_idx].values
    y_train = y_in[0:train_idx].values
    # NOTE(review): this slice skips row `train_idx` and the last row. It is left
    # unchanged here so the stored predictions keep the same length as the
    # ground-truth files written by the other model cells - confirm before fixing.
    x_test = x_in[train_idx+1:total-1].values
    y_test = y_in[train_idx+1:total-1]

    print("x_train shape {}".format(x_train.shape))
    print("y_train shape {}".format(y_train.shape))

    def show_loss_graph(hist,variable):
        # Plot one recorded training series; uses the history object that was passed in.
        plt.plot(hist.history[variable])
        plt.title('model '+variable)
        plt.ylabel(variable)
        plt.xlabel('epoch')
        plt.legend(['train '+variable], loc='upper left')
        plt.show()

    #gridSearch_neural_network(x_train, y_train,model)

    history=model.fit(x_train, y_train,epochs=100)

    model.save('.././model/model-23-jun.bin')
    show_loss_graph(history,"loss")
    show_loss_graph(history,"mean_squared_error")

    def evaluate_model_and_show_graph(x_test,y_test,model,min_value=None,max_value=None):
        # Evaluate on scaled data, then report errors and plot in original units.
        # `y_scaler` is read from the enclosing function at call time.
        model.evaluate(x_test, y_test)
        y_prediction = model.predict(x_test)
        print(y_prediction)
        y_pred_scaled = y_scaler.inverse_transform(y_prediction)
        y_test_scaled = y_scaler.inverse_transform(y_test)
        mse = mean_squared_error(y_pred_scaled[min_value:max_value], y_test_scaled[min_value:max_value])
        mae=mean_absolute_error(y_pred_scaled[min_value:max_value], y_test_scaled[min_value:max_value])
        print("MSE: "+str(mse))
        print("MAE: "+str(mae))
        plt.rcParams['figure.figsize'] = [18, 18]
        #---SAVING PREDICTIONS DATAFRAME
        pred_df=pd.DataFrame(y_pred_scaled)
        save_model(pred_df,'FNN',min_value,max_value)

        # Disabled: storing the ground truth from this function.
        #y_test_df=y_test.reset_index(inplace=False)
        #y_test_df=y_test_df.drop(columns="timestamp_from")
        #save_model(y_test_df,'Ground_t',min_value,max_value)
        #---END SAVING PREDICTIONS DATAFRAME
        plt.plot(y_test_scaled[min_value:max_value], 'g')
        plt.plot(pred_df[min_value:max_value] ,'r', alpha=0.7)
        plt.legend(['Ground truth', 'Predicted'])
        plt.show()

    evaluate_model_and_show_graph(x_test,y_test,model)

    # Prepare the 22-jun dataset in the same way as the training data.
    update_one_hot_data('.././data/combined_data_22-jun.tsv','.././data/combined_data_22-jun_one_hot_data.tsv')
    df_new_data = pd.read_csv('.././data/combined_data_22-jun_one_hot_data.tsv',
                     sep='\t',
                     header=0)
    start_size=df_new_data.shape[0]
    print(df_new_data.shape)
    df_new_data = remove_outlier(df_new_data, "bike_count")
    print("We have removed "+str(start_size-df_new_data.shape[0])+" outliers tuples (which is "+str((start_size-df_new_data.shape[0])/start_size*100)+" % of total).")
    df_new_data=remove_features(df_new_data)
    df_new_data.set_index('timestamp_from', inplace=True)
    df_new_data = df_new_data.sort_values(by=['timestamp_from'])
    print(df_new_data.shape)

    #set bike_count as Y
    df_y = df_new_data.bike_count
    df_x = df_new_data.drop(columns="bike_count")

    #Normalization
    x = df_x.values
    y = df_y.values.reshape(-1, 1)

    # New scalers fitted on the new data; from here on `y_scaler` refers to this one.
    x_scaler = preprocessing.MinMaxScaler()
    x_normalized = x_scaler.fit_transform(x)

    y_scaler = preprocessing.MinMaxScaler()
    y_normalized = y_scaler.fit_transform(y)

    # Only consumed by the commented-out evaluation call below.
    x_in = pd.DataFrame(x_normalized)
    y_in = pd.DataFrame(y_normalized)
    #this line is commented in order to obtain the correct .bin data on data/data_model_prediction
    #evaluate_model_and_show_graph(x_in,y_in,model)
    print("You are running a Sequential model")
    return model
def execute_Random_forest_model(model,df_x,df_y): 
    """Fit ``model`` on ``(df_x, df_y)``, print which model type is running and return it."""
    #gridSearch_Random_forest(df_x, df_y,model)
    forest = model
    forest.fit(df_x, df_y)
    print("You are running a RandomForest model")
    return forest
def execute_SVM_regression_model(model,df_x,df_y):
    """Fit ``model`` on ``(df_x, df_y)``, print which model type is running and return it."""
    regressor = model
    regressor.fit(df_x, df_y)
    print("You are running a SVM-Regression model")
    return regressor
def execute_XGBOOST_model(model,df_x,df_y):
    """Fit ``model`` on ``(df_x, df_y)``, print which model type is running and return it."""
    #gridSearch_xgBoost(df_x,df_y,model)
    booster = model
    booster.fit(df_x, df_y)
    print("You are running a XGBOOST model")
    return booster
    
    

# Running models

In [None]:
# Module-level holders for the held-out split. EXECUTE_MODEL assigns them (via
# `global`) for the scikit-learn / XGBoost models; the evaluation cell reads them.
x_test=None
y_test=None
print(df.shape)
print(type(model))
#input()
def EXECUTE_MODEL(model,df):
    """Train ``model`` with the routine that matches its type.

    For ``RandomForestRegressor``, ``SVR`` and ``XGBRegressor`` instances the
    frame is indexed and sorted by ``timestamp_from``, split by position
    (first 80 % for training), the held-out part is published through the
    module-level ``x_test`` / ``y_test`` variables, and the model is fitted
    by the corresponding ``execute_*`` function. Any other model is handed to
    ``execute_Sequential_model``, which does its own splitting and does not
    touch the module-level variables.

    Args:
        model: Model instance to train.
        df (pandas.DataFrame): Data with ``timestamp_from`` and ``bike_count``
            columns. It is modified in place (``timestamp_from`` becomes the
            index); the feature-importance cell depends on that.

    Returns:
        The trained model.
    """
    global x_test
    global y_test

    def _chronological_split(frame):
        # Return (x_train, y_train, x_test, y_test) for a time-ordered 80/20 split.
        frame.set_index('timestamp_from', inplace=True)
        frame = frame.sort_values(by=['timestamp_from'])
        #set bike_count as Y
        df_y = frame.bike_count
        df_x = frame.drop(columns="bike_count")
        #Prepare train & test dataset
        test_size = 0.2
        total = df_x.shape[0]
        train_idx = round((1-test_size) * total)
        # NOTE(review): the test slice skips row `train_idx` and the last row. It
        # is left unchanged so stored predictions / ground truth keep their
        # current length - confirm before fixing.
        return (
            df_x[0:train_idx].values,
            df_y[0:train_idx].values,
            df_x[train_idx+1:total-1].values,
            df_y[train_idx+1:total-1],
        )

    # Public class names are used for the type checks: the private module paths
    # (sklearn.ensemble.forest, sklearn.svm.classes) do not exist in newer scikit-learn.
    runners_by_type = (
        (RandomForestRegressor, execute_Random_forest_model),
        (SVR, execute_SVM_regression_model),
        (xgb.XGBRegressor, execute_XGBOOST_model),
    )
    for model_type, runner in runners_by_type:
        if isinstance(model, model_type):
            x_train, y_train, x_test, y_test = _chronological_split(df)
            return runner(model, x_train, y_train)

    return execute_Sequential_model(model,df)
    
    
#print(type(model))
#print(df)
# Train the selected model; for the scikit-learn / XGBoost models this also
# fills the module-level x_test / y_test used by the next cell.
model=EXECUTE_MODEL(model,df)

# Feature Importances 

In [None]:
# The public class name is used for the check: the private module path
# sklearn.ensemble.forest does not exist in newer scikit-learn releases.
if isinstance(model,RandomForestRegressor):
    # `df` had timestamp_from moved into its index by EXECUTE_MODEL, so these
    # are the columns the model was trained on.
    feature_list = list(df.drop(columns="bike_count").columns)
    importances = list(model.feature_importances_)
    # List of tuples with variable and importance
    feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
    # Sort the feature importances by most important first
    feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
    # Print out the feature and importances
    for pair in feature_importances:
        print('Variable: {:20} Importance: {}'.format(*pair))

    def evaluate_random_forest_model(x_test,y_test,min_value=None,max_value=None):
        """Print MSE / MAE on the held-out data, pickle predictions and ground truth, and plot both.

        ``min_value`` / ``max_value`` restrict the rows used for the metrics,
        the stored files and the plot (None means no bound).
        """
        predictions = model.predict(x_test)
        mse = mean_squared_error(predictions[min_value:max_value], y_test[min_value:max_value])
        mae=mean_absolute_error(predictions[min_value:max_value], y_test[min_value:max_value])
        print("MSE",mse)
        print("MAE",mae)
        plt.rcParams['figure.figsize'] = [18, 18]
        #---SAVING PREDICTIONS DATAFRAME
        pred_df=pd.DataFrame(predictions)
        save_model(pred_df,'RandomForestRegressor',min_value,max_value)
        # Ground truth with a positional index so it lines up with pred_df.
        y_test_df=y_test.reset_index(inplace=False)
        y_test_df=y_test_df.drop(columns="timestamp_from")
        save_model(y_test_df,'Ground_t',min_value,max_value)
        #---END SAVING PREDICTIONS DATAFRAME

        plt.plot(y_test_df[min_value:max_value], 'g')
        plt.plot(pred_df[min_value:max_value], 'r', alpha=0.7)
        plt.legend(['Ground truth', 'Predicted'])
        plt.show()
    evaluate_random_forest_model(x_test,y_test)
# The public class name is used for the check: the private module path
# sklearn.svm.classes does not exist in newer scikit-learn releases.
if isinstance(model,SVR):
    def evaluate_random_SVM_model(x_test,y_test,min_value=None,max_value=None):
        """Print MSE / MAE on the held-out data, pickle predictions and ground truth, and plot both.

        ``min_value`` / ``max_value`` restrict the rows used for the metrics,
        the stored files and the plot (None means no bound).
        """
        predictions = model.predict(x_test)
        mse = mean_squared_error(predictions[min_value:max_value], y_test[min_value:max_value])
        mae=mean_absolute_error(predictions[min_value:max_value], y_test[min_value:max_value])
        print("MSE",mse)
        print("MAE",mae)
        plt.rcParams['figure.figsize'] = [18, 18]
        #---SAVING PREDICTIONS DATAFRAME
        pred_df=pd.DataFrame(predictions)
        save_model(pred_df,'SVR',min_value,max_value)
        # Ground truth with a positional index so it lines up with pred_df.
        y_test_df=y_test.reset_index(inplace=False)
        y_test_df=y_test_df.drop(columns="timestamp_from")
        save_model(y_test_df,'Ground_t',min_value,max_value)
        #---END SAVING PREDICTIONS DATAFRAME
        plt.plot(y_test_df[min_value:max_value], 'g')
        plt.plot(pred_df[min_value:max_value], 'r', alpha=0.7)
        plt.legend(['Ground truth', 'Predicted'])
        plt.show()
    evaluate_random_SVM_model(x_test,y_test)
if isinstance(model,xgboost.sklearn.XGBRegressor):
    def evaluate_xgboost_model(x_test,y_test,min_value=None,max_value=None):
        """Print MSE / MAE on the held-out data, pickle predictions and ground truth, and plot both.

        Negative predictions are replaced by 0 before scoring.
        ``min_value`` / ``max_value`` restrict the rows used for the metrics,
        the stored files and the plot (None means no bound).
        """
        predictions = model.predict(x_test)
        print(type(predictions))

        # Negative predictions are clamped to 0.
        predictions[predictions<0]=0
        mse = mean_squared_error(predictions[min_value:max_value], y_test[min_value:max_value])
        mae=mean_absolute_error(predictions[min_value:max_value], y_test[min_value:max_value])
        print("MSE",mse)
        print("MAE",mae)
        plt.rcParams['figure.figsize'] = [18, 18]
        #---SAVING PREDICTIONS DATAFRAME
        pred_df=pd.DataFrame(predictions)
        save_model(pred_df,'XGBRegressor',min_value,max_value)
        # Ground truth with a positional index so it lines up with pred_df.
        y_test_df=y_test.reset_index(inplace=False)
        y_test_df=y_test_df.drop(columns="timestamp_from")
        save_model(y_test_df,'Ground_t',min_value,max_value)
        #---END SAVING PREDICTIONS DATAFRAME
        plt.plot(y_test_df[min_value:max_value], 'g')
        plt.plot(pred_df[min_value:max_value], 'r', alpha=0.7)
        plt.legend(['Ground truth', 'Predicted'])
        plt.show()
    evaluate_xgboost_model(x_test,y_test)
   

In [None]:
 # Elapsed wall-clock time since `start` was recorded in the first cell.
 print(dt.now()-start)