# Imports

In [1]:
# Import some globally used libraries
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import sqlite3 as sql
import seaborn as sns
import geopandas
from math import radians, cos, sin, asin, sqrt
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from scipy.stats import zscore
from PIL import Image


# Constants

In [2]:
# Folder holding the data files of the project
DATA_PATH = os.path.join('.', 'data')

# Folder holding the images
IMG_PATH = os.path.join('.', 'img')

# SQLite database file
SQLITE_DATAFILE = os.path.join(DATA_PATH, 'capstone-db.sqlite')

# NPZ data file
NPZ_DATAFILE = os.path.join(DATA_PATH, 'capstone-data.npz')

# NPZ normalized data file
NPZ_NORMALIZED_DATAFILE = os.path.join(DATA_PATH, 'capstone-data-normalized.npz')

# SQL table names of the final datasets
STATION_TABLENAME = 'stations'
TRAVEL_TABLENAME = 'travel_improved'
WEATHER_CAT_TABLENAME = 'weather_cat_improved'
WEATHER_NUM_TABLENAME = 'weather_num_improved'

# Default display options applied to the pandas library
PANDAS_DISPLAY_WIDTH = 100
PANDAS_DISPLAY_MAX_COLUMN = 50
PANDAS_DISPLAY_MAX_ROWS = 15

# Default share of the data assigned to the train set
TRAIN_SIZE_DEFAULT = 0.8

# 'frac' value used when loading data for ML training
FRAC_VALUE_FOR_ML = 0.1

# Seed passed as random_state so that random results are reproducible
RANDOM_STATE = 47

# Default settings

In [3]:
# Apply the display defaults defined in the Constants section to pandas:
# output width, maximum number of columns and maximum number of rows shown
pd.set_option("display.width", PANDAS_DISPLAY_WIDTH)
pd.set_option("display.max_columns", PANDAS_DISPLAY_MAX_COLUMN)
pd.set_option("display.max_rows", PANDAS_DISPLAY_MAX_ROWS)



# Data functions

In [4]:
def load_csv(dataset = 'travel', low_memory=False) -> pd.DataFrame:
    """
    Rebuild a complete dataset from the CSV part files stored in the {DATA_PATH} folder.

    Every file of {DATA_PATH} whose name starts with "nyc_{dataset}" and ends with ".csv"
    is read and the resulting dataframes are concatenated, e.g. nyc_travel_1.csv,
    nyc_travel_2.csv, ...

    The part files are processed in sorted (lexicographic) filename order, so the row
    order of the result is the same on every run and every platform.

    Note: the match is a plain prefix test, so a dataset name that is the prefix of
    another one (e.g. 'travel' and 'travel_improved') selects the files of both.

    Parameters
    ----------
    dataset : str
        Name of the dataset, used to build the "nyc_{dataset}" filename prefix.
    low_memory : bool
        Passed as is to pandas.read_csv().

    Returns
    -------
    pandas.DataFrame
        The concatenation of all matching files. The index of each part is kept as is.

    Raises
    ------
    ValueError
        Raised by pandas.concat() when no file matches the dataset name.
    """

    # os.listdir() returns entries in arbitrary order: sort them to get a
    # deterministic concatenation order
    part_files = sorted(
        filename
        for filename in os.listdir(DATA_PATH)
        if filename.startswith(f'nyc_{dataset}') and filename.endswith('.csv')
    )

    return pd.concat(
        [
            pd.read_csv(os.path.join(DATA_PATH, filename), low_memory=low_memory)
            for filename in part_files
        ]
    )


In [5]:
# Open the SQLite database file once, when this cell is executed; the resulting
# connection object is the one returned by get_sql_connection()
print("Opening connection to database")
sql_database_connection=sql.connect(SQLITE_DATAFILE)


def get_sql_connection() -> sql.Connection:
    """
    Give access to the SQLite connection opened on the SQL database file of the project.

    The same module-level connection object is handed out on every call; no new
    connection is opened here.

    This is a convenience function meant to be used by load_sql() and save_sql()
    rather than called directly.

    Returns:
    --------
    sqlite3.Connection

    """

    # The connection is created once at module level and shared by all callers
    shared_connection = sql_database_connection
    return shared_connection


Opening connection to database


In [6]:

def pythagore(lat_1, long_1, lat_2, long_2) -> float:
    """
    Squared distance between two locations given by their latitude and longitude,
    computed with the Pythagorean formula directly on the coordinate values.

    Note: the square root is deliberately not taken, to reduce the number of operations.
    The value is used to find the nearest location, not to measure an absolute distance.

    This function is registered on the SQLite connection object through
    sqlite3.Connection.create_function, which makes it callable from the different
    SQL queries of the project.

    Returns:
    --------
    float

    """
    # Difference along each axis
    delta_lat = lat_1 - lat_2
    delta_long = long_1 - long_2

    return delta_lat**2 + delta_long**2


# Register pythagore() on the shared connection as an SQL function taking 4 arguments,
# so that queries run on this connection can call pythagore(lat_1, long_1, lat_2, long_2)
print("Add pythagore() function to SQLite engine")
con=sql_database_connection
con.create_function('pythagore', 4, pythagore)



Add pythagore() function to SQLite engine


In [7]:
def drop_table(tablename, sql_connection=None) -> bool:
    """
    Drop a table registered in the SQL database.
    Returns True if the DROP operation and its commit are successful, False otherwise.

    The statement used is 'DROP TABLE IF EXISTS', so dropping a table that does not
    exist is also reported as a success.

    The sql_connection parameter can be used to pass a connection object to the database
    where the table should be dropped. If set to None (default value), the connection used
    is the one returned by the get_sql_connection() function.

    Note: 'tablename' is inserted as is in the SQL statement (identifiers cannot be bound
    as query parameters), so it must come from a trusted source.

    Code adapted from: https://www.tutorialspoint.com/python_data_access/python_sqlite_drop_table.htm

    Returns:
    --------
    bool

    """

    # Get SQLite connection if not provided
    if sql_connection is None:
        sql_connection = get_sql_connection()

    # Create a cursor object to run the statement
    cursor = sql_connection.cursor()

    try:
        # Drop the table if it exists, then commit the change
        cursor.execute(f'DROP TABLE IF EXISTS {tablename}')
        sql_connection.commit()
    except Exception as e:
        print(f"Error while dropping table: {tablename}")
        print(e)
        return False
    finally:
        # Release the cursor whatever the outcome
        cursor.close()

    # Report success only once the change is committed
    print(f"'{tablename}' table dropped from SQL database")
    return True


In [8]:
def save_sql(dataset, tablename = 'travel', sql_connection=None, if_exists = 'replace') -> bool:
    """
    Dump the dataset passed as parameter in an SQL table named 'tablename'.

    The 'if_exists' parameter has the same usage as in the pandas to_sql() method, its value
    is passed as is to the to_sql() call. The dataframe index is not written to the table.

    Returns True if the saving process completed, False if an error was raised while
    writing (the error is printed, not re-raised).

    The sql_connection parameter can be used to pass a connection object to the database
    where the saving process should be done. If set to None (default value), the connection
    used is the one returned by the get_sql_connection() function.

    Returns
    -------
    bool
    """

    # Get SQLite connection if not provided
    if sql_connection is None:
        sql_connection = get_sql_connection()

    # Write dataset content to SQL database
    try:
        dataset.to_sql(f'{tablename}', sql_connection, if_exists=if_exists, index=False)
    except Exception as e:
        print("[ERROR] while writting data to SQL datafile")
        print(e)
        return False

    # Reached only when no error occurred. A 'return' placed in a 'finally' clause
    # would override the 'return False' above and always report a success.
    print("Saving OK")
    return True
    


In [9]:
def load_sql(tablename='travel', query=None, limit=None, offset=0, verbose=True, sql_connection=None) -> pd.DataFrame:
    """

    Load data from the SQL database and return the result as a dataframe.

    This function has two different behaviours, depending on the 'query' parameter.

    - query=None: the whole content of the 'tablename' table is loaded into a pd.DataFrame
      using the following query: 'SELECT * FROM {tablename}'

    - query=A valid SQL query: the dataframe returned by the function is filled in with the
      data retrieved using the 'query' SQL command. The 'tablename' parameter is ignored.

    It accepts two optional int parameters, limit and offset. If 'limit' is set,
    ' LIMIT limit OFFSET offset' is appended to the query. This is mostly used to retrieve
    a small subset of the table or query. The 'OFFSET' directive is only added if 'limit'
    is set.

    The verbose parameter, if set to True, displays the query that will be used.
    Default value is True.

    The sql_connection parameter can be used to pass a connection object to the database
    the data should be read from. If set to None (default value), the connection used is
    the one returned by the get_sql_connection() function.

    Returns
    -------
    pandas.DataFrame
        None is returned instead if an error is raised while running the query
        (the error is printed, not re-raised).

    """

    # Get SQLite connection if not provided
    if sql_connection is None:
        sql_connection = get_sql_connection()

    # Default query: whole content of the table
    if query is None:
        query=f'SELECT * FROM {tablename}'

    # Restrict the result to a subset of rows if requested
    if limit is not None:
        query+=' LIMIT {} OFFSET {}'.format(limit, offset)

    # Print query if verbose is set
    if verbose:
        print("Query: {}".format(query))

    # Run SQL 'query'. No 'return' in a 'finally' clause here: it would also swallow
    # KeyboardInterrupt and every other exception not derived from Exception.
    try:
        return pd.read_sql_query(query, sql_connection)
    except Exception as e:
        print("[ERROR] while running SQL query:", query)
        print(e)
        return None


In [10]:
def get_slice_list(dataset, slice_interval=100000) -> list :
    """
    Returns a list of tuples representing slices of lines of the dataset
    passed as first parameter.

    The size of the slices is set by the 'slice_interval' parameter (default is 100'000)

    The dataset passed as parameter is only used to determine the number of lines
    (length of its index).

    The list of tuples returned looks like:
      [(0,99999), (100000,199999), (200000, 299999), ..., (1400000, <number of lines>)]

    Every tuple but the last one ends on the last line of its slice (start + slice_interval - 1),
    whereas the last tuple ends on the number of lines itself. An empty dataset gives an
    empty list.

    Returns:
    --------
    list

    Raises:
    -------
    ValueError
        If 'slice_interval' is not strictly positive (the loop below would never end).

    """

    # A null or negative step never reaches the end of the dataset
    if slice_interval <= 0:
        raise ValueError("slice_interval must be strictly positive, got {}".format(slice_interval))

    # slices list that the function will return
    intervals=[]

    # Get number of lines
    nb_lines=len(dataset.index)

    # First line of the current slice
    i=0

    # Loop while the start of the slice is lower than the number of lines
    while i < nb_lines:
        if i+slice_interval < nb_lines:
            intervals.append((i, i+slice_interval-1))
        else:
            # Last slice: ends on the number of lines
            intervals.append((i,nb_lines))
        i+=slice_interval

    return intervals


In [11]:
def save_npz(dataset, y_column='km_per_hour', y_dtype='float', npz_filename=NPZ_NORMALIZED_DATAFILE) -> None:
    """
    Save the dataset passed as parameter in a compressed NPZ file, together with the
    datasets stored in the SQL database of the project.

    'y_column' is the name of the feature of 'dataset' that holds the Y vector
    (defaults to 'km_per_hour') and 'y_dtype' the type recorded for it (defaults to 'float').

    Content of the NPZ file:

    - features_numerical / features_categorical: column names of 'dataset' (without
      'y_column') having respectively a 'float' and an 'int' dtype
    - features_y / features_y_dtype: dependent variable name and its dtype, each stored
      as a list with one element
    - dataset_full / dataset_full_columns: the dataset passed as parameter and its column names
    - dataset_stations, dataset_travel, dataset_weather_cat, dataset_weather_num, each with
      its matching '<name>_columns' entry: content and column names of the SQL tables
      {STATION,TRAVEL,WEATHER_CAT,WEATHER_NUM}_TABLENAME, read with load_sql()

    Returns:
    --------
    None

    """

    # Feature names are read from a one-row sample, dependent variable excluded
    feature_sample = dataset.head(1).drop(y_column, axis=1)

    # Numerical features are the 'float' columns
    numerical_features = feature_sample.select_dtypes('float').columns
    print("Numerical features: ", ','.join(numerical_features))

    # Categorical features are the 'int' columns
    categorical_features = feature_sample.select_dtypes('int').columns
    print("Categorical features: ", ','.join(categorical_features))

    # Content of the NPZ file, starting with the feature names and the full dataset
    print("Build dict to pass to savez_compressed...")
    npz_content = {
        'features_numerical': list(numerical_features),
        'features_categorical': list(categorical_features),
        'features_y': [y_column],
        'features_y_dtype': [y_dtype],
        'dataset_full': dataset,
        'dataset_full_columns': list(dataset.columns),
    }

    # Then every SQL table: its whole content first, its column names next
    # (the column names are taken from a one-row query)
    sql_tables = (
        ('stations', STATION_TABLENAME),
        ('travel', TRAVEL_TABLENAME),
        ('weather_cat', WEATHER_CAT_TABLENAME),
        ('weather_num', WEATHER_NUM_TABLENAME),
    )
    for short_name, sql_tablename in sql_tables:
        npz_content[f'dataset_{short_name}'] = load_sql(sql_tablename)
        npz_content[f'dataset_{short_name}_columns'] = list(load_sql(sql_tablename, limit=1).columns)

    # Write everything to the NPZ file
    print("Save dict to NPZ file", npz_filename)
    np.savez_compressed(npz_filename, **npz_content)

    print("Process terminated")


In [12]:
def load_npz_as_dict(dataset='full', frac=1, random_state=RANDOM_STATE, verbose=True, y_dtype='float', npz_filename=NPZ_NORMALIZED_DATAFILE) -> dict:
    """
    This function returns one of the datasets stored in the NPZ file passed as parameter,
    and if the dataset requested is the full one, then its feature names are added to the
    dict returned by the function.

    The dict structure returned looks like this:
        - features:
            - numerical   (list, only if requested dataset is the full one)
            - categorical (list, only if requested dataset is the full one)
            - y           (str, only if requested dataset is the full one)
            - y_dtype     (str, only if requested dataset is the full one)
            - all         (list of all the column names of the dataset)
        - dataset

    The NPZ file passed should contain a Python dict built in Notebook No 17

    The dataset parameter is used to determine which dataset the function should return.
    Default value is 'full'

    The frac parameter, if < 1, will be used as parameter to the sample() method to keep only
    a fraction of the dataset. frac=0.1 means 10% of the lines.

    When frac is used, the random_state parameter could be used to change the random seed
    of the sample() method. Default is RANDOM_STATE.

    The y_dtype parameter is not used: the dtype applied to the y column is the one
    stored in the NPZ file. It is kept for compatibility with existing callers.

    Note: This function takes care of reapplying dtypes to numerical and categorical columns
    before returning dataframe (this is lost in save/load NPZ process)

    Returns:
    --------
    dict

    """

    # Load the NPZ file
    if verbose:
        print(f"Loading dataset '{dataset}' from NPZ file", npz_filename)

    # Structure returned by the function
    ret_dict={
        'features': dict(),
        'dataset': None
    }

    with np.load(npz_filename) as npz_file:

        if dataset=='full':
            features=ret_dict['features']

            # Numerical and categorical feature names are always returned as lists.
            # Collapsing a single name to a str would make the dtype loops below
            # iterate over its characters instead of over column names.
            for feature in ['numerical', 'categorical']:
                features[feature]=list(npz_file[f'features_{feature}'])

            # The y feature is stored as a one-element list: return it as a str
            y_names=npz_file['features_y']
            features['y']=''.join(y_names) if len(y_names) == 1 else list(y_names)

            # Load y_dtype
            features['y_dtype']=''.join(npz_file['features_y_dtype'])

        ret_dict['features']['all']=list(npz_file[f'dataset_{dataset}_columns'])
        ret_dict['dataset']=pd.DataFrame(npz_file[f'dataset_{dataset}'], columns=ret_dict['features']['all'])

        # Apply dtype to full dataset
        if dataset=='full':
            # Build dtype column dictionary
            if verbose:
                print("Apply correct dtype to dataset column")
            column_dtype={ret_dict['features']['y']: ret_dict['features']['y_dtype']}
            for col in ret_dict['features']['numerical']:
                column_dtype[col] = 'float'
            for col in ret_dict['features']['categorical']:
                column_dtype[col] = 'int'
            if verbose:
                print(column_dtype)

            # Apply dtype to dataset columns
            ret_dict['dataset']=ret_dict['dataset'].astype(column_dtype)

        # sample dataframe if frac < 1
        if verbose:
            print(f"Building sample from dataset (frac={frac})")
        if frac < 1:
            ret_dict['dataset']=ret_dict['dataset'].sample(frac=frac, random_state=random_state)

        if verbose:
            print(' Dataset shape: {}'.format(ret_dict['dataset'].shape))
            print("\n")

    if verbose:
        print("Dataset loaded, returning dict")
    # Return the structure prepared by this function
    return ret_dict


In [13]:
def load_dataset(frac=1, random_state=RANDOM_STATE, verbose=True, y_dtype='float', npz_filename=NPZ_NORMALIZED_DATAFILE) -> tuple:
    """
    Convenience wrapper around load_npz_as_dict() that returns the full dataset together
    with its feature names and the name of its result vector, as a tuple.

    It keeps the loading code short in notebooks. For example, the following instruction
    stores the full dataset in the df variable and the rest of the tuple, as a list, in the
    features variable:

        df,*features=load_dataset()

    All parameters are passed as is to the load_npz_as_dict() function.

    Returned tuple is:
        - dataset
        - all features names (without y)
        - y result vector name
        - numerical feature names
        - categorical feature names

    Returns:
    --------
    tuple

    """

    loaded = load_npz_as_dict(
        frac=frac,
        random_state=random_state,
        verbose=verbose,
        y_dtype=y_dtype,
        npz_filename=npz_filename,
    )
    feature_names = loaded['features']
    target_name = feature_names['y']

    # The list of all the column names must not contain the result vector
    feature_names['all'].remove(target_name)

    return (
        loaded['dataset'],
        feature_names['all'],
        target_name,
        feature_names['numerical'],
        feature_names['categorical'],
    )


In [14]:
def load_Xy_as_dict(train_size=TRAIN_SIZE_DEFAULT, frac=1, random_state=RANDOM_STATE, verbose=True, npz_filename=NPZ_NORMALIZED_DATAFILE) -> dict:
    """
    Load the 'full' dataset and return its features and result vector as X and y np.array,
    for the complete dataset and for its split into a train set and a validation set.

    'train_size' sets the size of the train set (defaults to TRAIN_SIZE_DEFAULT). It is passed
    as is to 'sklearn.model_selection.train_test_split()', along with 'test_size=1-train_size'
    and 'random_state'.

    The value returned is a dict object:

        - train:
            - X:    Train set of X features
            - y:    Train set of y vector result

        - valid:
            - X:    Validation set of X features
            - y:    Validation set of y vector result

        - all:
            - X:    Complete set of X features
            - y:    Complete set of y vector result

        - features: List of feature names
        - result:   Name of the y result vector

    The 'full' dataset is retrieved with the 'load_dataset()' function, always called with
    verbose=False: only the messages of this function are displayed.

    Returns:
    --------
    dict

    """
    if verbose:
        print("Loading dataset...")
    full_df, feature_names, target_name, *_ = load_dataset(
        frac=frac,
        random_state=random_state,
        verbose=False,
        y_dtype='float',
        npz_filename=npz_filename,
    )

    if verbose:
        print("Splitting dataset...")
    train_df, valid_df = train_test_split(
        full_df,
        test_size=1-train_size,
        train_size=train_size,
        random_state=random_state,
    )

    def _as_xy(frame):
        # X is every column but the result vector, y is the result vector alone
        return {
            'X': frame.drop([target_name], axis=1).values,
            'y': frame[target_name].values
        }

    # Structure returned by the function
    xy_sets = {
        'train': _as_xy(train_df),
        'valid': _as_xy(valid_df),
        'all': _as_xy(full_df),
        'features': feature_names,
        'result': target_name
    }

    if verbose:
        print("Load and split process terminated")

        print("  Shape of X train variable:", xy_sets['train']['X'].shape)
        print("  Shape of y train variable:", xy_sets['train']['y'].shape)

        print("  Shape of X valid variable:", xy_sets['valid']['X'].shape)
        print("  Shape of y valid variable:", xy_sets['valid']['y'].shape)

        print("  Shape of X variable   :", xy_sets['all']['X'].shape)
        print("  Shape of y variable   :", xy_sets['all']['y'].shape)

    return xy_sets


In [15]:
def load_Xy(train_size=TRAIN_SIZE_DEFAULT, frac=1, random_state=RANDOM_STATE, verbose=True, npz_filename=NPZ_NORMALIZED_DATAFILE) -> tuple:
    """
    A wrapper function around load_Xy_as_dict() that returns X_tr, y_tr, X_va and y_va as a tuple.

    This function aims to simplify the code in Notebooks.

    All parameters, 'verbose' included, are passed as is to load_Xy_as_dict(): pass
    verbose=False to load the data without any progress message.

    Returns:
    --------
    (X_tr, y_tr, X_va, y_va)

    """

    # Forward 'verbose' instead of forcing it to False, so that the parameter is honored
    data=load_Xy_as_dict(train_size=train_size, frac=frac, random_state=random_state, verbose=verbose, npz_filename=npz_filename)

    return data['train']['X'], data['train']['y'], data['valid']['X'], data['valid']['y']


# Weather Station functions

In [16]:
def get_stations(station_list=None) -> pd.DataFrame:
    """
    Returns a dataframe containing the list of stations, read from the 'stations'
    table of the SQL database.

    The 'station_list' parameter can be used to get a subset of the stations dataset. It is
    a list containing the values of the STATION column we'd like to retrieve from the
    database. If it is None or empty (default), all the stations are returned.

    Single quotes found in the station names are escaped before being inserted in the
    SQL query.

    Returns:
    --------
    pandas.DataFrame

    """

    # No filter requested: return the whole table
    if station_list is None or len(station_list) == 0:
        return load_sql('stations', verbose=False)

    # Double the single quotes so that a name cannot break out of its SQL string literal
    quoted_names="','".join(str(name).replace("'", "''") for name in station_list)
    query="select * from stations where STATION IN ('{}')".format(quoted_names)
    return load_sql(query=query, verbose=False)


In [17]:
def get_nearest_station_from_location(latitude, longitude) -> str:
    """
    Returns the id of the station that is the nearest from the latitude and longitude
    position received as parameter.

    This function assumes that the SQL table 'stations' exists.
    !! Do not use this function before running the 83 Weather Stations notebook

    The nearest station is found with an SQL query that computes, with the pythagore() SQL
    function, the distance between every station and the designated location, sorts the
    stations by that distance in ascending order and keeps the first one only.

    Returns:
    --------
    str

    """

    # Closest station first, only one row returned
    nearest_query = (
        "SELECT station, latitude, longitude, pythagore(latitude, longitude, {}, {}) as distance FROM stations".format(latitude, longitude)
        + " ORDER BY distance ASC LIMIT 1"
    )

    # Run the query on the connection of the SQL database
    nearest_station = load_sql(query=nearest_query, sql_connection=get_sql_connection(), verbose=False)

    # STATION cell of the first (and only) row
    return nearest_station.loc[0]['STATION']


# Drawing function

In [18]:
def plot_rectangle(lower_left=[1,1], upper_right=[2,2], color='r', linestyle='-', linewidth=1 ):
    """
    Draw the four edges of a rectangle in the current pyplot canvas, based on two points:
    - lower_left is the lower left corner of the rectangle, defined as [x,y]
    - upper_right is the upper right corner of the rectangle, defined as [x,y]

    color, linestyle and linewidth are passed as is to every plt.plot() call.

    Returns:
    None
    """

    left, bottom = lower_left[0], lower_left[1]
    right, top = upper_right[0], upper_right[1]

    # One (x values, y values) pair per edge
    edges = (
        ([left, left], [bottom, top]),       # left edge
        ([right, right], [bottom, top]),     # right edge
        ([left, right], [bottom, bottom]),   # bottom edge
        ([left, right], [top, top]),         # top edge
    )

    for x_values, y_values in edges:
        plt.plot(x_values, y_values, color=color, linestyle=linestyle, linewidth=linewidth)


In [19]:
def get_nyc_map_geodataframe() -> geopandas.geodataframe.GeoDataFrame:
    """
    Return the Geopandas Dataframe that will be used to draw the nyc map.

    The map is read from the 'nyu_2451_34490' entry of the {DATA_PATH} folder.

    Returns:
    --------
    geopandas.geodataframe.GeoDataFrame
    """

    # Build the location from DATA_PATH, like every other data file of the project,
    # instead of hard-coding the name of the data folder
    return geopandas.read_file(os.path.join(DATA_PATH, 'nyu_2451_34490'))



In [20]:
def draw_nyc_map(latitude=[], longitude=[]):
    """
    Minimalist function that draws a map of New-York city, with optional data points on it.

    The map definition is based on the shape archive available at:
    https://archive.nyu.edu/retrieve/74689/nyu_2451_34490.zip

    If 'latitude' is not empty, 'latitude' and 'longitude' are used as the coordinates of
    points drawn in red on top of the map. If it is empty (default), the map is drawn alone.

    Returns:
    --------
    None
    """

    # Load the shape of New-York and convert it to EPSG:4326, the coordinate
    # system of the positions found in our datasets
    city_shape = get_nyc_map_geodataframe().to_crs(epsg=4326)

    # Drawing options of the map
    map_options = dict(color='lightblue', edgecolor='black', figsize=(10,10), alpha=0.5)

    if len(latitude) > 0:
        # Dataframe of the positions to draw
        positions = pd.DataFrame({'Latitude': latitude, 'Longitude': longitude})

        # Turn positions into point geometries (x is the longitude, y the latitude)
        markers = geopandas.GeoDataFrame(
            positions,
            geometry=geopandas.points_from_xy(positions.Longitude, positions.Latitude)
        )

        # Draw the map first, then the data points on the same axes
        map_axes = city_shape.plot(**map_options)
        markers.plot(ax=map_axes, color='red')
    else:
        # No data point: draw the map alone
        city_shape.plot(**map_options)

    plt.show()


In [21]:
def draw_correlation_matrix(dataset, title='Correlation Matrix', figsize=(10,10), fontsize=10,
                                sns_style='darkgrid', center=0.5) -> None:
    """
    Draw the correlation matrix of a dataset as a Seaborn heatmap, for a better visual
    search of data correlation.

    The matrix drawn is the result of dataset.corr(); the heatmap is annotated and uses
    the 'Blues' colormap, centered on the 'center' value.

    Parameters:
    -----------
    dataset  : object providing a corr() method (e.g. a pandas.DataFrame)
    title    : title of the figure, drawn in bold with a font size of 2*fontsize
    figsize  : size of the figure, passed as is to plt.figure()
    fontsize : font size of the X and Y tick labels
    sns_style: accepted but not used by the body of this function
    center   : passed as is to the 'center' parameter of sns.heatmap()

    A new figure is created on each call; plt.show() is not called here.

    Returns:
    --------
    None
    
    """
    
    # Set figure figsize parameter
    plt.figure(figsize=figsize)
    
    # Draw heatmap
    sns.heatmap(dataset.corr(), annot=True, center=center, cmap='Blues')
    
    # Set title
    plt.title(title, weight="bold", fontsize=2*fontsize, pad=30)
    
    # Set X and Y labels font size
    plt.xticks(weight="bold", fontsize=fontsize)
    plt.yticks(weight="bold", fontsize=fontsize)


In [22]:
def draw_scatter_plot(dataset, x_columns, y_column, z_factor=2, graph_per_line=4, figsize=(25,15), ylim=[0,2], polyfit_deg=0) -> None:
    '''
    Draw one scatter plot per column name found in 'x_columns', using that column of the
    'dataset' dataframe as X-axis and the 'y_column' column as Y-axis.

    The 'graph_per_line' parameter fixes how many plots are displayed per line. The number of
    lines of scatter plots is equal to "( (len(x_columns)-1) / graph_per_line ) + 1", rounded
    down. The Y-axis is shared between the plots when there is more than one line.
    Any value of 'graph_per_line' >= 1 is supported, including 1.

    Before plotting a column, this function removes the lines where the x value is == 0, then
    removes outliers on both columns using a 'zscore' approach: a line is kept only if the
    absolute zscore of both its x and y values is lower than 'z_factor'.

    Figure size and Y-axis limit can be set using the 'figsize' and 'ylim' parameters.

    The last parameter, 'polyfit_deg', if between 1 and 3, determines the highest degree of the
    polynomial fits calculated and drawn along with the scatter plots (one curve per degree,
    from 1 to polyfit_deg). Other values draw no fit.


    Returns:
    --------
    None

    '''

    nb_columns=len(x_columns)
    nb_line=int(((nb_columns-1)/graph_per_line)+1)

    print("Drawing graphs {} x {} (number of x_columns = {})".format(graph_per_line, nb_line, nb_columns))

    # Create Figure and Axes instances.
    # squeeze=False makes plt.subplots() always return a 2-D array of Axes, so that the same
    # [line, column] indexing works for a single line as well as for a single column.
    fig, axs = plt.subplots(
        max(nb_line, 1),
        graph_per_line,
        sharey=(nb_line > 1),
        squeeze=False,
        figsize=figsize
    )

    for position, c in enumerate(x_columns):

        # Log information
        print(f'Plotting feature: {c}')

        # Remove 0 values
        temp_df = dataset[dataset[c]!=0][[c, y_column]]

        # Calculate zscore of each cell of the two columns
        zscore_values = zscore(temp_df[[c,y_column]])

        # Build an array of booleans where True = np.abs(zscore) is lower than factor
        zscore_bool = np.abs(zscore_values) < z_factor

        # all(axis=1) gives one boolean per line, True only if all the cells of the
        # corresponding zscore_bool line are True. It is used to filter the temp dataframe.
        temp_df = temp_df[zscore_bool.all(axis=1)]

        # Axes of this feature: fill the grid line by line
        graph_line, graph_col = divmod(position, graph_per_line)
        ax=axs[graph_line, graph_col]

        x=temp_df[c]
        y=temp_df[y_column]

        # Make plot, set axes labels
        ax.scatter(x, y, s=10, c=y, cmap='viridis')
        ax.set_ylabel(y_column)
        ax.set_xlabel("{} - ({} lines)".format(c, x.shape[0]))
        ax.set_title("({}) / ({})".format(c,y_column))
        ax.set_ylim(ylim)

        # Draw polyfit from deg=1 to deg=polyfit_deg
        # Do nothing if polyfit_deg == 0 or > 3
        if polyfit_deg > 0 and polyfit_deg < 4:
            colors=['r','b','g']

            for i,color in enumerate(colors[:polyfit_deg]):
                coef=np.polyfit(x,y,deg=i+1)
                x_values=np.linspace(x.min(), x.max())
                y_values=np.polyval(coef, x_values)
                ax.plot(x_values, y_values, label='poly(deg={})'.format(i+1), c=color)
                ax.legend()

    # set the spacing between subplots
    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.9,
                        wspace=0.2,
                        hspace=0.3)

    print("Processing done, display result (may take some time)")

    # display graph
    plt.show()


# Datetime & Distance functions

In [23]:
def get_days(start='2016-01-01', end='2016-06-30') -> list:
    """
    Function used to get the list of days, encoded as '%Y-%m-%d', between a start day and
    an end day, both included.

    Start and end day are passed as parameters to the function. Default values are:
    - start : '2016-01-01'
    - end   : '2016-06-30'

    An empty list is returned if the start day is after the end day.


    Returns:
    --------
    list

    Raises:
    -------
    ValueError
        Raised by datetime.strptime() if 'start' or 'end' is not encoded as '%Y-%m-%d'.

    """
    day_format='%Y-%m-%d'
    first_day=datetime.strptime(start, day_format)
    last_day=datetime.strptime(end, day_format)

    # Number of days of the period, both ends included (zero or less if start > end).
    # Counting the days avoids computing the day that follows 'end', which does not
    # exist when 'end' is the last day supported by datetime.
    nb_days=(last_day - first_day).days + 1

    return [
        (first_day + timedelta(days=offset)).strftime(day_format)
        for offset in range(nb_days)
    ]


In [24]:
def is_weekend(value) -> int:
    """
    Tells whether the datetime64 value received as parameter falls on a week-end.

    The day of week is read from the 'dayofweek' attribute of the value: 1 is returned
    for a Saturday or a Sunday, 0 for any other day.

    Returns:
    --------
    int

    """

    # dayofweek lower than 5 means a working day, anything else is the week-end
    return 0 if value.dayofweek < 5 else 1

In [25]:
def get_time_category(value):
    """
    Returns the category of the day the time value received as parameter belongs to.
    Note that the parameter must provide a strftime() method (pandas datetime64 value).

    Categories, based on the hour of the value:
    - night     : before 6 and from 22
    - morning   : from 6 to 11
    - afternoon : from 12 to 17
    - evening   : from 18 to 21

    Returns:
    --------
    str

    """

    # Hour of the day, from 0 to 23
    hour = int(value.strftime('%H'))

    # Upper bound (excluded) of each category, in chronological order
    category_limits = (
        (6, 'night'),
        (12, 'morning'),
        (18, 'afternoon'),
        (22, 'evening'),
    )

    for upper_hour, category in category_limits:
        if hour < upper_hour:
            return category

    # From 22: night again
    return 'night'

In [26]:
def get_distance_in_km(lat1, lon1, lat2, lon2) -> pd.core.series.Series:
    """
    Great-circle distance between two sets of points on Earth, computed
    with the haversine formula.

    Latitudes and longitudes are expected in degrees. Every computation is
    done with numpy functions, so the four parameters can be pandas.Series
    holding the coordinates of many point pairs at once.

    The formula has been taken from
    https://www.geeksforgeeks.org/program-distance-two-points-earth/

    Returns:
    --------
    pandas.core.series.Series
        Distance in kilometers between (lat1, lon1) and (lat2, lon2).

    """
    # Radius of earth in kilometers. Use 3956 for miles
    earth_radius_km = 6371

    # Trigonometric functions below work on radians, not degrees
    lat1_rad, lon1_rad = np.radians(lat1), np.radians(lon1)
    lat2_rad, lon2_rad = np.radians(lat2), np.radians(lon2)

    delta_lat = lat2_rad - lat1_rad
    delta_lon = lon2_rad - lon1_rad

    # Haversine of the central angle between the two points
    haversine = (
        np.sin(delta_lat / 2)**2
        + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(delta_lon / 2)**2
    )
    central_angle = 2 * np.arcsin(np.sqrt(haversine))

    return central_angle * earth_radius_km


# Machine Learning functions

In [27]:
# Report which share of the datasets is used when training models
print(
    "Fraction of the dataset used to train models: {:.2f}%".format(
        100 * FRAC_VALUE_FOR_ML
    )
)

Fraction of the dataset used to train models: 10.00%


In [28]:
def get_model_filename(model_name) -> str:
    """
    Build the path of the file used to store on disk the model whose name
    is passed as parameter.

    The file is located in the DATA_PATH folder and is named
    'model-<model_name>.sav'.

    Returns:
    --------
    str

    """
    basename = f'model-{model_name}.sav'
    return os.path.join(DATA_PATH, basename)

In [29]:
import pickle
import tensorflow.keras as keras

def save_model(model, name) -> None:
    """
    Save on disk the fitted model passed as first parameter.

    keras.Sequential models are written with keras.models.save_model(),
    any other object is serialized with pickle. The destination is given
    by get_model_filename(name); an existing file is overwritten.

    Parameters:
    -----------
    model : fitted model to store
    name  : model name, forwarded to get_model_filename()

    Returns:
    --------
    None

    """
    filename=get_model_filename(name)
    # Save model to disk
    if isinstance(model, keras.Sequential):
        print("Saving model {} to {} using 'keras.models.save_model' library".format(name, filename))
        keras.models.save_model(model, filename, overwrite=True)
    else:
        print("Saving model {} to {} using 'pickle' library".format(name, filename))
        # The context manager flushes and closes the file even if pickling fails,
        # so no half-written file handle is left open
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

In [30]:
def load_model(name):
    """
    Load from disk the model whose name is passed as parameter.

    The file location is given by get_model_filename(name). The file is
    first read with pickle; when that fails, keras.models.load_model() is
    tried on the same path.

    Parameters:
    -----------
    name : model name, forwarded to get_model_filename()

    Returns:
    --------
    Fitted model, or None when neither loader was able to read the file
    (the reason is printed in that case).

    """
    filename=get_model_filename(name)
    # load the model from disk
    print("Loading model from ", filename)

    # NOTE(security): pickle.load() can run arbitrary code contained in the file.
    # Only load model files that were produced by save_model() on a trusted machine.
    try:
        with open(filename, 'rb') as model_file:
            model=pickle.load(model_file)
    except Exception:
        # Not a readable pickle file: fall through to the keras loader below.
        # Only Exception is caught so that Ctrl-C still interrupts the cell.
        pass
    else:
        print("Model loaded using pickle()")
        return model

    try:
        model=keras.models.load_model(filename)
    except Exception as error:
        # Keep returning None on failure, but say why instead of hiding the error
        print("Unable to load model from {}: {}".format(filename, error))
        return None

    print("Model loaded using keras.models.load_model()")
    return model

In [31]:
def mae(y_pred, y) -> np.array:
    """
    Compute the mean absolute error between the two result vectors passed
    as parameters, then raise 10 to that power.

    The error itself is computed by sklearn's mean_absolute_error(), which
    gives the same result whatever the order of the two vectors.

    Returns:
    --------
    10^(mean absolute error), as a scalar

    """
    absolute_error = mean_absolute_error(y_pred, y)
    return 10**absolute_error

In [32]:
def mape(y_pred, y) -> float:
    """
    Performance metric in percentage: 100 minus the mean absolute
    percentage error (MAPE) of the predictions.

    Each prediction error is divided by the absolute value of its own
    target before averaging:

        score = 100 - 100 * mean(|y - y_pred| / |y|)

    A perfect prediction therefore scores 100 and the score decreases as
    predictions move away from the targets.

    Parameters:
    -----------
    y_pred : array-like
        Predicted values.
    y : array-like
        Target values, used as the reference of the percentage. A target
        equal to 0 makes the score infinite or NaN.

    Both inputs are flattened and compared position by position, so pandas
    indexes are ignored and (n, 1) shaped predictions are accepted.

    Returns:
    --------
    mean absolute percentage score

    Raises:
    -------
    ValueError when the inputs are empty or do not hold the same number of values.

    """
    predicted = np.asarray(y_pred, dtype='float64').ravel()
    actual = np.asarray(y, dtype='float64').ravel()

    if predicted.shape != actual.shape:
        raise ValueError(
            "y_pred and y must hold the same number of values, got {} and {}".format(
                predicted.size, actual.size)
        )
    if actual.size == 0:
        raise ValueError("y_pred and y must not be empty")

    # Relative error of each sample with respect to its own target
    relative_errors = np.abs(actual - predicted) / np.abs(actual)

    # Return percentage value
    return float(100 - 100 * np.mean(relative_errors))


In [33]:
def _mape_scorer(y_true, y_pred) -> float:
    """
    Adapter between the scorer calling convention and mape().

    make_scorer() calls its score function as score_func(y_true, y_pred)
    whereas mape() is declared as mape(y_pred, y). The arguments are swapped
    here so that percentages are computed relative to the true values and not
    relative to the predictions.

    Returns:
    --------
    mean absolute percentage score

    """
    return mape(y_pred, y_true)


# Define custom scoring function using mape()
custom_scorer = make_scorer(_mape_scorer, greater_is_better=True)


In [34]:
def plot_grid_search_results(results_df,
                             x_param,
                             y_param=['mean_test_score', 'mean_train_score'],
                             semilogx=True,
                             xlabel='',
                             ylabel='Score (%)',
                             title='GridSearch results',
                             figsize=(15,10),
                             std_params={'mean_test_score': 'std_test_score'},
                             std_factor=1,
                             show_best_result=['mean_test_score'],
                             greater_is_best=True
                            ) -> None:
    """
    Function to graph data points from GridSearchCV results, used to graph
    the mean test score of a GridSearchCV fitted object.

    Mandatory parameters are:
        results_df: A dataframe built from GridSearchCV.cv_results_ property
        x_param: The column name of the results_df dataframe to be used as X axis.
                 Its values are converted to float64 before plotting.

    Optional parameters:
        y_param: A list of columns to be plotted on the Y axis.
        semilogx: If True, the X data points are plotted using a log10 scale
        xlabel: Label of the X axis
        ylabel: Label of the Y axis
        title: Title of the graph
        figsize: Size of the graph
        std_params: A dict with key=y_param element and value the corresponding
                    standard deviation column name.
                    This parameter is used to draw the std deviation of the
                    y_param as a filled area around the data plot
        std_factor: This parameter is used to amplify the standard deviation
                    when building the std dev filled area. Default value is 1 and
                    increasing it allows displaying 'small' standard
                    deviation behaviours.
                    Be warned that when changing this parameter to a value other
                    than 1, the filled area does not represent absolute values
                    but a trend of it.
        show_best_result: List of y_param elements for which the best point
                          is highlighted.
        greater_is_best: If True (default), the best point of a curve is its
                         highest value; if False, its lowest value (use it for
                         error-like scores).

    For each y_param listed in show_best_result, the function determines the
    best point of the curve and uses its coordinates to draw a red cross on
    the plotted line, along with horizontal and vertical lines to the X and
    Y axis.

    The results_df dataframe is first sorted using the x_param column in
    ascending order so that curves are drawn from left to right.

    The Y axis is limited to the range of the plotted values, widened by one
    standard deviation of each plotted column.

    Returns:
    --------
    None

    """
    # Order dataframe by x_param value
    sorted_df=results_df.sort_values(x_param)

    # X plots, shared by every curve
    x_values=sorted_df[x_param].astype('float64')

    # Define figsize
    plt.figure(figsize=figsize)

    # Lowest and highest Y values seen so far, used to set ylimit of the graph.
    # Infinite sentinels make the tracking independent of the score range
    # (percentages, negative errors, values above 100, ...)
    y_min=np.inf
    y_max=-np.inf

    # Loop for each y_param plot
    for column in y_param:
        # Get y plots
        y_values=sorted_df[column]

        # Find index label of the best y value
        if greater_is_best:
            best_idx=y_values.idxmax()
        else:
            best_idx=y_values.idxmin()

        # Get best x/y information
        best_x=x_values.loc[best_idx]
        best_y=y_values.loc[best_idx]

        # Widen the Y range by one standard deviation of the curve.
        # std() is NaN when there is a single data point: use no margin then
        margin=y_values.std()
        if np.isnan(margin):
            margin=0.0
        y_min=min(y_min, y_values.min()-margin)
        y_max=max(y_max, y_values.max()+margin)

        if semilogx:
            plt.semilogx(x_values, y_values, label=column)
        else:
            plt.plot(x_values, y_values, label=column)

        # Draw a cross on the best_x/best_y point
        if column in show_best_result:
            # Write near of the cross the best_x/best_y value
            plt.scatter(best_x, best_y, marker='x', c='red', zorder=10, label=f'{column} best result')
            plt.text(best_x, best_y, 'x:{:.3f} y:{:.3f}'.format(best_x, best_y), color='red')

            # Draw vertical/horizontal line to help read best score
            plt.axvline(best_x, color='grey', linestyle='--')
            plt.axhline(best_y, color='grey', linestyle='--')

        # Do we have any std_params set for this curve ?
        if column in std_params:
            std_column=std_params[column]
            # Get the std deviation values from std_params column name
            std_values=sorted_df[std_column]
            # plot a filled area to represent the standard deviation
            if std_factor==1:
                label=std_column
            else:
                label='{} x {}'.format(std_column, std_factor)
            plt.fill_between(x_values, y_values+std_factor*std_values, y_values-std_factor*std_values, alpha=0.4, label=label)

    plt.title(title)
    if semilogx:
        xlabel=f'log({xlabel})'
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # Limits stay infinite when nothing was plotted: let matplotlib decide then
    if np.isfinite(y_min) and np.isfinite(y_max):
        plt.ylim(bottom=y_min, top=y_max)
    plt.legend()
    plt.show()

In [35]:
def draw_mape_per_speed_interval(dataset, columns, y='y', start=5, stop=50, step=5, title=None, figsize=(15,10), barwidth=1.5, draw_limit=True) -> None:
    """
    Draw a bar graph of the mape() score per speed interval, with one group
    of bars per interval and one bar per entry of `columns`.

    Parameters:
    -----------
    dataset : dataframe (or dict of columns) holding predictions and targets
    columns : list of the dataset columns holding predictions; one bar
              series is drawn for each of them
    y : name of the dataset column holding the target values. Targets are
        compared to log10 of the interval bounds, so they are expected to be
        log10 of the speed in km/h
    start, stop, step : speed intervals are [i, i+step) for i in
                        range(start, stop, step). Defaults are 5, 50 and 5
    title : optional graph title. If set to None, a default title is set
    figsize : size of the graph. Default is (15,10)
    barwidth : width of each bar, in X axis units
    draw_limit : if True (default), draw a horizontal line at the best and
                 at the worst score found among all bars

    Intervals that contain no data are left without bar and are ignored when
    looking for the best and worst scores.

    Returns:
    --------
    None

    """

    # Plot index and result to get MAPE per km/h interval
    plt.figure(figsize=figsize)

    # Lower bound and tick label of each interval; they do not depend on the
    # plotted column, so they are built once
    lower_bounds=list(range(start, stop, step))
    tick_labels=[f'{low} to {low+step} km/h' for low in lower_bounds]
    bar_positions=np.array(lower_bounds, dtype='float64')

    # Offset to center bar on xticks
    offset=barwidth/2*(1-len(columns))

    # Best and worst scores found so far, None until a score is available
    best_y=None
    best_model=''
    poor_y=None
    poor_model=''

    for k in columns:

        # Build dataframe with y_pred and y_va
        df=pd.DataFrame({'y_pred': dataset[k], 'y_va': dataset[y]})

        scores=[]
        for low in lower_bounds:

            # filter df to get data from interval [low, low+step)
            in_interval=np.logical_and(
                df['y_va'] >= np.log10(low),
                df['y_va'] <  np.log10(low+step)
            )
            subset=df[in_interval]

            # Append MAPE to scores, set to NaN if no data in the interval
            if len(subset) > 0:
                scores.append(mape(subset['y_pred'], subset['y_va']))
            else:
                scores.append(np.nan)

        scores=np.array(scores, dtype='float64')

        # NaN scores are simply not drawn by matplotlib
        plt.bar(bar_positions + offset, scores, width=barwidth, label=k)

        # Increase offset
        offset+=barwidth

        # Skip best/worst update when the column has no score at all
        if np.all(np.isnan(scores)):
            continue

        # Update best results
        best=np.nanmax(scores)
        if best_y is None or best > best_y:
            best_y=best
            best_model=k

        # Update poor results
        poor=np.nanmin(scores)
        if poor_y is None or poor < poor_y:
            poor_y=poor
            poor_model=k

    if draw_limit:
        # Draw horizontal line at best score
        if best_y is not None:
            plt.axhline(y=best_y, color='green', linestyle='--', label='Best: {:.1f} % ({})'.format(best_y, best_model))

        # Draw horizontal line at the lowest score
        if poor_y is not None:
            plt.axhline(y=poor_y, color='red', linestyle='--', label='Worst: {:.1f} % ({})'.format(poor_y, poor_model))

    # Set X and Y axis ticks and labels
    plt.xticks(ticks=lower_bounds, labels=tick_labels)
    if title:
        plt.title(title)
    else:
        plt.title('Mean Absolute Percent Error (%) per speed interval')
    plt.xlabel('Speed interval in km/h')
    plt.ylabel('Mean Average Percent Error')

    # Show legend
    plt.legend(loc='lower right')

    # Show graph
    plt.show()


# End of definitions

In [36]:
# Last statement of the notebook: its message confirms that execution reached the end of the definitions
print("my_utils library loaded :-)")

my_utils library loaded :-)
