In [None]:
# --- Standard library -------------------------------------------------------
import gc
import os
from glob import glob


# --- Numerical stack --------------------------------------------------------
import numpy as np
import pandas as pd
# Raise pandas' display limits so frames up to 500 rows / 500 columns are
# rendered without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from scipy import stats

# NOTE: the `pearsonr` name imported here is rebound later in this notebook by
# a LightGBM metric function that is also called `pearsonr`; after that cell
# runs, use `stats.pearsonr` to reach the scipy implementation.
from scipy.stats import probplot, pearsonr
from scipy.stats import skew, norm, kurtosis

# --- Modelling / utilities --------------------------------------------------
from tqdm import tqdm
import lightgbm as lgb
import joblib

from sklearn.model_selection import TimeSeriesSplit

# --- Plotting ---------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
# Switch matplotlib to seaborn's default theme for every figure below.
sns.set()

In [None]:
def reduce_mem_usage(df, verbose=False):
    """
    Utility function to reduce the memory usage of pandas dataframes

    Every int16/int32/int64 column is cast to the narrowest signed integer
    type (int8, int16, int32, int64) whose range contains the column's
    minimum and maximum. Every float16/float32/float64 column is cast to
    float32 when its minimum and maximum lie inside the float32 range, and to
    float64 otherwise. Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default False
        When True, print the final size in Mb and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    * Range checks are inclusive, so a column spanning exactly -128..127 is
      stored as int8 (a strict comparison would push it to int16).
    * Casting float64 to float32 loses precision.
    * float16 columns are widened to float32 by the float rule above.
    * When a column's min/max is NaN (e.g. an all-NaN float column) every
      range check is False: integer columns keep their dtype and float
      columns end up as float64.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Candidates are ordered narrowest first; stop at the first fit.
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(int_type)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            limits = np.finfo(np.float32)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
def visualization(df, x, y, figsize=(12,3), hue=None, scatter=False, cust_col='Set2', title='', xlabel='', ylabel='', rotation_angel=90, legpos='best', loc=None, size=None):
    """
    Draw a seaborn scatter or line plot of ``y`` against ``x`` and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed to seaborn as ``data``.
    x, y : str
        Names of the columns plotted on the x and y axes.
    figsize : tuple, default (12, 3)
        Size of the new matplotlib figure.
    hue : str or None, default None
        Grouping column for the line plot. Not used when ``scatter`` is True.
    scatter : bool, default False
        Draw a scatter plot instead of a line plot.
    cust_col : str, default 'Set2'
        Palette applied to the ``hue`` groups of the line plot.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    rotation_angel : int, default 90
        Rotation (degrees) of the x tick labels.
    legpos : str, default 'best'
        ``loc`` argument of ``plt.legend`` (only used with ``hue``).
    loc : tuple or None, default None
        ``bbox_to_anchor`` argument of ``plt.legend`` (only used with ``hue``).
    size : int, str or None, default None
        ``fontsize`` argument of ``plt.legend`` (only used with ``hue``).

    Notes
    -----
    ``legpos``, ``loc`` and ``size`` used to be read as free (undefined)
    names, which raised ``NameError`` on a fresh kernel; they are now
    keyword parameters with defaults.
    """
    plt.figure(figsize=figsize)

    if scatter:
        # `ci` has no effect on a scatter plot and is rejected by newer
        # seaborn releases; `palette` is only meaningful together with `hue`.
        ax = sns.scatterplot(x=x, y=y, data=df)
    elif hue is not None:
        ax = sns.lineplot(x=x, y=y, data=df, hue=hue, palette=cust_col, ci=None)
        plt.legend(loc=legpos, bbox_to_anchor=loc, fontsize=size)
    else:
        ax = sns.lineplot(x=x, y=y, data=df, ci=None)

    # Decorations shared by all three plot variants.
    plt.title(title)
    plt.xticks(rotation=rotation_angel)
    ax.set(xlabel=xlabel, ylabel=ylabel)

    plt.show()

In [None]:
def visualize_target(df, target):
    """
    Print summary statistics for ``df[target]`` and plot its distribution.

    The printed block contains mean, median, standard deviation, min,
    quartiles, max, skew, kurtosis and the share of missing values. The
    figure has two panels: a distribution plot with mean/median markers and
    a probability plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to describe.
    target : str
        Name of the column to describe.

    Notes
    -----
    Missing values are dropped before computing the mean, skew, kurtosis and
    the probability plot, so these agree with the pandas statistics (median,
    std, quantiles), which skip NaN. Without missing values the output is
    unchanged.
    """
    # Local import keeps this cell runnable on its own.
    import warnings

    series = df[target]
    valid = series.dropna()
    mean_value = valid.values.mean()
    median_value = series.median()

    print(f'{target}\n{"-" * len(target)}')

    print(f'Mean: {mean_value:.4f}  -  Median: {median_value:.4f}  -  Std: {series.std():.4f}')
    print(f'Min: {series.min():.4f}  -  25%: {series.quantile(0.25):.4f}  -  50%: {series.quantile(0.5):.4f}  -  75%: {series.quantile(0.75):.4f}  -  Max: {series.max():.4f}')
    print(f'Skew: {skew(valid):.4f}  -  Kurtosis: {kurtosis(valid):.4f}')
    missing_count = int(series.isnull().sum())
    total_count = df.shape[0]
    print(f'Missing Values: {missing_count}/{total_count} ({missing_count * 100 / total_count:.4f}%)')

    fig, axes = plt.subplots(ncols=2, figsize=(24, 6), dpi=100)

    # Silence warnings only while distplot runs, instead of installing a
    # process-wide "ignore" filter that would hide warnings in later cells.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        sns.distplot(valid, label=target, ax=axes[0])
    axes[0].axvline(mean_value, label='Mean', color='r', linewidth=2, linestyle='--')
    axes[0].axvline(median_value, label='Median', color='b', linewidth=2, linestyle='--')
    axes[0].legend(prop={'size': 15})
    probplot(valid, plot=axes[1])

    for ax in axes:
        ax.tick_params(axis='x', labelsize=12.5)
        ax.tick_params(axis='y', labelsize=12.5)
        ax.set_xlabel('')
        ax.set_ylabel('')
    axes[0].set_title(f'{target} distribution', fontsize=15, pad=12)
    axes[1].set_title(f'{target} probability plot', fontsize=15, pad=12)

    plt.show()

def visualize_time(df, column, time_col='time_id'):
    """
    Print summary statistics for ``df[column]`` and plot it over time.

    The printed block contains mean, median, standard deviation, min,
    quartiles, max, skew, kurtosis and the share of missing values. The
    figure has two panels: a filled KDE with mean/median markers, and the
    column's values plotted against ``time_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column`` and ``time_col``.
    column : str
        Name of the column to describe.
    time_col : str, default 'time_id'
        Name of the column used as the x axis of the right-hand panel.
    """
    series = df[column]
    mean_value = series.mean()
    median_value = series.median()

    print(f'{column}\n{"-" * len(column)}')
    print(f'Mean: {mean_value:.4f}  -  Median: {median_value:.4f}  -  Std: {series.std():.4f}')
    print(f'Min: {series.min():.4f}  -  25%: {series.quantile(0.25):.4f}  -  50%: {series.quantile(0.5):.4f}  -  75%: {series.quantile(0.75):.4f}  -  Max: {series.max():.4f}')
    print(f'Skew: {series.skew():.4f}  -  Kurtosis: {series.kurtosis():.4f}')
    missing_count = int(series.isnull().sum())
    total_count = df.shape[0]
    print(f'Missing Values: {missing_count}/{total_count} ({missing_count * 100 / total_count:.4f}%)')

    fig, axes = plt.subplots(ncols=2, figsize=(24, 6), dpi=100)

    sns.kdeplot(series, label=column, fill=True, ax=axes[0])
    axes[0].axvline(mean_value, label='Mean', color='r', linewidth=2, linestyle='--')
    axes[0].axvline(median_value, label='Median', color='b', linewidth=2, linestyle='--')
    axes[0].legend(prop={'size': 15})
    # Plot the two columns directly rather than re-indexing (and copying)
    # the whole frame just to obtain the x values.
    axes[1].plot(df[time_col], series, label=column)

    for ax in axes:
        ax.tick_params(axis='x', labelsize=12.5)
        ax.tick_params(axis='y', labelsize=12.5)
    axes[0].set_xlabel('')
    axes[0].set_ylabel('')
    axes[1].set_xlabel('time', fontsize=12.5)
    axes[1].set_ylabel('sample counts', fontsize=12.5)
    axes[0].set_title(f'{column} Distribution', fontsize=15, pad=12)
    axes[1].set_title(f'{column} by time', fontsize=15, pad=12)

    plt.show()


In [None]:
def pearsonr(preds, dataset):
    """
    Custom LightGBM evaluation metric: Pearson correlation between the
    predictions and the labels of the validation dataset.

    Parameters
    ----------
    preds : array-like
        Model predictions.
    dataset : object exposing ``get_label()``
        Validation dataset whose labels are compared with ``preds``.

    Returns
    -------
    tuple
        ``('pearsonr', correlation, True)``; the trailing ``True`` follows
        LightGBM's custom-metric convention of flagging that a higher value
        is better.

    Notes
    -----
    This definition rebinds the ``pearsonr`` name imported from
    ``scipy.stats`` at the top of the notebook, so the scipy function is
    reached through ``stats.pearsonr`` here.
    """
    targets = dataset.get_label()
    correlation = stats.pearsonr(preds, targets)[0]
    metric_name, higher_is_better = 'pearsonr', True
    return metric_name, correlation, higher_is_better
