# Libraries

In [1]:
import math
import numpy as np
import pandas as pd
import scipy as scp
from scipy.stats import kde
import scipy.io
import statistics as stats

import calendar
import dateutil
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import time

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import ipywidgets as widgets
from ipywidgets import Layout
from ipywidgets import interact
import voila

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


import warnings
warnings.filterwarnings("ignore")


# Colour sequence installed below as the default axes property cycle:
# two hex strings followed by seven RGB tuples.
dark_colors = [
    "#A51C30",
    "#808080",
    (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
    (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
    (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
    (0.4, 0.6509803921568628, 0.11764705882352941),
    (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
    (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
    (0.4, 0.4, 0.4),
]

# Global matplotlib defaults for every figure drawn in this notebook.
# The entries are applied in the order written.
rcParams.update({
    # Figure / rendering
    'figure.figsize': (18, 12),
    'figure.dpi': 150,
    'axes.grid': True,
    'agg.path.chunksize': 10000,

    # Lines, axes and font sizes
    'axes.prop_cycle': plt.cycler(color = dark_colors),
    'lines.linewidth': 2,
    'axes.facecolor': "white",
    'axes.titlesize': 20,
    'axes.labelsize': 17.5,
    'xtick.labelsize': 13,
    'ytick.labelsize': 13,
    'legend.fontsize': 15.5,
    'patch.edgecolor': 'none',

    # Grid appearance
    'grid.color': "gray",
    'grid.linestyle': "-",
    'grid.linewidth': 0.3,
    'grid.alpha': 1,

    # One shared colour for text, axis labels and ticks
    'text.color': "444444",
    'axes.labelcolor': "444444",
    'ytick.color': "444444",
    'xtick.color': "444444",
})

# Exploratory Data Analysis

ALL OF THE GRAPHS SHOWN BELOW ARE INTERACTIVE. EITHER RUN EACH CELL ONCE TO DISPLAY THE SLIDERS AND GRAPHS, OR CLICK "Voila" ON THE JUPYTER NOTEBOOK TOOLBAR TO RENDER THE NOTEBOOK AS A DASHBOARD.

1. The Pearson Correlation Matrix and the Pair Plot take longer to render due to the sheer number of features.  
2. This notebook is kept separate from the training notebook because Voila cannot limit which cells are run for the dashboard (i.e. it runs the whole notebook).  
3. The community has reported that Voila sometimes fails to render a notebook properly, possibly due to version conflicts. If that happens, simply run all cells directly in this notebook to achieve the same result; sorry for the inconvenience.  

In [2]:
# Load the dataset from a CSV file located next to the notebook.
FILENAME = 'data.csv'
dst = pd.read_csv(FILENAME)

# Overwrite the column names positionally: every column except the last
# becomes feat_1 ... feat_k, and the last column becomes 'label'.
key_word = 'feat_'
n_columns = dst.shape[1]
key_word_list = [key_word + str(position) for position in range(1, n_columns)]
if n_columns > 1:
    key_word_list.append('label')

dst.columns = key_word_list

# Data-quality summary: total count of missing cells and of fully duplicated rows.
separator = '-----' * 20
missing_total = dst.isnull().sum().sum()
duplicate_total = len(dst) - len(dst.drop_duplicates())

print(separator)
print('Number of Null/Missing Values\t: {}'.format(missing_total))
print('Number of Duplicated Entries\t: {}'.format(duplicate_total))
print(separator)

----------------------------------------------------------------------------------------------------
Number of Null/Missing Values	: 98
Number of Duplicated Entries	: 0
----------------------------------------------------------------------------------------------------


In [3]:
# 1-based indices of the 10-column chunks needed to page through every column
# of dst. Ceiling division is used on purpose: with `// 10 + 2` a column count
# that is an exact multiple of ten produced one extra chunk whose slice is
# empty, and DataFrame.describe() raises on a frame without columns.
feat_list = list(range(1, (dst.shape[1] + 9) // 10 + 1))

style = {'description_width': 'initial'}

# Slider that selects which chunk of columns the overview cell reports on.
# continuous_update is off so the report is refreshed only when the handle is
# released, not on every intermediate position.
feat_list_over = widgets.IntSlider(min = feat_list[0],
                                   max = feat_list[-1],
                                   step = 1,
                                   description = 'Feature Chunk (10 per Chunk) :',
                                   disabled = False,
                                   continuous_update = False,
                                   orientation = 'horizontal',
                                   readout = True,
                                   readout_format = 'd',
                                   style = style,
                                   layout = {'width': '500px'}
                                  )

@interact(feat_list_over = feat_list_over)
def get_feature_overview(feat_list_over):
    """Print an overview of one 10-column chunk of ``dst``.

    Three sections are printed: the chunk's ``info()`` report, its
    ``describe()`` table, and the value counts of the ``label`` column
    (the latter does not depend on the selected chunk).

    Parameters
    ----------
    feat_list_over : int
        1-based chunk index; chunk ``k`` covers column positions
        ``10*k - 10`` up to (not including) ``10*k``.
    """
    separator = '-----' * 20
    chunk_end = feat_list_over * 10
    chunk = dst.iloc[:, chunk_end - 10:chunk_end]

    print('\n FEATURE DATA TYPE')
    print(separator)
    chunk.info()  # info() writes its report directly to stdout
    print(separator)

    print('\n\n FEATURE DESCRIPTION')
    print(separator)
    print(chunk.describe())
    print(separator)

    print('\n\n LABEL DESCRIPTION')
    print(separator)
    print(dst.label.value_counts())
    print(separator)

interactive(children=(IntSlider(value=1, continuous_update=False, description='Feature Chunk (10 per Chunk) :'…

In [4]:
total_columns = dst.shape[1]

# 1-based positions of every column except the last one.
feat_list_num = list(range(1, total_columns))
# 1-based indices of the 5-column chunks offered for pair plots.
feat_pair_list = list(range(1, total_columns // 5 + 2))

# Five-colour palette passed to the seaborn plots in this cell.
color = ["#F08080", "#EEE8AA", "#90EE90", "#FFC0CB", "#89CFF0"]

style = {'description_width': 'initial'}

# Slider that selects a single feature by its 1-based position.
feat_list_plot = widgets.IntSlider(
    min = feat_list_num[0],
    max = feat_list_num[-1],
    step = 1,
    description = 'Features :',
    disabled = False,
    continuous_update = False,
    orientation = 'horizontal',
    readout = True,
    readout_format = 'd',
    style = style,
    layout = {'width': '500px'},
)

@interact(feat_list_plot = feat_list_plot)
def get_feature_distribution(feat_list_plot):
    """Plot one feature against the label and offer three optional plots.

    A boxen plot of the selected feature per label is always drawn. Three
    Yes/No toggles (all defaulting to 'No') then enable, independently:
    a KDE plot of the selected feature with mean/median markers, a pair plot
    over a chunk of five features, and a correlation matrix of the dataset.

    Parameters
    ----------
    feat_list_plot : int
        1-based position of the feature column in ``dst``.
    """
    feature = dst.iloc[:, feat_list_plot - 1]
    feature_title = 'Feature ' + str(feat_list_plot)

    ax = sns.boxenplot(x = dst.label, y = feature, palette = color, width = 0.6)
    ax.set(xlabel = 'Label', ylabel = feature_title, title = 'Feature vs Labels')

    def make_toggle(description):
        """Build a Yes/No toggle that starts on 'No'."""
        return widgets.ToggleButtons(options = ['Yes', 'No'],
                                     value = 'No',
                                     description = description,
                                     disabled = False,
                                     style = style
                                    )

    kde_choice = make_toggle('Kernel Density Estimation Plot: ')
    pair_choice = make_toggle('Pair Plot (5 per Chunk): ')
    corr_choice = make_toggle('Pearson Correlation Matrix: ')

    @interact(kde_choice = kde_choice)
    def kde_plot(kde_choice):
        """Draw the KDE of the selected feature with mean and median lines."""
        if kde_choice != 'Yes':
            return

        ax2 = sns.kdeplot(data = feature)
        # Both statistics skip missing values. statistics.median() sorts the
        # raw values, and NaN entries make that ordering (and therefore the
        # reported median) unreliable.
        mean_val = feature.mean()
        median_val = feature.median()
        plt.axvline(mean_val, linestyle = 'dashed', linewidth = 3, color = 'g', label = 'Mean')
        plt.axvline(median_val, linestyle = 'dashed', linewidth = 3, color = 'y', label = 'Median')
        ax2.set(xlabel = feature_title, title = 'KDE Plot for ' + feature_title)
        plt.legend()
        plt.grid(True)
        plt.show()

    @interact(pair_choice = pair_choice)
    def pair_plot(pair_choice):
        """Offer a chunk slider and draw a pair plot for the chosen chunk."""
        if pair_choice != 'Yes':
            return

        # Chunks are cut from the feature columns only (everything but the
        # last column), and the chunk count is derived here instead of from
        # the module-level feat_pair_list. That list over-counts when the
        # column count is a multiple of five, which produced an empty chunk
        # or a chunk that already contained the label column.
        features = dst.iloc[:, :-1]
        chunk_total = (features.shape[1] + 4) // 5

        feat_pair_plot = widgets.IntSlider(min = 1,
                                           max = chunk_total,
                                           step = 1,
                                           description = 'Features :',
                                           disabled = False,
                                           continuous_update = False,
                                           orientation = 'horizontal',
                                           readout = True,
                                           readout_format = 'd',
                                           style = style,
                                           layout = {'width': '500px'}
                                          )

        @interact(feat_pair_plot = feat_pair_plot)
        def get_pair_plot(feat_pair_plot):
            """Pair-plot up to five features of the chunk, coloured by label."""
            chunk_end = feat_pair_plot * 5
            # Positional slicing clamps at the last feature column, so the
            # final (possibly shorter) chunk needs no remainder arithmetic.
            dst_2 = pd.concat([features.iloc[:, chunk_end - 5:chunk_end], dst.iloc[:, -1]],
                              axis = 1)

            sns.pairplot(dst_2, hue = 'label', palette = color)

    @interact(corr_choice = corr_choice)
    def corr_plot(corr_choice):
        """Show the Pearson correlation matrix of the numeric columns."""
        if corr_choice != 'Yes':
            return

        # The toggle is labelled "Pearson", so compute Pearson (this used to
        # be method='spearman'). Non-numeric columns are dropped explicitly
        # because recent pandas versions raise on them instead of ignoring them.
        correlation_matrix = dst.select_dtypes(include = 'number').corr(method = 'pearson')
        plt.matshow(correlation_matrix)

interactive(children=(IntSlider(value=1, continuous_update=False, description='Features :', layout=Layout(widt…

# Conclusion

1. At first sight, the first RED FLAG is the PREVALENCE of CLASS A in the label distribution.   
2. In other words, any predictive model trained on this data will be strong on one class (A) and weak on the other four (B, C, D, E), resulting in a highly biased model with an imminent OVER-FITTING issue. Please refer to the second notebook for the exact percentages.  
3. One way to solve this issue is to re-sample the dataset into a more balanced subset.  
4. Secondly, the features have no proper annotations/labels, so minimal insight can be drawn from domain knowledge. Most of them are float or int64 types, with no duplicated entries but 98 null/missing values.  
5. Thirdly, the range of values across the entirety of the dataset spans from very small values (even negatives) to extremely high values. Therefore, scaling is mandatory to achieve optimum predictive power.  
6. Fourthly, outliers are present across the dataset, although their impact may still be negligible.  
7. The following methods were considered for dealing with outliers: (A) clipping between the 5th and 95th percentiles, or (B) a logarithmic transformation.
8. However, as mentioned before, the features have no proper labels and it is unclear what they stand for, so any kind of transformation could negatively impact the outcome of the prediction model; it was therefore decided against in the end.  
9. In this case, since the majority of features exhibit nearly Gaussian Distribution, the Standard Scaler would be chosen as the method to scale the dataset. 
10. Last but not least, from the Pearson Correlation Matrix, it can be inferred that there is simply NO viable correlation among the 153 features, most likely a consequence of the imbalanced distribution of labels.  