In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn import metrics
import re
import time
from scipy import stats

import warnings
warnings.filterwarnings("ignore")

# Render inline figures as SVG for sharper plots:
%config InlineBackend.figure_format = 'svg'
import os

pd.set_option('display.max_columns', 100)

In [1]:
def make_bar_plot(X, y, sort=False, x_label = None, y_label=None, title='Title', x_Rotation = 0, width=5, height=3, annot=False):
    """
    This function will show a bar plot of y against X on a new figure.
    Input: X: the values placed on the x-axis (one per bar),
           y: the height of each bar,
           sort: a condition flag, when it is True the bars are ordered by descending height.
           x_label, y_label: the axis labels,
           title: the title of the plot,
           x_Rotation: a degree for rotation of x-axis tick labels. The default value is 0.
           width, height: the size of the figure.
           annot: a condition flag, when it is True the height of each bar is written above it.
    output: This function only shows the bar plot.
    """
    fig, ax = plt.subplots(figsize=(width,height))
    bars = pd.DataFrame({'X_Value':X, 'y_Value':y})
    if sort == True:
        bars = bars.sort_values(by='y_Value', ascending=False)

    sns.barplot(ax=ax, x=bars['X_Value'], y=bars['y_Value'])

    # The axes created above is the current one, so it is addressed directly.
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.tick_params(axis='both', which='major', labelsize=12)
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(x_Rotation)
    ax.set_title(title)

    if annot == True:
        for bar in ax.patches:
            corners = bar.get_bbox().get_points()
            bar_top = corners[1,1]
            bar_center = corners[:,0].mean()
            ax.annotate('{:.2f}'.format(bar_top), (bar_center, bar_top), ha='center', va='bottom')

def make_count_plot(df, cols, annotate=False, xRotation = 0, title=None, width=5, height=3):
    """
    This function will calculate and show count plot.
    Input: df: a dataframe,
           cols: a list of feature names for count plot. One panel is drawn per feature.
           annotate: a condition flag, when it is True the percentage of each bar will be observable. The default value is False.
           xRotation: a degree for rotation of x-axis ticks. The default value is 0.
           title: the title of the plot (axes title for one feature, figure title for several),
           width: the width of the plot. The default value is 5.
           height: the height of each panel. The default value is 3.
    output: This function only shows count plot.
    """
    panel_count = len(cols)
    if panel_count == 0:
        # Nothing to draw; a grid with zero rows cannot be created.
        return

    totalcnt = df.shape[0]
    # squeeze=False always yields a 2-D array of axes, so one and many
    # features share the same code path and honour width/height alike.
    fig, axes = plt.subplots(panel_count, 1, figsize=(width, height*panel_count), squeeze=False)

    for ax, col in zip(axes[:, 0], cols):
        sns.countplot(x=df[col], ax=ax)
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        if xRotation != 0:
            ax.tick_params(axis='x', labelrotation=xRotation)

        if annotate:
            for bar in ax.patches:
                corners = bar.get_bbox().get_points()
                bar_top = corners[1,1]
                # Bar height is a row count; express it as a share of all rows.
                ax.annotate('{:.1f}%'.format(bar_top*100/totalcnt),
                            (corners[:,0].mean(), bar_top), ha='center', va='bottom')

    if panel_count == 1:
        axes[0, 0].set_title(title)
    else:
        if title is not None:
            fig.suptitle(title)
        fig.tight_layout()
    return


def make_box_plot(df, cols):
    """
    Draw one horizontal box plot per requested feature, stacked vertically.
    Input: df: a dataframe,
           cols: a list of feature names for box plot.
    output: This function only shows the box plots.
    """
    panel_count = len(cols)
    # squeeze=False keeps the axes in a 2-D array whatever the panel count,
    # so a single feature needs no separate drawing code.
    fig, grid = plt.subplots(panel_count, 1, figsize=(5, 3*panel_count), squeeze=False)

    for panel, feature in zip(grid[:, 0], cols):
        sns.boxplot(x=df[feature], ax=panel)
        panel.set_xlabel(feature)

    # The layout is only tightened when several panels share the figure.
    if panel_count != 1:
        fig.tight_layout()
    return


def makeHeatmap(df, cols, annot = False, line=False, width=12, height=10):
    """
    Show the correlation matrix of the selected features as a heatmap.
    Input: df: a dataframe,
           cols: a list of numerical features.
           annot: a conditional flag. If True, the correlation values are written inside the cells.
           line: a conditional flag. If True, thin gray borders are drawn around the cells.
           width, height: the size of the figure.
    output: This function only shows heatmap.
    """
    palette = plt.cm.YlGnBu
    sns.set(style="whitegrid")

    plt.figure(figsize=(width,height))
    plt.title('Correlation Heatmap', size=15)

    with_borders = (line == True)
    # The heatmap is drawn only for a flag equal to True or False.
    if with_borders or line == False:
        heatmap_options = dict(vmax=1.0, cmap=palette, annot=annot, square=True)
        if with_borders:
            heatmap_options.update(linewidths=0.005, linecolor='gray')
        sns.heatmap(df[cols].corr(), **heatmap_options)
    return

def make_categorydf(df, metadata, catName):
    """
    Count the distinct values of every feature that belongs to one category.
    Input: df: a dataframe holding the features,
           metadata: a dataframe indexed by feature name with a 'category' column,
           catName: the category whose features are reported.
    output: a dataframe with the columns 'feature' and 'uniqueCnt' (in that order),
            one row per feature of the category; empty when no feature matches.
    """
    cols = metadata[metadata['category']==catName].index.tolist()

    # Columns are given as a list: a set has no defined order.
    return pd.DataFrame({'feature': cols,
                         'uniqueCnt': [df[col].nunique() for col in cols]},
                        columns=['feature', 'uniqueCnt'])


#################################################################################

def make_typedf(df, metadata, typeName):
    """
    Collect the features of one type and count their distinct values.
    Input: df: a dataframe holding the features,
           metadata: a dataframe indexed by feature name with a 'type' column,
           typeName: the type whose features are reported.
    output: cols: the list of matching feature names ('id' is left out),
            cnt: the number of matching features,
            res_df: a dataframe with the columns 'feature' and 'uniqueCnt' (in that order).
    """
    cols = [col for col in metadata[metadata['type']==typeName].index.tolist() if col != 'id']
    cnt = len(cols)

    # Columns are given as a list: a set has no defined order.
    res_df = pd.DataFrame({'feature': cols,
                           'uniqueCnt': [df[col].nunique() for col in cols]},
                          columns=['feature', 'uniqueCnt'])

    return cols, cnt, res_df
    
def checkMissValue(df, metadata):
    """
    Summarise how often the value -1 occurs in every column of df.
    Input: df: a dataframe in which -1 is counted as a missing value,
           metadata: a dataframe indexed by feature name with a 'type' column;
                     it must contain every column of df.
    output: a dataframe indexed by feature name with the columns
            'count' (number of -1 entries), 'percentage' (share of rows, 0-100)
            and 'type' (copied from metadata), sorted by descending count.
    """
    # Summing the boolean mask counts the True entries directly and also
    # gives the right answer when a column is entirely -1 or has none.
    miss_count = pd.Series({col: int((df[col] == -1).sum()) for col in df.columns},
                           index=df.columns, dtype='int64')

    res = pd.DataFrame({'count': miss_count,
                        'percentage': (miss_count / df.shape[0]) * 100},
                       columns=['count', 'percentage'])
    # A stable sort keeps features with equal counts in their original order.
    res = res.sort_values(by='count', ascending=False, kind='mergesort')

    # we can also save the type of feature:
    res['type'] = metadata.loc[res.index, 'type'].values

    return res


def levelFrequency(df, cols):
    """
    Report, for each feature, the share of rows taken by its most frequent value.
    Input: df: a dataframe,
           cols: a list of feature names to inspect.
    output: a dataframe with the columns 'feature' and 'frequency' (percentage of
            all rows, rounded to 2 decimals), sorted by descending frequency.
    """
    row_total = df.shape[0]
    records = [
        {'feature': name,
         'frequency': np.round(df[name].value_counts().values[0]*100/row_total, 2)}
        for name in cols
    ]
    table = pd.DataFrame(records, columns = ['feature', 'frequency'])
    return table.sort_values(by='frequency', ascending=False, ignore_index=True)

def identify_single_unique(df):
    """Return the names of the features that hold exactly one distinct value.

    Distinct values are counted with ``DataFrame.nunique()`` and its default
    settings. The number of such features is also printed.
    """
    # One row per feature: its name and its number of distinct values.
    levels_table = pd.DataFrame(df.nunique()).reset_index()
    levels_table = levels_table.rename(columns = {'index': 'feature', 0: 'levels'})
    levels_table = levels_table.sort_values('levels', ascending = True)

    # Keep only the features with a single level.
    constant_features = levels_table.loc[levels_table['levels']==1, 'feature']

    print('There are {:d} features with a single unique value.'.format(len(constant_features)))
    return list(constant_features)


def change_datatype(df): #minimize used memory
    """
    Downcast the numeric columns of df in place to reduce memory use.
    Input: df: a dataframe; it is modified in place.
    output: None.

    Every integer column (any width, signed or unsigned) is cast to the first
    type of the list below whose range holds all its values; columns that fit
    none of them are left unchanged. Columns selected by the 'float' dtype are
    cast to float32, which reduces their precision.
    """
    # Candidate types in order of preference: (dtype, lowest value, exclusive upper bound).
    int_candidates = [
        (np.int8, -2**7, 2**7),
        (np.uint8, 0, 2**8),
        (np.int16, -2**15, 2**15),
        (np.uint16, 0, 2**16),
        (np.int32, -2**31, 2**31),
        (np.uint32, 0, 2**32),
    ]

    for col in list(df.select_dtypes(include=[np.integer]).columns):
        # The extremes are computed once per column instead of once per test.
        lowest, highest = df[col].min(), df[col].max()
        for target, lower, upper in int_candidates:
            if lowest >= lower and highest < upper:
                df[col] = df[col].astype(target)
                break

    for col in list(df.select_dtypes(include=['float']).columns):
        df[col] = df[col].astype(np.float32)
        

def size_MB(obj):
    """Return the size of obj reported by ``sys.getsizeof``, in units of 10**6 bytes, rounded to 2 decimals."""
    import sys
    bytes_per_mb = 10**6
    return round(sys.getsizeof(obj) / bytes_per_mb, 2)


def _regression_design(data_df, rows, predictors):
    """Build the regression inputs for the selected rows.

    One predictor gives a single column; two predictors give the columns
    x1, x2 and their product x1*x2.
    """
    features = data_df.loc[rows, predictors].values.astype(float)
    if features.shape[1] == 2:
        features = np.column_stack([features, features[:, 0] * features[:, 1]])
    return features


def imputby_Regression(data_df, corr_df, useOneCorrFeature=True):
    """
    Fill missing values of features by linear regression on correlated features.
    Input: data_df: a dataframe; missing entries are NaN. It is modified in place.
           corr_df: a dataframe with one row per feature to impute and the columns
                    'miss_feat', 'corr_feat1', 'weight_1', 'corr_feat2', 'weight_2'
                    (the layout built by make_correlated_table).
           useOneCorrFeature: when True, only the correlated feature with the larger
                    weight is used as predictor; when False, both correlated features
                    and their product are used.
    output: data_df, with the imputed values written into it.

    The model is trained on the rows where the target and all predictors are present,
    and it fills the rows where the target is missing but all predictors are present.
    Rows whose predictors are missing too are left as NaN.
    """
    from sklearn.linear_model import LinearRegression

    # Rows are iterated directly, so corr_df may carry any index.
    for _, entry in corr_df.iterrows():
        refVar = entry['miss_feat']

        if useOneCorrFeature:
            if entry['weight_1'] >= entry['weight_2']:
                predictors = [entry['corr_feat1']]
            else:
                predictors = [entry['corr_feat2']]
        else:
            predictors = [entry['corr_feat1'], entry['corr_feat2']]

        # Regression also works with categorical predictors here: they are encoded
        # by digits in the dataset and only their correlation with refVar matters.
        target_missing = data_df[refVar].isnull().values
        predictors_present = data_df[predictors].notnull().all(axis=1).values

        train_rows = ~target_missing & predictors_present
        fill_rows = target_missing & predictors_present

        # Without rows to learn from, or rows to fill, there is nothing to do.
        if train_rows.any() and fill_rows.any():
            model = LinearRegression()
            model.fit(_regression_design(data_df, train_rows, predictors),
                      data_df.loc[train_rows, refVar].values)
            # The target is 1-D, so the predictions are 1-D and fit the column.
            data_df.loc[fill_rows, refVar] = model.predict(
                _regression_design(data_df, fill_rows, predictors))

        print('After imputing, Rows with missed data: {:d}'.format(data_df[refVar].isnull().sum()))

    return data_df


def make_correlated_table(df, colArray):
    """
    This function will make a correlation table, for imputing purposes of numerical features:
    input: df: a dataframe
           colArray: a list of numerical feature names
    output: a dataframe with one row per feature of colArray and the columns
            'miss_feat', then for the two most correlated other features:
            'corr_featK' (name), 'corr_valK' (absolute correlation),
            'miss_countK' (number of missing rows of that feature) and
            'weight_K' (corr_valK minus the missing fraction of that feature).

    df needs at least two numeric features besides each feature of colArray.
    The elapsed processing time is printed.
    """
    time1 = time.process_time()
    colLst = ['miss_feat',
              'corr_feat1', 'corr_val1', 'miss_count1', 'weight_1',
              'corr_feat2', 'corr_val2', 'miss_count2', 'weight_2']

    # The correlation matrix does not depend on the feature, so it is
    # computed once, on the numeric columns only.
    abs_corr = df.select_dtypes(include=['number', 'bool']).corr().abs()

    totalLen = df.shape[0]
    row_list = []
    for col in colArray:
        # The feature itself is dropped by name: relying on it being sorted
        # first breaks when another feature is perfectly correlated with it.
        ranked = abs_corr[col].drop(labels=col).sort_values(ascending=False)

        feat1, feat2 = ranked.index[0], ranked.index[1]
        val1, val2 = ranked.iloc[0], ranked.iloc[1]

        miss1 = df[feat1].isnull().sum()
        miss2 = df[feat2].isnull().sum()
        # A correlated feature is penalised by its own share of missing rows.
        weight1 = val1 - miss1/totalLen
        weight2 = val2 - miss2/totalLen

        valueLst = [col, feat1, val1, miss1, weight1, feat2, val2, miss2, weight2]
        row_list.append(dict(zip(colLst, valueLst)))

    corr_matrix = pd.DataFrame(row_list, columns = colLst)
    time2 = time.process_time()
    print('Elapsed time: {:6.3f} seconds'.format((time2- time1)))

    return corr_matrix


def findVariance(df):
    """
    Compute the variance of every column of df.
    Input: df: a dataframe of numerical features.
    output: a dataframe with the columns 'feature' and 'variance' (in that order),
            sorted by ascending variance. The variance uses ddof=0.
    """
    cols = df.columns.tolist()

    # Columns are given as a list: a set has no defined order.
    res_df = pd.DataFrame({'feature': cols,
                           'variance': [df[col].var(ddof=0) for col in cols]},
                          columns=['feature', 'variance'])

    # Sorting from the lowest to the highest variance
    return res_df.sort_values(by='variance')


def _is_continuous_target(target):
    """Return True when target has a float dtype and holds at least one non-integer value."""
    if not pd.api.types.is_float_dtype(target):
        return False
    values = np.asarray(target, dtype=float)
    return bool(np.any(values != np.floor(values)))


def feature_selection_RandomForset(X_input, y_input, threshold_imp = 0.1, threshold_cumulative = 0.98, cv_input=5, alphas_input=[0.02, 0.05, 0.08]):
    """
    Rank the features by random-forest importance and report which ones pass a threshold.
    Input: X_input: a dataframe of features,
           y_input: the target, as a dataframe (its first column decides the model type),
                    a series or an array.
           threshold_imp: features with an importance greater than or equal to this
                    value are selected. The default value is 0.1.
           threshold_cumulative, cv_input, alphas_input: accepted for compatibility;
                    they are not used by this function.
    output: imp_table: all features sorted by descending importance, with the columns
                    'feature', 'importance', 'normalized_importance', 'cumulative_importance',
            low_imp: the rows of imp_table with importance below threshold_imp,
            zero_imp: the features with an importance of exactly 0,
            cumulative_imp: the cumulative importance after 1, 2, ... features.

    A regressor is fitted when the target holds non-integer float values; a classifier
    is fitted for every other target (integers, booleans, labels).
    A summary of the selection is printed.
    """
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

    if isinstance(y_input, pd.DataFrame):
        first_target = y_input.iloc[:, 0]
        # A single-column target is passed as a 1-D array.
        y_fit = first_target.values if y_input.shape[1] == 1 else y_input.values
    else:
        first_target = pd.Series(np.asarray(y_input))
        y_fit = first_target.values

    if _is_continuous_target(first_target):
        rf = RandomForestRegressor(random_state = 0, n_jobs = -1)
    else:
        rf = RandomForestClassifier(random_state = 0, n_jobs = -1)

    # Fitting the forest
    rf.fit(X_input, y_fit)

    importances = np.asarray(rf.feature_importances_, dtype=float)
    imp_table = pd.DataFrame({'feature': list(X_input.columns), 'importance': importances})

    # Selecting the features with the same threshold that is reported below;
    # the importances of the fitted forest are reused instead of fitting again.
    selected_mask = importances >= threshold_imp
    X_fs = X_input.loc[:, selected_mask]
    total = X_input.shape[1]
    selected = int(selected_mask.sum())
    removed = total - selected

    # Sort features according to importance
    imp_table = imp_table.sort_values('importance', ascending = False).reset_index(drop=True)

    # Normalizing the feature importances:
    imp_table['normalized_importance'] = imp_table['importance'] / imp_table['importance'].sum()
    imp_table['cumulative_importance'] = np.cumsum(imp_table['normalized_importance'])

    # Extract the features with zero importance and with importance lower than the threshold
    zero_imp = imp_table.loc[imp_table['importance'] == 0.0, ['feature', 'importance']]
    low_imp = imp_table[imp_table['importance'] < threshold_imp]

    cumulative_imp = pd.DataFrame({'count': list(range(1, total+1))})
    cumulative_imp['cumulative_importance'] = imp_table['cumulative_importance']

    print('The original data has {:d} features. By applying importance threshold of {:0.3f}, we found:'.format(total, threshold_imp))
    print('{:d} features has been selected.'.format(selected))
    print('{:d} features has been removed.'.format(removed))

    print('\nBefore feature selection: {}'.format(X_input.shape))
    print('After feature selection: {}'.format(X_fs.shape))

    return imp_table, low_imp, zero_imp, cumulative_imp