In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import re
import time
from scipy import stats

import warnings
warnings.filterwarnings("ignore")
import scipy.stats

#Quality of image:
%config InlineBackend.figure_format = 'svg'
import os

pd.set_option('display.max_columns', 100)

# Import my own functions from utils file:
import import_ipynb
#from utils_home_prices import *

In [2]:
def make_metadata(df):
    """
    Build a per-column metadata table for a dataframe.

    Each column is given a role and a coarse variable type:
      * role: 'target' for the column named 'target', 'id' for the column
        named 'id', 'input' for everything else.
      * type: decided first by name, then by dtype:
          - name contains 'bin' or is 'target'   -> 'binary'
          - name contains 'cat' or is 'id'       -> 'categorical'
          - boolean dtype                        -> 'binary'
          - any float dtype                      -> 'numeric'
          - any integer dtype                    -> 'ordinal'
          - object / string / category dtype     -> 'categorical'
          - anything else (e.g. datetime)        -> 'unknown'
        An 'ordinal' column with exactly two distinct values is
        re-labelled 'binary'.

    Input: df: a dataframe.
    output: metadata: a dataframe indexed by 'varname' with the columns
            'role', 'type', 'dtype' and 'uniqueCnt'.
    """
    dtype_checks = pd.api.types
    records = []

    for feature in df.columns:
        # Column labels are not guaranteed to be strings; the substring
        # rules below need a string to search in.
        feature_name = str(feature)

        # Defining the role
        if feature == 'target':
            role = 'target'
        elif feature == 'id':
            role = 'id'
        else:
            role = 'input'

        # Defining the data type
        dtype = df[feature].dtype
        uniqueCnt = df[feature].nunique()

        # Defining the type. Every branch assigns var_type, so a value can
        # never leak over from the previous column (and the builtin `type`
        # is no longer shadowed).
        if 'bin' in feature_name or feature == 'target':
            var_type = 'binary'
        elif 'cat' in feature_name or feature == 'id':
            var_type = 'categorical'
        elif dtype_checks.is_bool_dtype(dtype):
            var_type = 'binary'
        elif dtype_checks.is_float_dtype(dtype):
            var_type = 'numeric'
        elif dtype_checks.is_integer_dtype(dtype):
            var_type = 'ordinal'
        elif (dtype_checks.is_object_dtype(dtype)
              or dtype_checks.is_string_dtype(dtype)
              or isinstance(dtype, pd.CategoricalDtype)):
            var_type = 'categorical'
        else:
            var_type = 'unknown'

        # A two-valued integer column is really a flag.
        if uniqueCnt == 2 and var_type == 'ordinal':
            var_type = 'binary'

        # One row of the metadata table.
        records.append({
            'varname': feature,
            'role': role,
            'type': var_type,
            'dtype': dtype,
            'uniqueCnt': uniqueCnt
        })

    metadata = pd.DataFrame(records, columns=['varname', 'role', 'type', 'dtype', 'uniqueCnt'])
    return metadata.set_index('varname')


def make_typedf(df, metadata, typeName):
    """
    Collect the features of one metadata type together with their number of
    distinct values.

    Input: df: a dataframe,
           metadata: a dataframe indexed by feature name with a 'type' column
                     (the table produced by make_metadata),
           typeName: the value of metadata['type'] to select.
    output: cols: list of the selected feature names ('id' is excluded),
            cnt: number of selected features,
            res_df: a dataframe with the columns 'feature' and 'uniqueCnt',
                    one row per selected feature.
    """
    cols = metadata[metadata['type'] == typeName].index.tolist()
    if 'id' in cols:
        cols.remove('id')

    cnt = len(cols)

    # Build the result in one go with an explicit, ordered column list
    # (a set of column names has no defined order and is rejected by
    # recent pandas versions).
    res_df = pd.DataFrame(
        {
            'feature': cols,
            'uniqueCnt': [df[col].nunique() for col in cols],
        },
        columns=['feature', 'uniqueCnt'],
    )
    return cols, cnt, res_df


def find_nonNumerical_percent(df, col):
    """
    This function checks the numerical status of a given column: it reports
    whether the column is 100% numerical or contains entries that cannot be
    read as numbers.

    An entry counts as numerical when pd.to_numeric can parse it, so
    integers, decimals, negative numbers and numeric strings are all
    accepted. Missing values (NaN/None) are not counted as non-numerical.

    Input: df: a dataframe,
           col: a name of a column.
    output: index of non-numerical rows.
    """
    values = df[col]

    # Coercion turns every unparsable entry into NaN; entries that were
    # already missing are excluded so they are not reported as text.
    parsed = pd.to_numeric(values, errors='coerce')
    non_numeric_mask = parsed.isna() & values.notna()
    strIndex = values.index[non_numeric_mask.to_numpy()]

    if len(strIndex) > 0:
        print('{:0.2f}% of the "{}" column is not numerical.'.format(len(strIndex)*100/df.shape[0], col))
        print('The distribution is as follow:')
        print(df.loc[strIndex, col].value_counts())
        print('********************************************************')
    else:
        print('The "{}" column is verified; 100% numerical'.format(col))
    return strIndex



def check_numericalStatus(df, cols):
    """
    Report which of the given columns are stored with object dtype, i.e. are
    not purely numerical.

    Input: df: a dataframe,
           cols: a list of columns for being checked.
    output: a list of the columns whose dtype is object (empty when every
            column is numerical). A short report is printed as well.
    """
    # A column that pandas keeps as object dtype holds at least one entry
    # that is not a plain number.
    res = [name for name in cols if df[name].dtype == 'O']

    if res:
        print('Notice!!!\n')
        print('The following columns are not pure numerical. They have some non-numericla entry.')
        print(res)
        return res

    print('Dataframe is verified!')
    print('All columns are pure numerical.')
    return res


def convert_to_numeric(df, cols, distributionFlag=False):
    """
    This function will convert the passed columns into numerical dtype.
    Columns that cannot be converted as a whole are left untouched and
    reported, together with the rows that block the conversion.

    Note: the conversion is written back into `df` itself, so the caller's
    dataframe is modified.

    An entry counts as numerical when pd.to_numeric can parse it. Missing
    values (NaN/None) are not counted as non-numerical.

    Input: df: a dataframe,
           cols: a list of columns to convert.
           distributionFlag: a flag which indicates to show the distribution of non-numerical columns or not. Default value is False.
    output: df: converted dataframe to numeric (if applicable),
            catCols: list of non-numerical columns,
            res_df: a dataframe with the columns 'Feature' and 'NonNumerical %'
                    (share of non-numerical rows per non-numerical column);
                    empty when every column was converted,
            indexLst: a 2d list includes index of non-numerical content,
                      one entry per reported column.
    """
    for col in cols:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # At least one entry is not a number: keep the column as it is
            # (same outcome as the deprecated errors='ignore').
            pass

    # Features that still hold text after the conversion attempt.
    catCols = [
        col for col in cols
        if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])
    ]
    res_df = pd.DataFrame()
    indexLst = []  # Index of the non-numerical rows, one entry per feature.

    if len(catCols) == 0:
        print('All columns are verified; 100% numerical')
        return df, catCols, res_df, indexLst

    print('Notice!!!\n')
    print('The following columns are not pure numerical. They have some non-numericla data.')
    print(catCols)
    # We keep the info in colLst and rowLst for printing in dataframe.
    colLst = ['Feature', 'NonNumerical %']
    rowLst = []

    for col in catCols:
        # Coercion maps unparsable entries to NaN; entries that were already
        # missing are excluded so they are not reported as text.
        parsed = pd.to_numeric(df[col], errors='coerce')
        non_numeric_mask = parsed.isna() & df[col].notna()
        catIndex = df.index[non_numeric_mask.to_numpy()]

        if len(catIndex) > 0:
            indexLst.append(catIndex)
            nonNumericalValue = round(len(catIndex)*100/df.shape[0], 2)
            rowLst.append(dict(zip(colLst, [col, nonNumericalValue])))

            if distributionFlag:
                print('The distribution of nonnumerical data of "{}" is as follow:'.format(col))
                print(df.loc[catIndex, col].value_counts())
                print('********************************************************')

    res_df = pd.DataFrame(rowLst, columns=colLst)
    return df, catCols, res_df, indexLst

def print_nonNumericalContent(df, nonNumCols, nonNumIndexLst):
    """
    Print, for every listed column, the value counts of its non-numerical
    rows.

    Input: df: a dataframe,
           nonNumCols: a list of columns which are non-numerical.
           nonNumIndexLst: a 2d list; entry i holds the row index of the
                           non-numerical content of nonNumCols[i].
    output: Nothing is returned; the content is only printed.
    """
    for position, feature in enumerate(nonNumCols):
        # The index list is positional: entry `position` belongs to `feature`.
        rows = nonNumIndexLst[position]
        counts = df.loc[rows, feature].value_counts()

        print(f'Feature of "{feature}":')
        print(counts)
        print('*******************************************************')
    return


def make_box_plot(df, cols):
    """
    Draw one horizontal box plot per feature, stacked vertically in a single
    figure.

    Input: df: a dataframe,
           cols: a list of feature names for box plot.
    output: Nothing is returned; the function only draws the box plots.
    """
    cnt = len(cols)

    # squeeze=False always yields a 2-d grid of axes, so one feature and many
    # features go through the same loop.
    fig, grid = plt.subplots(cnt, 1, figsize=(5, 3 * cnt), squeeze=False)

    for axis, col in zip(grid[:, 0], cols):
        sns.boxplot(x=df[col], ax=axis)
        axis.set_xlabel(col)

    # Only the stacked layout is tightened; a single plot is left as drawn.
    if cnt != 1:
        fig.tight_layout()
    return
        
        
def removeOutliers_IQR(df, num_Feat, nIQR = 1.5, inplace=False):
    """
    This function will remove outliers using the interquartile-range approach.

    A row is an outlier when, in at least one of the given features, its
    value lies below q1 - nIQR*IQR or above q3 + nIQR*IQR, where q1/q3 are
    the 25% / 75% quantiles of that feature and IQR = q3 - q1.
    A short summary (count and percentage of dropped rows) is printed.

    Input: df: a dataframe,
           num_Feat: a list of numerical feature names for evaluation of
                     outliers (a single name given as a string is accepted),
           nIQR: the coefficient of interquartile range,
           inplace: a flag which determines if modification be applied to original df or not.
    output (in this order):
            df: the dataframe that was passed in. If inplace=False it is
                unchanged, otherwise the outlier rows were dropped from it.
            non_outlier_df: a copy of the original dataframe without the outlier rows.
            dropped index: the row index of the outliers in the original df.
    """
    # A bare column name would select a Series, on which the row-wise
    # any(axis=1) below cannot work.
    if isinstance(num_Feat, str):
        num_Feat = [num_Feat]

    totalCnt = df.shape[0]
    numeric_df = df[num_Feat]

    q1 = numeric_df.quantile(.25)
    q3 = numeric_df.quantile(.75)
    iqr = q3 - q1
    lower = q1 - nIQR * iqr
    upper = q3 + nIQR * iqr

    # True for every row that is out of range in at least one feature.
    outlier_mask = ((numeric_df < lower) | (numeric_df > upper)).any(axis=1)
    dropped_index = numeric_df[outlier_mask].index

    droppedCnt = len(dropped_index)
    # Guard the percentage against an empty dataframe.
    droppedPercent = 100 * droppedCnt / totalCnt if totalCnt else 0.0
    print('\nFor nIQR = {:.2f}'.format(nIQR))
    print('Count of dropped rows: {}'.format(droppedCnt))
    print('Percentage of dropped rows: {:.3f}%'.format(droppedPercent))

    # drop() returns a new dataframe, so the original stays intact here.
    non_outlier_df = df.drop(dropped_index)
    if inplace:
        df.drop(dropped_index, inplace=True)

    return df, non_outlier_df, dropped_index


def make_count_plot(df, cols, annotate=False, xRotation = 0, title=None, width=5, height=3):
    """
    This function will calculate and show count plots, one per feature,
    stacked vertically in a single figure.

    All options are honoured for one feature as well as for several
    features (width/height/title used to be ignored for several features
    and xRotation for a single one).

    Input: df: a dataframe,
           cols: a list of feature names for count plot. Nothing is drawn
                 for an empty list.
           annotate: a condition flag, when it is True the percentage of each bar
                     (bar height relative to the number of rows of df) is written
                     above the bar. The default value is False.
           xRotation: a degree for rotation of x-axis ticks. The default value is 0.
           title: the title of the plot. With one feature it is the axes title,
                  with several features it is the figure title.
           width: the width of the plot. The default value is 5.
           height: the height of each single plot. The default value is 3.
    output: This function only shows count plot.
    """
    totalcnt = df.shape[0]
    cnt = len(cols)
    if cnt == 0:
        return

    # squeeze=False always yields a 2-d grid of axes, so one feature and many
    # features share the same drawing code.
    fig, axes = plt.subplots(cnt, 1, figsize=(width, height * cnt), squeeze=False)

    for ax, col in zip(axes[:, 0], cols):
        sns.countplot(x=df[col], ax=ax)
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

        if xRotation != 0:
            ax.tick_params(axis='x', labelrotation=xRotation)

        if annotate:
            for bar in ax.patches:
                # Top edge and horizontal centre of the bar.
                bar_points = bar.get_bbox().get_points()
                bar_top = bar_points[1, 1]
                bar_center = bar_points[:, 0].mean()
                ax.annotate('{:.1f}%'.format(bar_top * 100 / totalcnt),
                            (bar_center, bar_top), ha='center', va='bottom')

    if title is not None:
        if cnt == 1:
            axes[0, 0].set_title(title)
        else:
            fig.suptitle(title)

    fig.tight_layout()
    return
        
        
def make_multi_box_plot(df, cols,  refCol, xRotation=0, width=5, height=3.5):
    """
    This function will calculate and show box plots of a reference feature
    grouped by each categorical feature, one plot per categorical feature,
    stacked vertically in a single figure.

    Input: df: a dataframe,
           cols: a list of categorical features. They are for x-axis of box plot.
                 Nothing is drawn for an empty list.
           refCol: a feature that the boxplot of categorical features will be calculated based on that.
           xRotation: a degree for rotation of x-axis ticks. The default value is 0.
           width: the width of the plot. The default value is 5.
           height: the height of each single plot. The default value is 3.5.
    output: This function only shows box plot.
    """
    cnt = len(cols)
    if cnt == 0:
        return

    # squeeze=False always yields a 2-d grid of axes, so one feature and many
    # features share the same drawing code. (The former single-feature branch
    # referenced the undefined names `axes`/`i` when xRotation was set.)
    fig, axes = plt.subplots(cnt, 1, figsize=(width, height * cnt), squeeze=False)

    for ax, col in zip(axes[:, 0], cols):
        sns.boxplot(x=df[col], y=df[refCol], ax=ax)
        ax.set_title('{} vs {}'.format(col, refCol))
        ax.set_xlabel(col)

        if xRotation != 0:
            ax.tick_params(axis='x', labelrotation=xRotation)

    fig.tight_layout()
    return
    
    
def make_multi_histogram(df, cols, xRotation=0, width=5, height=3.5):
    """
    Draw one histogram per numerical feature, stacked vertically in a single
    figure.

    Input: df: a dataframe,
           cols: a list of numerical features for histogram.
           xRotation: a degree for rotation of x-axis ticks. The default value is 0.
           width: the width of the plot. The default value is 5.
           height: the height of each single plot. The default value is 3.5.
    output: Nothing is returned; the function only draws the histograms.
    """
    cnt = len(cols)

    # squeeze=False always yields a 2-d grid of axes, so one feature and many
    # features go through the same loop.
    fig, grid = plt.subplots(cnt, 1, figsize=(width, height * cnt), squeeze=False)

    # NOTE(review): seaborn marks distplot as deprecated in favour of
    # histplot; confirm the installed seaborn version before migrating.
    bar_style = dict(edgecolor="k")
    for axis, col in zip(grid[:, 0], cols):
        sns.distplot(df[col], kde=False, hist_kws=bar_style, ax=axis)
        axis.set_xlabel(col)
        axis.set_ylabel('Frequency')
        axis.set_title('Distribution of {}'.format(col))
        if xRotation != 0:
            axis.tick_params(axis='x', labelrotation=xRotation)

    # Only the stacked layout is tightened; a single plot is left as drawn.
    if cnt != 1:
        fig.tight_layout()
    return

    
def make_scatter_plot(df, cols,  refCol, hueCol=None, xRotation=0, width=5, height=3.5):
    """
    This function will calculate and show scatter plots of a reference
    feature against each numerical feature, one plot per numerical feature,
    stacked vertically in a single figure.

    Input: df: a dataframe,
           cols: a list of numerical features. They are for x-axis of scatter plot.
                 Nothing is drawn for an empty list.
           refCol: a feature that the scatterplot of the numerical features will be calculated based on that.
           hueCol: a feature for being hue in scatter plot. No hue is used when it is None.
           xRotation: a degree for rotation of x-axis ticks. The default value is 0.
           width: the width of the plot. The default value is 5.
           height: the height of each single plot. The default value is 3.5.
    output: This function only shows scatter plot.
    """
    cnt = len(cols)
    if cnt == 0:
        return

    # squeeze=False always yields a 2-d grid of axes, so one feature and many
    # features share the same drawing code. (The former single-feature branch
    # referenced the undefined names `axes`/`i` when xRotation was set.)
    fig, axes = plt.subplots(cnt, 1, figsize=(width, height * cnt), squeeze=False)

    # Pass hue only when a hue feature was requested.
    hue_kwargs = {} if hueCol is None else {'hue': df[hueCol]}

    for ax, col in zip(axes[:, 0], cols):
        sns.scatterplot(x=df[col], y=df[refCol], ax=ax, **hue_kwargs)
        ax.set_title('{} vs {}'.format(col, refCol))
        ax.set_xlabel(col)

        if xRotation != 0:
            ax.tick_params(axis='x', labelrotation=xRotation)

    fig.tight_layout()
    return

        
def scatter_plot_specified(df, cols,  refCol, varName, varData, hueCol=None, xRotation=0, width=5, height=3.5):
    """
    Draw scatter plots of a reference feature against each numerical feature,
    restricted to the rows where one feature equals a given value. One plot
    per numerical feature, stacked vertically in a single figure.

    Input: df: a dataframe,
           cols: a list of numerical features. They are for x-axis of scatter plot.
           refCol: the feature plotted on the y-axis of every scatter plot.
           varName: name of the feature used as a filter.
           varData: value of the filter feature; only rows with
                    df[varName] == varData are plotted.
           hueCol: a feature for being hue in scatter plot. No hue is used when it is None.
           xRotation: a degree for rotation of x-axis ticks. The default value is 0.
           width: the width of the plot. The default value is 5.
           height: the height of each single plot. The default value is 3.5.
    output: Nothing is returned; the function only draws the scatter plots.
    """
    cnt = len(cols)

    # squeeze=False always yields a 2-d grid of axes, so one feature and many
    # features go through the same loop.
    fig, grid = plt.subplots(cnt, 1, figsize=(width, height * cnt), squeeze=False)

    # Keep only the rows that satisfy the filter.
    subset = df[df[varName] == varData]
    panel_title = 'Distribution where {}={}'.format(varName, varData)

    # Pass hue only when a hue feature was requested.
    hue_kwargs = {} if hueCol is None else {'hue': subset[hueCol]}

    for axis, col in zip(grid[:, 0], cols):
        sns.scatterplot(x=subset[col], y=subset[refCol], ax=axis, **hue_kwargs)
        axis.set_title(panel_title)
        axis.set_xlabel(col)
        if xRotation != 0:
            axis.tick_params(axis='x', labelrotation=xRotation)

    fig.tight_layout()
    return
    
    
def makeHeatmap(df, cols, annot = False, line=False):
    """
    Draw a heatmap of the pairwise correlation of the given features.

    Note: the seaborn style is switched to "whitegrid" as a side effect.

    Input: df: a dataframe,
           cols: a list of numerical features.
           annot: a conditional flag. If True, then the value of correlation will be printed within heatmap.
           line: a conditional flag. If True, thin gray cell borders are drawn
                 inside the heatmap; if False, no borders are drawn. For any
                 other value no heatmap is drawn at all.
    output: Nothing is returned; the function only draws the heatmap.
    """
    palette = plt.cm.YlGnBu
    sns.set(style="whitegrid")

    plt.figure(figsize=(12,10))
    plt.title('Correlation Heatmap', size=15)

    # Cell-border options depend on the `line` flag.
    if line == True:
        border_kwargs = dict(linewidths=0.005, linecolor='gray')
    elif line == False:
        border_kwargs = {}
    else:
        return

    sns.heatmap(df[cols].corr(), vmax=1.0, cmap=palette, annot=annot, square=True, **border_kwargs)
    return