In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import re
import time
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")
import scipy.stats

In [1]:
def make_metadata(df):
    """
    Build a per-column metadata table for ``df``.

    For every column the table records:

    * ``role``      - 'target' for the column named 'target', 'id' for the
                      column named 'id', otherwise 'input'.
    * ``type``      - coarse variable kind, decided in this order:
                      datetime dtype -> 'datetime';
                      name contains 'bin' or is 'target' -> 'binary';
                      name contains 'cat' or is 'id' -> 'categorical';
                      boolean dtype -> 'binary';
                      float dtype -> 'numeric';
                      integer dtype -> 'ordinal';
                      anything else (object, string, category, ...) -> 'categorical'.
                      An 'ordinal' column with exactly two distinct values is
                      reported as 'binary'.
    * ``dtype``     - the dtype of the column.
    * ``uniqueCnt`` - ``Series.nunique()`` of the column.

    Returns
    -------
    pandas.DataFrame indexed by ``varname`` with the columns
    ['role', 'type', 'dtype', 'uniqueCnt'].
    """
    dtype_api = pd.api.types
    data = []
    for feature in df.columns:
        # Defining the role
        if feature == 'target':
            role = 'target'
        elif feature == 'id':
            role = 'id'
        else:
            role = 'input'

        # Defining the data type
        dtype = df[feature].dtype
        uniqueCnt = df[feature].nunique()

        # Column labels are not guaranteed to be strings; the name-based rules
        # below work on the textual form of the label.
        feature_name = str(feature)

        # Defining the type.  The chain always ends in an ``else`` so the value
        # is decided afresh for every column and can never be left unset or
        # inherited from the previous column.
        if dtype_api.is_datetime64_any_dtype(dtype):
            var_type = 'datetime'
        elif 'bin' in feature_name or feature == 'target':
            var_type = 'binary'
        elif 'cat' in feature_name or feature == 'id':
            var_type = 'categorical'
        elif dtype_api.is_bool_dtype(dtype):
            var_type = 'binary'
        elif dtype_api.is_float_dtype(dtype):
            # any float width, including float32 columns
            var_type = 'numeric'
        elif dtype_api.is_integer_dtype(dtype):
            # any integer width, including int8/uint8/... columns
            var_type = 'ordinal'
        else:
            var_type = 'categorical'

        if uniqueCnt == 2 and var_type == 'ordinal':
            var_type = 'binary'

        # Creating a dictionary for adding a row to metadata:
        data.append({
            'varname': feature,
            'role': role,
            'type': var_type,
            'dtype': dtype,
            'uniqueCnt': uniqueCnt
        })

    metadata = pd.DataFrame(data, columns=['varname', 'role', 'type', 'dtype', 'uniqueCnt'])
    metadata = metadata.set_index('varname')
    return metadata


def make_typedf(df, metadata, typeName):
    """
    Summarise the cardinality of every feature of a given metadata type.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are described by ``metadata``.
    metadata : pandas.DataFrame
        Table indexed by feature name with a 'type' column.
    typeName : str
        Value of ``metadata['type']`` to select.

    Returns
    -------
    cols : list
        Selected feature names in metadata order, with 'id' removed.
    cnt : int
        Number of selected features.
    res_df : pandas.DataFrame
        Columns ['feature', 'uniqCnt', 'percentage'] sorted by 'uniqCnt'
        ascending; 'percentage' is the unique count as a percentage of the
        number of rows in ``df`` (0.0 when ``df`` has no rows).
    """
    cols = metadata[metadata['type'] == typeName].index.tolist()
    if 'id' in cols:
        cols.remove('id')

    cnt = len(cols)
    n_rows = df.shape[0]

    uniqLst = [df[col].nunique() for col in cols]
    # Guard against an empty frame, where the ratio is undefined.
    percentLst = [round(x * 100 / n_rows, 2) if n_rows else 0.0 for x in uniqLst]

    # Build the frame in one go with an explicit, ordered column list
    # (a set of column names has no defined order and is rejected by
    # recent pandas versions).
    res_df = pd.DataFrame(
        {'feature': cols, 'uniqCnt': uniqLst, 'percentage': percentLst},
        columns=['feature', 'uniqCnt', 'percentage'],
    )
    res_df = res_df.sort_values(by='uniqCnt', ignore_index=True)
    return cols, cnt, res_df

def findSeason(dateCol, rowNumber):
    """
    Map every date in ``dateCol`` to a season name based on its day of the year.

    dateCol   : object exposing ``.dt.dayofyear`` (e.g. a datetime Series).
    rowNumber : length of the returned list.  Positions past the last date
                keep the empty string; a value smaller than the number of
                dates makes the list assignment fail with IndexError.

    Day-of-year ranges used:
        80..171  -> 'Spring'
        172..263 -> 'Summer'
        264..354 -> 'Fall'
        anything else -> 'Winter'

    Returns a list of season strings.
    """
    # First day-of-year of each season; the day of the year is used instead
    # of the month to place the boundaries.
    spring_start, summer_start, fall_start, winter_start = 80, 172, 264, 355

    def _season_for(day):
        if spring_start <= day < summer_start:
            return 'Spring'
        if summer_start <= day < fall_start:
            return 'Summer'
        if fall_start <= day < winter_start:
            return 'Fall'
        return 'Winter'

    seasons = [''] * rowNumber
    for position, day in enumerate(dateCol.dt.dayofyear):
        seasons[position] = _season_for(day)
    return seasons

def make_box_plot(df, cols):
    """
    Draw one horizontal seaborn box plot per column in ``cols``, stacked
    vertically in a single figure (5 x 3 inches per plot).

    df   : DataFrame holding the data.
    cols : sequence of column names to plot; each name is used as the x label.

    Returns None; the figure is left on the current matplotlib state.
    """
    n_plots = len(cols)
    # squeeze=False always yields a 2-D axes array, so one column and many
    # columns go through the same loop.
    fig, axes_grid = plt.subplots(n_plots, 1, figsize=(5, 3 * n_plots), squeeze=False)

    for column, axis in zip(cols, axes_grid[:, 0]):
        sns.boxplot(x=df[column], ax=axis)
        axis.set_xlabel(column)

    # Layout tightening is only applied to the multi-plot figure.
    if n_plots != 1:
        fig.tight_layout()
    return None

def make_binarybox_plot(df, cols, target, xRotation=0):
    """
    Draw box plots of ``target`` grouped by each column in ``cols``.

    A single column is drawn on one 7x3 figure; several columns are laid out
    on a two-column grid (12 wide, 4 high per grid row).

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both the grouping columns and ``target``.
    cols : sequence
        Column names used on the x axis, one subplot each.
    target : str
        Column plotted on the y axis of every subplot.
    xRotation : number, default 0
        Rotation of the x tick labels in degrees; 0 leaves them untouched.
        ``True`` is treated as 90 for callers that used the argument as an
        on/off flag.

    Returns
    -------
    None.  Nothing is drawn when ``cols`` is empty.
    """
    cnt = len(cols)
    if cnt == 0:
        # An empty grid cannot be created; there is nothing to draw.
        return None

    # Honour the requested angle instead of always rotating by 90 degrees.
    rotation = 90 if xRotation is True else xRotation

    if cnt == 1:
        fig, single_ax = plt.subplots(figsize=(7, 3))
        axes = [single_ax]
    else:
        xCnt = int(np.ceil(cnt / 2))
        fig, grid = plt.subplots(xCnt, 2, figsize=(12, 4 * xCnt))
        axes = list(grid.flatten())

    for col, ax in zip(cols, axes):
        sns.boxplot(x=df[col], y=df[target], ax=ax)
        ax.set_title('{} Distribution'.format(target))
        ax.set_xlabel(col)
        ax.set_ylabel('{} Range'.format(target))
        if rotation != 0:
            ax.tick_params(axis='x', labelrotation=rotation)

    # With an odd number of columns the last grid cell has no data; hide it
    # rather than showing an empty frame.
    for unused_ax in axes[cnt:]:
        unused_ax.set_visible(False)

    if cnt > 1:
        fig.tight_layout()
    return None
        
       
def make_bar_plot(x, y, xLabel, yLabel, hue=None, xRotation=0, title='Distribution'):
    """
    This function will make a bar plot on a new 5x3 figure.

    x: categorical or ordinal List,
    y: Numerical list
    xLabel, yLabel: axis labels.
    hue: optional grouping variable forwarded to ``sns.barplot``.
    xRotation: rotation of the x tick labels in degrees; 0 leaves them
               untouched.  ``True`` is treated as 90 for callers that used
               the argument as an on/off flag.
    title: axes title (defaults to 'Distribution').
    """
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.barplot(x=x, y=y, hue=hue, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xLabel)
    ax.set_ylabel(yLabel)

    # Honour the requested angle instead of always rotating by 90 degrees.
    rotation = 90 if xRotation is True else xRotation
    if rotation != 0:
        ax.tick_params(axis='x', labelrotation=rotation)
        
def make_count_plot(df, cols, annotate=False, xRotation=0, title=None):
    """
    Draw one seaborn count plot per column in ``cols``, stacked vertically
    in a single figure (5 x 3 inches per plot).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    cols : sequence
        Column names; each gets its own subplot.  Nothing is drawn when empty.
    annotate : bool, default False
        When true, write each bar's share of ``df.shape[0]`` (in percent,
        one decimal) above the bar.
    xRotation : number, default 0
        Rotation of the x tick labels in degrees; 0 leaves them untouched.
        ``True`` is treated as 90 for callers that used the argument as an
        on/off flag.
    title : str, optional
        Title placed on the top subplot.
    """
    totalcnt = df.shape[0]
    cnt = len(cols)
    if cnt == 0:
        return

    # Honour the requested angle instead of always rotating by 90 degrees.
    rotation = 90 if xRotation is True else xRotation

    # squeeze=False keeps a 2-D axes array so one code path serves both the
    # single-column and the multi-column case.
    fig, axes_grid = plt.subplots(cnt, 1, figsize=(5, 3 * cnt), squeeze=False)
    axes = axes_grid[:, 0]

    for col, ax in zip(cols, axes):
        sns.countplot(x=df[col], ax=ax)
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

        if rotation != 0:
            ax.tick_params(axis='x', labelrotation=rotation)

        if annotate:
            for p in ax.patches:
                # Bounding box corners: [[x0, y0], [x1, y1]]
                corners = p.get_bbox().get_points()
                x = corners[:, 0]   # left and right edge of the bar
                y = corners[1, 1]   # upper edge of the bar
                ax.annotate('{:.1f}%'.format(y * 100 / totalcnt), (x.mean(), y), ha='center', va='bottom')

    # The title is applied regardless of how many columns are plotted.
    if title is not None:
        axes[0].set_title(title)

    if cnt > 1:
        fig.tight_layout()
        
        
        
def removeOutliers_IQR(df, num_Feat, nIQR = 1.5, inplace=False ,imputation=0):
    """
    Drop the rows of ``df`` that contain an IQR outlier in any of ``num_Feat``.

    A value is an outlier when it lies below ``q1 - nIQR * iqr`` or above
    ``q3 + nIQR * iqr`` of its own column, where q1/q3 are the 25%/75%
    quantiles and ``iqr = q3 - q1``.  The number and percentage of dropped
    rows are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    num_Feat : str or sequence of str
        Numeric column(s) inspected for outliers.
    nIQR : float, default 1.5
        Width of the accepted band in multiples of the IQR.
    inplace : bool, default False
        When true, the outlier rows are also dropped from ``df`` itself.
    imputation : any
        Accepted for backward compatibility; not used.

    Returns
    -------
    (cleaned_df, dropped_index)
        ``cleaned_df`` is a new DataFrame without the outlier rows and
        ``dropped_index`` holds the index labels of the rows that were dropped.
        Rows are dropped by label, so every row sharing a dropped label goes.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(num_Feat, str):
        num_Feat = [num_Feat]
    num_Feat = list(num_Feat)

    totalCnt = df.shape[0]
    numeric_df = df[num_Feat]

    q1 = numeric_df.quantile(.25)
    q3 = numeric_df.quantile(.75)
    iqr = q3 - q1
    lower = (q1 - nIQR * iqr)
    upper = (q3 + nIQR * iqr)

    # Cell-wise outlier flags reduced to one flag per row: a row is dropped
    # as soon as any inspected column is out of range.
    outlier_rows = ((numeric_df < lower) | (numeric_df > upper)).any(axis=1).to_numpy()
    dropped_index = df.index[outlier_rows]

    droppedCnt = len(dropped_index)
    droppedPercent = 100 * droppedCnt / totalCnt if totalCnt else 0.0
    print('\nFor nIQR = {:.2f}'.format(nIQR))
    print('Count of dropped rows: {}'.format(droppedCnt))
    print('Percentage of dropped rows: {:.3f}%'.format(droppedPercent))

    temp_df = df.drop(index=dropped_index)
    if inplace:
        df.drop(index=dropped_index, inplace=True)

    return temp_df, dropped_index



def change_datatype(df): #minimize used memory
    """
    Downcast the numeric columns of ``df`` in place to reduce memory use.

    Columns returned by ``select_dtypes(include=['int'])`` are cast to the
    first dtype in the list below whose range contains both the column
    minimum and maximum; columns that fit none of them are left unchanged:

        int8, uint8, int16, uint16, int32, uint32

    Columns returned by ``select_dtypes(include=['float'])`` are always cast
    to float32.

    Returns None; ``df`` is modified.
    """
    # (target dtype, inclusive lower bound, exclusive upper bound),
    # tried from top to bottom.
    integer_targets = (
        (np.int8, -2**7, 2**7),
        (np.uint8, 0, 2**8),
        (np.int16, -2**15, 2**15),
        (np.uint16, 0, 2**16),
        (np.int32, -2**31, 2**31),
        (np.uint32, 0, 2**32),
    )

    integer_columns = df.select_dtypes(include=['int']).columns.tolist()
    for name in integer_columns:
        highest = df[name].max()
        lowest = df[name].min()
        for target_dtype, low_bound, high_bound in integer_targets:
            if highest < high_bound and lowest >= low_bound:
                df[name] = df[name].astype(target_dtype)
                break

    float_columns = df.select_dtypes(include=['float']).columns.tolist()
    for name in float_columns:
        df[name] = df[name].astype(np.float32)
        

def size_MB(obj):
    """
    Return ``sys.getsizeof(obj)`` expressed in units of 10**6 bytes,
    rounded to two decimals.
    """
    import sys
    return round(sys.getsizeof(obj) / 10**6, 2)


def miss_value_func(in_data, classificatio_flag=0, only_miss_col=0):
    """
    Tabulate missing values per column of ``in_data``.

    The result is indexed by column name, sorted by missing count
    (descending), with the columns:

    * ``miss_value``  - number of null cells in the column
    * ``percentage``  - that count as a percentage of the number of rows

    Two summary lines (feature count, number of columns with missing values)
    are printed.

    Parameters
    ----------
    in_data : pandas.DataFrame
        Data to inspect.  With ``classificatio_flag=1`` it is supposed to
        have a target column with the name 'target'.
    classificatio_flag : {0, 1}, default 0
        0 returns the basic table.  1 additionally breaks the missing counts
        down by target level, adding for every level ``L`` (in order of first
        appearance, grouped in this order):

        * ``count_miss: target=L``   - missing cells among rows with that level
        * ``percent_miss: target=L`` - that count as a percentage of the
          column's total missing count
        * ``norm_miss: target=L``    - that count as a percentage of the
          number of rows with that level

        Rows of columns without missing values keep 0 in all of these.
    only_miss_col : {0, 1}, default 0
        1 keeps only the columns that have at least one missing value.

    Returns
    -------
    pandas.DataFrame, or None when ``classificatio_flag`` is 1 and there is no
    'target' column (an error line is printed) or when ``classificatio_flag``
    is neither 0 nor 1.
    """
    out_table = in_data.isnull().sum().to_frame('miss_value')
    out_table['percentage'] = (100 * out_table['miss_value']) / len(in_data)
    out_table = out_table.sort_values(by='miss_value', ascending=False)

    # After the descending sort the columns with missing values occupy the
    # first ``n_miss_cols`` rows of the table.
    n_miss_cols = int((out_table['miss_value'] != 0).sum())
    print('The input data frame has {:d} features.'.format(in_data.shape[1]))
    print('The data frame has {:d} columns with missing values'.format(n_miss_cols))

    if only_miss_col == 1:
        out_table = out_table.iloc[:n_miss_cols, :].copy()

    if classificatio_flag == 0:
        return out_table
    if classificatio_flag != 1:
        return None

    if 'target' not in in_data.columns:
        print("Error!  The dataframe does not have column with name 'target'.  Try again...")
        return None

    target = in_data['target']
    uniq_levels = list(target.unique())

    miss_features = out_table.index[:n_miss_cols]
    # Boolean matrix (rows x columns-with-missing-values) of null cells.
    miss_flags = in_data[miss_features].isnull().to_numpy()
    total_missing = out_table['miss_value'].to_numpy()[:n_miss_cols]
    n_table_rows = len(out_table)

    count_cols, percent_cols, norm_cols = [], [], []
    for lev in uniq_levels:
        # A missing target value never compares equal to itself, so that
        # level is matched with isna() instead of ==.
        if pd.isna(lev):
            level_rows = target.isna().to_numpy()
        else:
            level_rows = (target == lev).fillna(False).to_numpy(dtype=bool)
        n_level = int(level_rows.sum())

        level_counts = miss_flags[level_rows].sum(axis=0)

        counts = np.zeros(n_table_rows, dtype='int64')
        percents = np.zeros(n_table_rows, dtype='float64')
        norms = np.zeros(n_table_rows, dtype='float64')
        counts[:n_miss_cols] = level_counts
        # total_missing is non-zero for every one of these rows by construction.
        percents[:n_miss_cols] = 100 * level_counts / total_missing
        if n_level:
            norms[:n_miss_cols] = 100 * level_counts / n_level

        count_cols.append(('count_miss: target=' + str(lev), counts))
        percent_cols.append(('percent_miss: target=' + str(lev), percents))
        norm_cols.append(('norm_miss: target=' + str(lev), norms))

    # Column order: all counts, then all percentages, then all normalised values.
    for col_name, values in count_cols + percent_cols + norm_cols:
        out_table[col_name] = values

    return out_table

        
def add_missCol(df):
    """
    Add a 'misscnt' column holding the number of missing values in each row.

    ``df`` is modified in place: its index is reset to 0..n-1 (the old index
    is discarded) and the 'misscnt' column is added or overwritten.  Every
    column of ``df`` is inspected and every value pandas treats as null
    (NaN, None, NaT, ...) is counted.

    Returns True.
    """
    df.reset_index(inplace=True, drop=True)
    # Row-wise count of null cells, computed in one vectorised pass over all
    # columns so the result does not depend on any particular column name.
    df['misscnt'] = df.isnull().sum(axis=1).to_numpy()
    return True


def check_numCols(df, cols):
    """
    Check whether the given columns of ``df`` all have a numeric dtype.

    A short report is printed.  Any column whose dtype is not numeric
    (object, string, category, datetime, ...) is reported, not only
    object-dtype columns.

    Parameters
    ----------
    df : pandas.DataFrame
    cols : iterable of column names to check.

    Returns
    -------
    (df, res) where ``df`` is the input frame, unchanged, and ``res`` is the
    list of columns in ``cols`` that are not numeric.
    """
    res = [col for col in cols if not pd.api.types.is_numeric_dtype(df[col].dtype)]
    if(len(res)>0):
        print('Notice!!!\n')
        print('The following columns are not pure numerical. They have some non-numericla data.')
        print(res)
    else:
        print('Verified!')
        print('All columns are pure numerical.')
    return df, res

def convert_check_numCols(df, cols, distributionFlag=False):
    """
    Convert the given columns of ``df`` to numeric dtype where possible and
    report the columns that could not be converted.

    Each column in ``cols`` is passed through ``pd.to_numeric``; a column
    containing any value that cannot be parsed is left untouched.  ``df`` is
    modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
    cols : iterable of column names to convert.
    distributionFlag : bool, default False
        When true, print the value counts of the non-numerical entries of
        every column that could not be converted.

    Returns
    -------
    None
        when every column ended up numeric (a confirmation is printed).
        NOTE: callers that unpack the result must handle this case.
    (df, catCols, res_df, indexLst)
        otherwise, where
        * ``catCols``  - columns that are still not numeric,
        * ``res_df``   - DataFrame ['Feature', 'NonNumerical %'] giving, per
          such column, the percentage of rows holding a non-numerical entry,
        * ``indexLst`` - list with, per such column, the index labels of the
          non-numerical entries.
        An entry counts as non-numerical when it is present (not null) and
        cannot be parsed as a number.
    """
    # First, we will change datatype into numeric.  A column that fails to
    # parse is deliberately kept as it is.
    for col in cols:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            pass

    # We will save features having nonnumerical data.
    catCols = [col for col in cols if not pd.api.types.is_numeric_dtype(df[col].dtype)]

    if len(catCols) == 0:
        print('All columns are verified; 100% numerical')
        return

    print('Notice!!!\n')
    print('The following columns are not pure numerical. They have some non-numericla data.')
    print(catCols)

    # We will save index of data which are not numerical in indexLst.
    indexLst = []

    # We keep the info in colLst and rowLst for printing in dataframe.
    colLst = ['Feature', 'NonNumerical %']
    rowLst = []

    for col in catCols:
        # Unparseable values become NaN under 'coerce'; cells that were
        # already null are excluded so only real non-numerical entries remain.
        coerced = pd.to_numeric(df[col], errors='coerce')
        bad_mask = (coerced.isna() & df[col].notna()).to_numpy()
        catIndex = df.index[bad_mask]

        if len(catIndex) > 0:
            indexLst.append(catIndex)
            nonNumericalValue = round(len(catIndex) * 100 / df.shape[0], 2)
            # Will save nonnumerical data for adding into dataframe
            rowLst.append({'Feature': col, 'NonNumerical %': nonNumericalValue})

            if distributionFlag:
                print('The distribution of nonnumerical data of "{}" is as follow:'.format(col))
                print(df.loc[bad_mask, col].value_counts())
                print('********************************************************')

    res_df = pd.DataFrame(rowLst, columns=colLst)
    return df, catCols, res_df, indexLst


def analyze_prediction(y_predicted, y_observed, result_matrix = None, model_name=None):
    """
    Score regression predictions with R2 and RMSE.

    Parameters
    ----------
    y_predicted : array-like
        Predicted values.
    y_observed : array-like
        Observed (true) values.
    result_matrix : pandas.DataFrame, optional
        Table in which the scores are recorded, at rows 'R2' and 'RMSE' of
        the column ``model_name``.  Required when ``model_name`` is given.
    model_name : str, optional
        Column of ``result_matrix`` to write to.  When omitted nothing is
        recorded.

    Returns
    -------
    (r2, rmse)

    Raises
    ------
    ValueError
        If ``model_name`` is given without a ``result_matrix``.
    """
    # Fail early with a clear message instead of an attribute error on None.
    if model_name is not None and result_matrix is None:
        raise ValueError("result_matrix must be provided when model_name is given")

    rmse = np.sqrt(mean_squared_error(y_true=y_observed, y_pred=y_predicted))
    r2 = r2_score(y_true=y_observed, y_pred=y_predicted)

    if model_name is not None:
        result_matrix.loc['R2', model_name] = r2
        result_matrix.loc['RMSE', model_name] = rmse

    # Header for the returned tuple, which is shown as the cell output when
    # the call is the last expression of a notebook cell.
    print("R2, RMSE:")
    return r2, rmse


def makeHeatmap(df, cols, annot = False, line=False):
    """
    Plot the correlation matrix of ``df[cols]`` as a seaborn heatmap on a
    new 12x10 figure titled 'Correlation Heatmap'.

    df    : DataFrame holding the data.
    cols  : columns whose pairwise correlation (``DataFrame.corr``) is shown.
    annot : forwarded to ``sns.heatmap`` (write the values into the cells).
    line  : True draws thin gray lines between the cells, False draws none;
            for any other value no heatmap is drawn.

    Note: calls ``sns.set(style="whitegrid")``.
    """
    palette = plt.cm.YlGnBu
    sns.set(style="whitegrid")

    plt.figure(figsize=(12,10))
    plt.title('Correlation Heatmap', size=15)

    # Only the cell-separator styling differs between the two modes.
    if line == True:
        separator_style = {'linewidths': 0.005, 'linecolor': 'gray'}
    elif line == False:
        separator_style = {}
    else:
        return

    sns.heatmap(df[cols].corr(), vmax=1.0, cmap=palette, annot=annot, square=True, **separator_style)