In [3]:
import pandas as pd
import numpy as np

In [4]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as text.

    Parameters
    ----------
    pandas_obj : pandas.DataFrame or another pandas object
        Anything exposing ``memory_usage(deep=True)``.

    Returns
    -------
    str
        The size in units of 1024**2 bytes, formatted like ``'1.23 MB'``.
    """
    footprint = pandas_obj.memory_usage(deep=True)
    # A DataFrame reports one figure per column (plus the index), so total them.
    total_bytes = footprint.sum() if isinstance(pandas_obj, pd.DataFrame) else footprint
    return '{:03.2f} MB'.format(total_bytes / 1024 ** 2)

In [5]:
def different_state(s1, s2):
    """Flag whether two values differ.

    Returns 0 when ``s1 == s2`` holds and 1 otherwise.
    """
    return 0 if s1 == s2 else 1

In [6]:
def reducememory(gl):
    """Return a copy of ``gl`` whose columns use smaller dtypes.

    * ``int64`` columns are downcast with ``pd.to_numeric``: to an unsigned
      type when the column holds no negative value, otherwise to the
      smallest signed integer type.
    * ``float`` columns are downcast with ``downcast='float'``.
    * ``object`` columns become ``category`` when fewer than half of their
      values are distinct; other object columns are left unchanged.

    Parameters
    ----------
    gl : pandas.DataFrame
        Frame to optimise. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``gl`` with the same index and columns.
    """
    optimized_gl = gl.copy()

    # Optimise integers.
    for col in gl.select_dtypes(include=['int64']).columns:
        values = gl[col]
        # downcast='unsigned' leaves a column untouched when it contains a
        # negative value, so use signed downcasting for those columns.
        non_negative = len(values) == 0 or values.min() >= 0
        optimized_gl[col] = pd.to_numeric(
            values, downcast='unsigned' if non_negative else 'integer')

    # Optimise floats.
    for col in gl.select_dtypes(include=['float']).columns:
        optimized_gl[col] = pd.to_numeric(gl[col], downcast='float')

    # Optimise objects. Each column is written straight into the copy, so the
    # original index is kept without going through an intermediate frame.
    for col in gl.select_dtypes(include=['object']).columns:
        values = gl[col]
        num_total_values = len(values)
        if num_total_values == 0:
            # Nothing to measure; also avoids dividing by zero below.
            continue
        num_unique_values = len(values.unique())
        if num_unique_values / num_total_values < 0.5:
            optimized_gl[col] = values.astype('category')

    return optimized_gl

In [7]:
# Check the missing-value status of a DataFrame
def na_check(df_data):
    """Summarise which columns of ``df_data`` contain missing values.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, with a single
        ``'Missing Ratio'`` column (nulls divided by row count), ordered
        from the highest ratio to the lowest.
    """
    null_share = df_data.isnull().sum() / len(df_data)
    # Columns without any null are dropped from the report.
    complete_cols = null_share[null_share == 0].index
    ranked = null_share.drop(complete_cols).sort_values(ascending=False)
    return pd.DataFrame({'Missing Ratio': ranked})

In [8]:
def groupencoder(df, cols):
    """Encode the absolute values of ``cols`` as quartile groups 1-4.

    The whole frame is first replaced by ``abs(df)``. For each column in
    ``cols`` the 25th, 50th and 75th percentiles (ignoring NaN) are computed
    and every non-null value is replaced by:

    * 4 if it is above the 75th percentile,
    * 3 if it is above the 50th percentile,
    * 2 if it is above the 25th percentile,
    * 1 otherwise.

    Finally all remaining nulls in the frame are filled with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; ``abs`` is applied to all of it. It is not modified.
    cols : iterable of column labels
        Columns to encode.

    Returns
    -------
    pandas.DataFrame
        New frame with encoded columns and nulls replaced by 0.
    """
    df = abs(df)
    for col in cols:
        values = df[col].to_numpy()
        per_25, per_50, per_75 = np.nanpercentile(values, [25, 50, 75])

        # First matching condition wins, mirroring the nested comparisons.
        codes = np.select(
            [values > per_75, values > per_50, values > per_25],
            [4, 3, 2],
            default=1,
        )

        # Write by position (boolean array) rather than by index label so
        # that rows sharing a label in a non-unique index are not overwritten
        # by each other.
        present = pd.notnull(df[col]).to_numpy()
        if present.any():
            df.loc[present, col] = codes[present]

    new_df = df.fillna(0)
    return new_df

In [9]:
def is_na_or_not(x):
    """Report whether ``x`` is missing, as decided by ``pd.isnull``.

    The result of ``pd.isnull(x)`` is returned unchanged.
    """
    missing = pd.isnull(x)
    return missing

In [10]:
# Convert a two-digit year to four digits
def year_2d_to_4d(x, pivot=14):
    """Expand a two-digit year into a four-digit year string.

    Parameters
    ----------
    x : str or number
        Two-digit year such as ``'97'`` or ``7``.
    pivot : int, default 14
        Years greater than ``pivot`` are placed in the 1900s, the others
        in the 2000s.

    Returns
    -------
    str or the original ``x``
        The four-digit year as a string. ``x`` is returned unchanged when it
        cannot be converted with ``int()`` (e.g. ``None``, NaN, text) or when
        it lies outside 0-99 and therefore is not a two-digit year.
    """
    try:
        year = int(x)
    except (TypeError, ValueError, OverflowError):
        return x

    if not 0 <= year <= 99:
        return x

    century = '19' if year > pivot else '20'  # 19xx versus 20xx
    # Zero-pad so that single-digit input such as '5' becomes '2005'.
    return century + str(year).zfill(2)

In [11]:
def money_clean(org_money):
    """Strip currency formatting from a money string.

    Surrounding whitespace is removed, everything from the first ``'.'``
    onward is discarded, leading/trailing ``'$'`` characters are stripped
    and all commas are removed. The result is still a string.
    """
    whole_part, _, _ = org_money.strip().partition('.')
    return whole_part.strip('$').replace(',', '')

In [12]:
def num_visualization(df, num_features, exclude_outlier=0):
    """Plot and save a 15-bin histogram for each listed column.

    Relies on ``plt`` and ``sns`` being available in the notebook namespace
    (presumably ``matplotlib.pyplot`` and ``seaborn`` - they are not imported
    in the visible import cell; confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    num_features : sequence of column labels
        Columns to plot, one figure each.
    exclude_outlier : number, default 0
        When non-zero, values above the ``100 - exclude_outlier`` percentile
        are dropped before plotting (only the upper tail is trimmed).

    Side effects
    ------------
    Writes ``histogram_<column>.jpeg`` to the working directory and shows
    each figure. Columns with no non-null value are reported and skipped.
    """
    # Visualise the numeric columns.
    for feature in num_features:
        vis = df[feature]
        vis = vis[~pd.isnull(vis)]

        # Decide on skipping before a figure exists and before taking a
        # percentile, so that empty columns neither leak blank figures nor
        # hit np.percentile with no data.
        if len(vis) == 0:
            print('{} is skipped!'.format(feature))
            continue

        if exclude_outlier != 0:
            upb = np.percentile(vis, 100 - exclude_outlier)
            vis = vis[vis <= upb]

        # Apply the theme before the axes are created so the first figure is
        # styled like the following ones.
        sns.set_theme(style="darkgrid")
        fig, ax = plt.subplots(figsize=(10, 10))
        ax.set_title(feature, fontsize=20)
        sns.histplot(data=vis, log_scale=False, bins=15, ax=ax)
        fig.savefig('histogram_{}.jpeg'.format(feature))
        plt.show()
        plt.close(fig)  # release the figure when many columns are plotted

In [3]:
def num_visualization_with_hue(df, num_features, hue_col, exclude_outlier=0,
                               two_sided_cols=('Appv_Dusburse_delta',)):
    """Plot and save a KDE of each listed column, split by ``hue_col``.

    Relies on ``plt`` and ``sns`` being available in the notebook namespace
    (presumably ``matplotlib.pyplot`` and ``seaborn`` - confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    num_features : sequence of column labels
        Columns to plot; ``hue_col`` itself is skipped.
    hue_col : column label
        Column used to colour the densities.
    exclude_outlier : number, default 0
        When non-zero, values above the ``100 - exclude_outlier`` percentile
        of the plotted column are dropped.
    two_sided_cols : collection of column labels
        Columns for which values below the ``exclude_outlier`` percentile
        are dropped as well.

    Side effects
    ------------
    Writes ``<column>_hue_<hue_col>.jpeg`` and shows each figure. Columns
    with no non-null value are skipped silently.
    """
    # Visualise the numeric columns.
    for feature in num_features:
        if feature == hue_col:
            continue

        vis = df[[feature, hue_col]]
        vis = vis[~pd.isnull(vis[feature])]

        # Skip before creating a figure or computing percentiles.
        if len(vis) == 0:
            continue

        if exclude_outlier != 0:
            # Percentiles are taken on the plotted column only; including the
            # hue column would mix unrelated values into the bounds.
            values = vis[feature]
            upb = np.percentile(values, 100 - exclude_outlier)
            keep = values <= upb
            if feature in two_sided_cols:
                lwb = np.percentile(values, exclude_outlier)
                keep = keep & (values >= lwb)
            vis = vis[keep]

        sns.set_theme(style="darkgrid")
        sns.set_palette(sns.color_palette("Set2"))
        fig, ax = plt.subplots(figsize=(10, 10))
        # fontsize belongs to the title call, not to str.format.
        ax.set_title('{}_hue_{}'.format(feature, hue_col), fontsize=20)

        sns.kdeplot(data=vis, x=feature, log_scale=False, alpha=0.25, hue=hue_col,
                    palette=sns.color_palette("Set2")[:vis[hue_col].nunique()], ax=ax)

        fig.savefig('{}_hue_{}.jpeg'.format(feature, hue_col))
        plt.show()
        plt.close(fig)

In [1]:
def obj_visualization(df, obj_features, time_cols=('ApprovalDate_year', 'ApprovalDate_month', 'DisbursementDate_year', 'DisbursementDate_month')):
    """Plot and save a count plot for each listed categorical column.

    Relies on ``plt`` and ``sns`` being available in the notebook namespace
    (presumably ``matplotlib.pyplot`` and ``seaborn`` - confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified.
    obj_features : sequence of str
        Columns to plot. ``'Industry'`` is drawn horizontally; columns whose
        name contains ``'State'`` and ``'Industry'`` are ordered by
        descending frequency.
    time_cols : collection of str
        Columns cast to ``int64`` before plotting and drawn on a wider
        figure.

    Side effects
    ------------
    Writes ``barplot_<column>.jpeg`` and shows each figure.
    """
    for feature in obj_features:
        # Filter first and copy only the surviving rows; the explicit copy
        # makes the astype assignment below safe.
        df_vis = df[~pd.isnull(df[feature])].copy()
        order = list(df.groupby(by=feature).size().sort_values(ascending=False).index)

        is_time_col = feature in time_cols
        if is_time_col:
            df_vis[feature] = df_vis[feature].astype('int64')

        if df_vis[feature].nunique() < 8:
            figsize = (10, 10)
        elif feature == 'Industry':
            figsize = (15, 15)
        elif is_time_col:
            figsize = (23, 10)
        else:
            figsize = (20, 10)

        # Theme first, so the very first figure is styled like the rest.
        sns.set_theme(style="darkgrid")
        fig, ax = plt.subplots(figsize=figsize)
        ax.set_title('barplot_{}'.format(feature), fontsize=20)

        if feature == 'Industry':
            sns.countplot(y=feature, data=df_vis, edgecolor='w', order=order, ax=ax)
        elif 'State' in feature:  # the State family of columns
            sns.countplot(x=feature, data=df_vis, edgecolor='w', order=order, ax=ax)
        else:  # small category
            sns.countplot(x=feature, data=df_vis, edgecolor='w', ax=ax)

        fig.savefig('barplot_{}.jpeg'.format(feature))
        plt.show()
        plt.close(fig)

In [16]:
def obj_visualization_with_hue(df, obj_features, hue_col,
                               time_cols=('ApprovalDate_year', 'ApprovalDate_month', 'DisbursementDate_year', 'DisbursementDate_month')):
    """Plot and save a count plot per categorical column, split by ``hue_col``.

    Relies on ``plt`` and ``sns`` being available in the notebook namespace
    (presumably ``matplotlib.pyplot`` and ``seaborn`` - confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified.
    obj_features : sequence of str
        Columns to plot. ``hue_col`` and every column in ``time_cols`` are
        skipped. ``'Industry'`` is drawn horizontally.
    hue_col : str
        Column used to split the bars.
    time_cols : collection of str
        Columns excluded from this plot.

    Side effects
    ------------
    Writes ``<column>_hue_<hue_col>.jpeg`` and shows each figure.
    """
    for feature in obj_features:
        if feature == hue_col or feature in time_cols:
            continue

        # Only the rows with a value for this column are plotted.
        df_vis = df[~pd.isnull(df[feature])]

        if df_vis[feature].nunique() < 8:
            figsize = (10, 10)
        elif feature == 'Industry':
            figsize = (15, 15)
        else:
            figsize = (20, 10)

        # Theme first, so the very first figure is styled like the rest.
        sns.set_theme(style="darkgrid")
        fig, ax = plt.subplots(figsize=figsize)
        ax.set_title('{}_hue_{}'.format(feature, hue_col), fontsize=20)

        if feature != 'Industry':
            sns.countplot(x=feature, hue=hue_col, data=df_vis, edgecolor='w', ax=ax)
        else:
            sns.countplot(y=feature, hue=hue_col, data=df_vis, edgecolor='w', ax=ax)

        ax.legend(title=hue_col, fontsize=16, title_fontsize=20)
        fig.savefig('{}_hue_{}.jpeg'.format(feature, hue_col))
        plt.show()
        plt.close(fig)

In [17]:
def year_col_visualization(df, num_features, year_col, min_group_size=10000,
                           highlight_year=2006.0):
    """Plot and save the per-year mean of each listed column.

    Relies on ``plt`` and ``sns`` being available in the notebook namespace
    (presumably ``matplotlib.pyplot`` and ``seaborn`` - confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    num_features : sequence of column labels
        Columns whose group means are plotted, one figure each.
    year_col : column label
        Grouping column; its group labels must be convertible with ``int()``.
    min_group_size : int, default 10000
        Groups with fewer rows are left out of the plot.
    highlight_year : number, default 2006.0
        x position of the red vertical reference line.

    Side effects
    ------------
    Writes ``Mean_<column>_<year_col>.jpeg`` and shows each figure. Nothing
    is plotted when no group reaches ``min_group_size``.
    """
    if len(num_features) == 0:
        return

    grouped = df.groupby(by=year_col)
    # Average only the requested columns: a mean over every column fails as
    # soon as the frame holds a non-numeric column.
    group_means = grouped[list(num_features)].mean()
    # The size filter does not depend on the plotted column; compute it once.
    large_enough = grouped.size().values >= min_group_size

    vis = group_means[large_enough].copy()
    if len(vis) == 0:
        print('{} is skipped!'.format(year_col))
        return
    vis.index = [int(label) for label in vis.index]

    for num in num_features:
        sns.set_theme(style="darkgrid")
        fig, ax = plt.subplots(figsize=(22, 10))
        ax.set_title('Mean {} by {}'.format(num, year_col), fontsize=20)
        ax.grid(True)
        ax.vlines(x=[highlight_year], ymin=vis[num].min(), ymax=vis[num].max(), color='r')
        # NOTE(review): marker=True is forwarded as-is; if point markers were
        # intended, an explicit marker such as 'o' may be what is wanted - confirm.
        sns.lineplot(x=vis.index, y=vis[num], marker=True, ax=ax)
        ax.set_xticks(list(vis.index))
        fig.savefig('Mean_{}_{}.jpeg'.format(num, year_col))
        plt.show()
        plt.close(fig)

In [18]:
def month_col_visualization(df, num_features, mon_col, min_group_size=30):
    """Plot and save the per-month mean of each listed column.

    Relies on ``plt`` and ``sns`` being available in the notebook namespace
    (presumably ``matplotlib.pyplot`` and ``seaborn`` - confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    num_features : sequence of column labels
        Columns whose group means are plotted, one figure each.
    mon_col : column label
        Grouping column; its group labels must be convertible with ``int()``.
    min_group_size : int, default 30
        Groups with fewer rows are left out of the plot.

    Side effects
    ------------
    Writes ``Mean_<column>_<mon_col>.jpeg`` and shows each figure. Nothing
    is plotted when no group reaches ``min_group_size``.
    """
    if len(num_features) == 0:
        return

    # Group the frame that was passed in, not a notebook-level global.
    grouped = df.groupby(by=mon_col)
    # Average only the requested columns: a mean over every column fails as
    # soon as the frame holds a non-numeric column.
    group_means = grouped[list(num_features)].mean()
    large_enough = grouped.size().values >= min_group_size

    vis = group_means[large_enough].copy()
    if len(vis) == 0:
        print('{} is skipped!'.format(mon_col))
        return
    vis.index = [int(label) for label in vis.index]

    for num in num_features:
        sns.set_theme(style="darkgrid")
        fig, ax = plt.subplots(figsize=(22, 10))
        ax.set_title('Mean {} by {}'.format(num, mon_col), fontsize=20)
        ax.grid(True)
        # NOTE(review): marker=True is forwarded as-is; if point markers were
        # intended, an explicit marker such as 'o' may be what is wanted - confirm.
        sns.lineplot(x=vis.index, y=vis[num], marker=True, ax=ax)
        fig.savefig('Mean_{}_{}.jpeg'.format(num, mon_col))
        plt.show()
        plt.close(fig)

In [4]:
def size_hue_disbursementyear(df, hue_cols, year_col, min_year=1972,
                              highlight_year=2006.0):
    """Plot and save yearly row counts, one line per value of each hue column.

    Relies on ``plt`` and ``sns`` being available in the notebook namespace
    (presumably ``matplotlib.pyplot`` and ``seaborn`` - confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    hue_cols : sequence of column labels
        Columns to split by; columns with more than 30 distinct values are
        skipped.
    year_col : column label
        Numeric year column used for grouping and as the x axis.
    min_year : number, default 1972
        Earlier years are left out of the plot.
    highlight_year : number, default 2006.0
        x position of the dashed red reference line.

    Side effects
    ------------
    Writes ``Size by <year_col>_<hue>.jpeg`` and shows each figure. A hue
    column with no rows left after the year filter is reported and skipped.
    """
    for hue in hue_cols:
        if df[hue].nunique() > 30:
            continue

        # Count rows of the frame that was passed in, not a notebook global.
        data = df.groupby(by=[year_col, hue]).size().reset_index(name='count')
        data = data[data[year_col] >= min_year].copy()
        if len(data) == 0:
            print('{} is skipped!'.format(hue))
            continue
        data[year_col] = data[year_col].astype('int64')

        sns.set_theme(style="darkgrid")
        fig, ax = plt.subplots(figsize=(20, 10))
        ax.set_title('Size by {}_{}'.format(year_col, hue))
        ax.grid(True)
        ax.vlines(x=[highlight_year], ymin=data['count'].min(), ymax=data['count'].max(),
                  color='r', linestyles='dashed')
        # NOTE(review): marker=True is forwarded as-is; if point markers were
        # intended, an explicit marker such as 'o' may be what is wanted - confirm.
        sns.lineplot(data=data, x=year_col, y='count', marker=True, hue=hue,
                     palette="flare", ax=ax)
        fig.savefig('Size by {}_{}.jpeg'.format(year_col, hue))
        plt.show()
        plt.close(fig)

In [1]:
def mean_default_hue_disbursementyear(df, hue_cols, year_col, min_year=1988,
                                      highlight_year=2006.0):
    """Plot and save the yearly mean of ``'Default'`` per value of each hue column.

    Relies on ``plt`` and ``sns`` being available in the notebook namespace
    (presumably ``matplotlib.pyplot`` and ``seaborn`` - confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain a ``'Default'`` column.
    hue_cols : sequence of column labels
        Columns to split by; columns with more than 30 distinct values are
        skipped.
    year_col : column label
        Numeric year column used for grouping and as the x axis.
    min_year : number, default 1988
        Earlier years are left out of the plot.
    highlight_year : number, default 2006.0
        x position of the dashed red reference line.

    Side effects
    ------------
    Writes ``Mean Default by DisbursementDate_year_<hue>.jpeg`` and shows
    each figure. A hue column with no rows left after the year filter is
    reported and skipped.
    """
    num = 'Default'

    for hue in hue_cols:
        if hue == num or df[hue].nunique() > 30:
            # Grouping by the target itself leaves nothing to average.
            continue

        # Average only the target column instead of every column followed by
        # a melt; this also works when the frame has non-numeric columns.
        data_def = df.groupby(by=[year_col, hue])[num].mean().reset_index(name='value')
        data_def = data_def[data_def[year_col] >= min_year].copy()
        if len(data_def) == 0:
            print('{} is skipped!'.format(hue))
            continue
        data_def[year_col] = data_def[year_col].astype('int64')

        sns.set_theme(style="darkgrid")
        fig, ax = plt.subplots(figsize=(20, 10))
        # The title is display text, so it carries no file extension.
        ax.set_title('Mean {} by DisbursementDate_year_{}'.format(num, hue))
        ax.grid(True)
        ax.vlines(x=[highlight_year], ymin=data_def['value'].min(), ymax=data_def['value'].max(),
                  color='r', linestyles='dashed')
        # NOTE(review): marker=True is forwarded as-is; if point markers were
        # intended, an explicit marker such as 'o' may be what is wanted - confirm.
        sns.lineplot(data=data_def, x=year_col, y='value', marker=True, hue=hue,
                     palette="flare", ax=ax)
        ax.legend(title=hue, fontsize=20, title_fontsize=20)
        fig.savefig('Mean {} by DisbursementDate_year_{}.jpeg'.format(num, hue))
        plt.show()
        plt.close(fig)

In [2]:
def scatter_plot_with_hue(df, x_axis, y_axis, hue_col, exclude_outlier=0,
                          n_samples=2000):
    """Plot and save a scatter plot of a random sample, coloured by ``hue_col``.

    Relies on ``plt`` and ``sns`` being available in the notebook namespace
    (presumably ``matplotlib.pyplot`` and ``seaborn`` - confirm).

    Rows with a null in ``x_axis``, ``y_axis`` or ``hue_col`` and rows whose
    ``hue_col`` equals -1 are dropped. Then values above the
    ``100 - exclude_outlier`` percentile are removed, first on ``x_axis`` and
    afterwards on ``y_axis`` of the remaining rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified.
    x_axis, y_axis, hue_col : column labels
        Columns for the two axes and the colouring.
    exclude_outlier : number, default 0
        Upper-tail percentage to trim on each axis.
    n_samples : int, default 2000
        Maximum number of rows drawn (``random_state=0``); all rows are used
        when fewer remain.

    Side effects
    ------------
    Writes ``scatter plot_<x>_<y>_hue<hue>.jpeg`` and shows the figure.
    Nothing is plotted when no row survives the filtering.
    """
    title = 'scatter plot_{}_{}_hue{}'.format(x_axis, y_axis, hue_col)

    df_vis = df.dropna(subset=[x_axis, y_axis, hue_col])
    df_vis = df_vis[df_vis[hue_col] != -1]
    if len(df_vis) == 0:
        # A percentile or a sample of an empty frame would raise.
        print('{} is skipped!'.format(title))
        return

    upb1 = np.percentile(df_vis[x_axis], 100 - exclude_outlier)
    df_vis = df_vis[df_vis[x_axis] <= upb1]
    upb2 = np.percentile(df_vis[y_axis], 100 - exclude_outlier)
    df_vis = df_vis[df_vis[y_axis] <= upb2]

    # sample() raises when asked for more rows than exist, so cap the size.
    df_samp = df_vis.sample(n=min(n_samples, len(df_vis)), random_state=0)

    sns.set_theme(style="darkgrid")
    sns.set_palette(sns.color_palette("Set2"))
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.set_title(title)
    sns.scatterplot(data=df_samp, x=x_axis, y=y_axis, hue=hue_col, ax=ax)
    ax.legend(loc='upper right', title=hue_col)
    fig.savefig(title + '.jpeg')
    plt.show()
    plt.close(fig)