In [1]:
import pandas as pd
import numpy as np

In [2]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as an 'N.NN MB' string.

    Args:
        pandas_obj: A ``pd.DataFrame`` or any other object exposing
            ``memory_usage(deep=True)`` (e.g. a ``pd.Series``).

    Returns:
        The size in mebibytes (bytes / 1024**2) formatted with two decimals
        and a trailing ``' MB'``.
    """
    total_bytes = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # A DataFrame reports one figure per column (plus the index); add them up.
        total_bytes = total_bytes.sum()

    megabytes = total_bytes / 1024 ** 2
    return '{:03.2f} MB'.format(megabytes)

In [1]:
def different_state(s1, s2):
    """Return 0 when ``s1`` equals ``s2`` and 1 otherwise."""
    return 0 if s1 == s2 else 1

In [3]:
def reducememory(gl):
    """Return a copy of ``gl`` with columns stored in smaller dtypes.

    * ``int64`` columns are downcast with ``pd.to_numeric``: to the smallest
      unsigned type when the column holds no negative value, otherwise to the
      smallest signed type.
    * ``float`` columns are downcast with ``downcast='float'``.
    * ``object`` columns whose unique values make up less than half of the
      rows are converted to ``category``; the others are left as they are.

    Args:
        gl: The DataFrame to optimise. It is not modified.

    Returns:
        A new DataFrame with the same index, columns and values as ``gl``.
    """
    optimized_gl = gl.copy()

    # Optimise integers and floats.
    for col in gl.select_dtypes(include=['int64']).columns:
        column = gl[col]
        # 'unsigned' leaves a column untouched as soon as it contains one
        # negative value, so fall back to a signed downcast in that case.
        downcast = 'integer' if column.min() < 0 else 'unsigned'
        optimized_gl[col] = pd.to_numeric(column, downcast=downcast)

    for col in gl.select_dtypes(include=['float']).columns:
        optimized_gl[col] = pd.to_numeric(gl[col], downcast='float')

    # Optimise object columns.
    # Each column is assigned straight onto the copy: the index always matches
    # (no re-alignment against a scratch frame) and the category dtype is kept.
    for col in gl.select_dtypes(include=['object']).columns:
        column = gl[col]
        num_total_values = len(column)
        if num_total_values == 0:
            # Nothing to measure; also avoids dividing by zero below.
            continue

        num_unique_values = len(column.unique())
        if num_unique_values / num_total_values < 0.5:
            optimized_gl[col] = column.astype('category')

    return optimized_gl

In [4]:
# Report the share of missing values in each DataFrame column
def na_check(df_data):
    """Return the fraction of null entries per column, largest first.

    Columns without any null value are left out of the result.

    Args:
        df_data: The DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with a single ``'Missing Ratio'``
        column, sorted in descending order.
    """
    null_share = df_data.isnull().sum() / len(df_data)
    complete_columns = null_share[null_share == 0].index
    ranked = null_share.drop(complete_columns).sort_values(ascending=False)

    return pd.DataFrame({'Missing Ratio': ranked})

In [5]:
def groupencoder(df, cols):
    """Encode the given columns into quartile groups 1-4 (0 for missing).

    The whole frame is first replaced by its absolute value. For every column
    in ``cols`` the 25th/50th/75th percentiles are computed ignoring NaN, and
    each non-null value ``v`` becomes:

    * 4 if ``v > p75``
    * 3 if ``p50 < v <= p75``
    * 2 if ``p25 < v <= p50``
    * 1 otherwise

    Finally every remaining NaN in the frame (in any column) is replaced by 0.

    Args:
        df: Numeric DataFrame; it is not modified.
        cols: Iterable of column names to encode.

    Returns:
        A new DataFrame with the encoded columns.
    """
    df = abs(df)
    for col in cols:
        values = df[col]
        per_25, per_50, per_75 = np.nanpercentile(values, [25, 50, 75])

        # Vectorised replacement for a row-by-row loop: the first matching
        # condition wins, mirroring the nested comparisons. Working on whole
        # arrays also keeps rows with duplicate index labels independent.
        codes = np.select(
            [values > per_75, values > per_50, values > per_25],
            [4, 3, 2],
            default=1,
        )
        # Missing entries stay NaN here and are turned into 0 by fillna below.
        encoded = np.where(values.notna().to_numpy(), codes, np.nan)
        # Cast back so the column keeps the dtype it had before encoding.
        df[col] = pd.Series(encoded, index=df.index).astype(values.dtype)

    new_df = df.fillna(0)
    return new_df

In [6]:
def is_na_or_not(x):
    """Return pandas' missing-value check for ``x`` (``pd.isna`` / ``pd.isnull``)."""
    return pd.isna(x)

In [7]:
# Convert a two-digit year to a four-digit year
def year_2d_to_4d(x, pivot=14):
    """Expand a two-digit year into a four-digit year string.

    Years greater than ``pivot`` are taken as 19xx, the others as 20xx.

    Args:
        x: The two-digit year, as a string (``'97'``, ``'05'``) or a number.
        pivot: Largest two-digit year that still maps to 20xx. Defaults to 14.

    Returns:
        The four-digit year as a string. ``x`` is returned unchanged when it
        cannot be read as an integer (e.g. ``None``, NaN, free text) or when
        it is outside 0-99 and therefore not a two-digit year.
    """
    try:
        year = int(x)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinities and non-numeric text are passed through.
        return x

    if not 0 <= year <= 99:
        # Not a two-digit year (e.g. already four digits): leave it alone
        # rather than gluing a century in front of it.
        return x

    century = '19' if year > pivot else '20'
    # Zero-pad so that single-digit input such as '5' or 5 yields '2005'.
    return century + '{:02d}'.format(year)

In [8]:
def money_clean(org_money):
    """Strip a currency string down to its whole-number digits.

    Surrounding whitespace is removed, everything from the first ``'.'`` on is
    dropped, leading/trailing ``'$'`` characters are stripped and thousands
    separators (``','``) are removed.

    Args:
        org_money: Amount as text, e.g. ``'$1,234.56 '``.

    Returns:
        The cleaned string, e.g. ``'1234'``.
    """
    whole_part, _, _ = org_money.strip().partition('.')
    without_symbol = whole_part.strip('$')

    return without_symbol.replace(',', '')

In [5]:
def num_visualization(df, num_features):
    """Plot and save a histogram for each numeric column.

    For every feature, zeros and values outside the open interval between the
    2.5th and 97.5th percentiles are discarded. If nothing is left, the feature
    is skipped. Otherwise a histogram is drawn, saved as ``'<feature>.jpeg'``
    in the working directory and shown.

    Args:
        df: DataFrame holding the data.
        num_features: Iterable of numeric column names to plot.
    """
    # Visualise the numeric columns.
    for feature in num_features:
        vis = df[feature]
        # NaN-aware percentiles: a plain percentile returns NaN as soon as the
        # column has a missing value, which would filter out every row.
        upb = np.nanpercentile(vis, 97.5)
        lwb = np.nanpercentile(vis, 2.5)
        vis = vis[(vis != 0) & (vis > lwb) & (vis < upb)]

        if len(vis) == 0:
            continue

        # Create the figure only once we know there is something to draw, so
        # skipped features do not leave empty figures behind.
        plt.figure(figsize=(10, 10))
        plt.title(feature, fontsize=20)

        # Bin count from a Sturges-style rule: 1 + 3.3 * log10(n).
        binnum = math.floor(1 + 3.3 * np.log10(len(vis)))
        print('data contains {} rows and was divided into {} groups.'.format(len(vis), binnum))

        sns.histplot(data=vis, log_scale=False, bins=binnum)
        plt.savefig('{}.jpeg'.format(feature))
        plt.show()

In [6]:
def obj_visualization(df, obj_features, time_cols=('ApprovalDate_year', 'ApprovalDate_month', 'DisbursementDate_year', 'DisbursementDate_month')):
    """Plot and save a bar chart of category counts for each feature.

    For every feature the rows of ``df`` are counted per distinct value and
    values seen fewer than 100 times are dropped. The chart is saved as
    ``'<feature>.jpeg'`` in the working directory and shown.

    Args:
        df: DataFrame holding the data.
        obj_features: Iterable of column names to plot.
        time_cols: Column names whose categories are cast to ``int`` before
            plotting. An immutable tuple is used as the default so it cannot
            be altered between calls; any container supporting ``in`` works.
    """
    for feature in obj_features:
        df_temp = df.groupby(feature).size().to_frame(name='count')
        # Keep only categories with at least 100 rows.
        df_temp = df_temp[df_temp['count'] >= 100]

        # Widen the figure when there are many bars.
        if len(df_temp.index) < 8:
            plt.figure(figsize=(10, 10))
        else:
            plt.figure(figsize=(20, 10))

        plt.title(feature, fontsize=20)

        if feature in time_cols:
            df_temp.index = [int(label) for label in df_temp.index]
        sns.barplot(x=df_temp.index, y=df_temp['count'])
        plt.savefig('{}.jpeg'.format(feature))

        plt.show()

In [2]:
def year_col_visualization(df, num_features, year_col):
    """Plot the per-year mean of each numeric feature as a line chart.

    Rows of ``df`` are grouped by ``year_col``; groups with fewer than 30 rows
    are dropped. For each feature the group means are drawn, a red vertical
    line is added at year 2006, and the chart is saved as
    ``'Mean_<feature>_<year_col>.jpeg'`` in the working directory and shown.

    Args:
        df: DataFrame holding the data.
        num_features: Iterable of numeric column names to plot.
        year_col: Name of the column holding the year; its values must be
            convertible with ``int()``.
    """
    num_features = list(num_features)
    grouped = df.groupby(by=year_col)

    # Average only the requested columns, so unrelated non-numeric columns
    # cannot break the aggregation.
    df_groupby_mean = grouped[num_features].mean()

    # Group sizes come from the ``df`` argument (not from a notebook-level
    # variable) and do not depend on the feature, so compute the mask once.
    enough_rows = grouped.size().values >= 30
    vis = df_groupby_mean[enough_rows]
    vis.index = [int(label) for label in vis.index]

    for num in num_features:
        plt.figure(figsize=(22, 10))
        plt.title('Mean {} by {}'.format(num, year_col), fontsize=20)
        plt.grid(True)
        plt.vlines(x=[2006.0], ymin=vis[num].min(), ymax=vis[num].max(), color='r')
        sns.lineplot(x=vis.index, y=vis[num], marker=True)
        plt.savefig('Mean_{}_{}.jpeg'.format(num, year_col))
        plt.show()

In [3]:
def month_col_visualization(df, num_features, mon_col):
    """Plot the per-month mean of each numeric feature as a line chart.

    Rows of ``df`` are grouped by ``mon_col``; groups with fewer than 30 rows
    are dropped. For each feature the group means are drawn and the chart is
    saved as ``'Mean_<feature>_<mon_col>.jpeg'`` in the working directory and
    shown.

    Args:
        df: DataFrame holding the data.
        num_features: Iterable of numeric column names to plot.
        mon_col: Name of the column holding the month; its values must be
            convertible with ``int()``.
    """
    num_features = list(num_features)
    grouped = df.groupby(by=mon_col)

    # Average only the requested columns, so unrelated non-numeric columns
    # cannot break the aggregation.
    df_groupby_mean = grouped[num_features].mean()

    # Group sizes come from the ``df`` argument (not from a notebook-level
    # variable) and do not depend on the feature, so compute the mask once.
    enough_rows = grouped.size().values >= 30
    vis = df_groupby_mean[enough_rows]
    vis.index = [int(label) for label in vis.index]

    for num in num_features:
        plt.figure(figsize=(22, 10))
        plt.title('Mean {} by {}'.format(num, mon_col), fontsize=20)
        plt.grid(True)
        sns.lineplot(x=vis.index, y=vis[num], marker=True)
        plt.savefig('Mean_{}_{}.jpeg'.format(num, mon_col))
        plt.show()