In [1]:
import pandas as pd
import numpy as np

In [2]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object, formatted in MB.

    Parameters
    ----------
    pandas_obj : pd.DataFrame or any object exposing ``memory_usage(deep=True)``
        For a DataFrame the per-column figures are summed; for anything else
        the value returned by ``memory_usage(deep=True)`` is used directly.

    Returns
    -------
    str
        The size in mebibytes with two decimals, followed by `` MB``.
    """
    footprint = pandas_obj.memory_usage(deep=True)

    # A DataFrame reports one figure per column, so collapse them to a total.
    if isinstance(pandas_obj, pd.DataFrame):
        footprint = footprint.sum()

    bytes_per_mb = 1024 ** 2
    return '{:03.2f} MB'.format(footprint / bytes_per_mb)

In [3]:
def reducememory(gl):
    """Return a copy of ``gl`` whose columns use smaller dtypes where possible.

    * ``int64`` columns are downcast to the smallest integer type that holds
      their values: unsigned when no value is negative, signed otherwise.
    * ``float`` columns are downcast with ``pd.to_numeric(downcast='float')``.
    * ``object`` columns are converted to ``category`` when fewer than half of
      their values are distinct; other object columns are left unchanged.

    Parameters
    ----------
    gl : pd.DataFrame
        Frame to optimise. It is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``gl`` with the same columns, column order and index.
    """
    optimized_gl = gl.copy()
    num_total_values = len(gl)

    # Nothing can be saved on a zero-row frame, and the unique/total ratio
    # below would be 0/0, so return the plain copy.
    if num_total_values == 0:
        return optimized_gl

    # Optimise integers.
    for col in gl.select_dtypes(include=['int64']).columns:
        values = gl[col]
        # downcast='unsigned' leaves a column untouched as soon as it holds a
        # negative value, so fall back to a signed downcast in that case.
        target = 'integer' if values.min() < 0 else 'unsigned'
        optimized_gl[col] = pd.to_numeric(values, downcast=target)

    # Optimise floats.
    for col in gl.select_dtypes(include=['float']).columns:
        optimized_gl[col] = pd.to_numeric(gl[col], downcast='float')

    # Optimise object columns: low-cardinality columns become categoricals.
    for col in gl.select_dtypes(include=['object']).columns:
        num_unique_values = len(gl[col].unique())
        if num_unique_values / num_total_values < 0.5:
            optimized_gl[col] = gl[col].astype('category')

    return optimized_gl

In [4]:
def na_check(df_data):
    """Summarise the missing values of a DataFrame.

    Parameters
    ----------
    df_data : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df_data`` whose missing percentage is not
        zero, with a single ``'Missing Ratio'`` column holding that
        percentage (0-100), sorted from highest to lowest.
    """
    missing_pct = df_data.isnull().sum() / len(df_data) * 100

    # Labels of the columns that have nothing missing; these are left out.
    fully_populated = missing_pct.index[(missing_pct == 0).to_numpy()]
    ranked = missing_pct.drop(fully_populated).sort_values(ascending=False)

    return pd.DataFrame({'Missing Ratio': ranked})

In [5]:
def groupencoder(df, cols):
    """Encode the given columns into quartile groups 1-4 (0 for missing).

    The whole frame is first replaced by its absolute values. Then, for each
    column in ``cols``, the 25th/50th/75th percentiles of the non-missing
    values are computed and every non-missing value is replaced by:

    * 4 if it is greater than the 75th percentile,
    * 3 if it is greater than the 50th percentile,
    * 2 if it is greater than the 25th percentile,
    * 1 otherwise.

    Finally every remaining NaN in the frame (in any column, not only those
    in ``cols``) is replaced by 0.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; ``abs(df)`` must be valid for it. It is not modified.
    cols : iterable
        Labels of the columns to encode.

    Returns
    -------
    pd.DataFrame
        A new frame with the encoded columns and NaN filled with 0.
    """
    df = abs(df)
    for col in cols:
        values = df[col]
        per_25 = np.nanpercentile(values, 25)
        per_50 = np.nanpercentile(values, 50)
        per_75 = np.nanpercentile(values, 75)

        # Encode the whole column at once. Writing row by row through
        # df.loc[label, col] is slow and gives wrong results when index labels
        # repeat, because each write hits every row sharing that label.
        # Conditions are tested in order, so the first match wins.
        codes = np.select(
            [
                (values > per_75).to_numpy(),
                (values > per_50).to_numpy(),
                (values > per_25).to_numpy(),
            ],
            [4, 3, 2],
            default=1,
        )
        # Keep the column's dtype, as in-place scalar assignment would.
        encoded = pd.Series(codes, index=values.index).astype(values.dtype)

        # Missing values are not encoded here; fillna below turns them into 0.
        df[col] = encoded.where(values.notna(), values)

    new_df = df.fillna(0)
    return new_df