In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from pandas.api.types import is_numeric_dtype

# Apply seaborn's default theme (this changes matplotlib styling globally).
sns.set()



# Load seaborn's "penguins" example dataset (not iris) as a DataFrame.

data = sns.load_dataset('penguins')


class Lookdata():
    
    """
    Small EDA (exploratory data analysis) helper wrapped around a pandas object.

    Available helpers:
    - glimpse: compact one-line-per-column text overview of a DataFrame
    - _dtypes: dtype of every column as a one-column DataFrame
    - _compute_outlier / _remove_outliers: 1.5 * IQR outlier detection/removal

    Roadmap noted by the original author (not implemented in this class):
    - numeric_category_plot
    - add basic charts
    - add a multicollinearity check
    """
    
    def __init__(self,df) :
        # The wrapped object is stored as-is (no copy is made).
        self.df = df    
    
    def _dtypes(self) -> pd.DataFrame:
        """Return the column dtypes as a DataFrame with a single 'dtype' column."""
        return pd.DataFrame(self.df.dtypes).rename(columns={0: 'dtype'})
    
    def glimpse(self, maxvals=10, maxlen=110):
        """
        Print the frame's shape followed by one summary line per column.

        Each line has the form
        ``<name> <dtype> <n_missing> (<pct>%) NAs : <first values, comma separated>``
        with names, dtypes and missing-value counts aligned across lines.

        Parameters
        ----------
        maxvals : int
            Maximum number of leading values shown per column.
        maxlen : int
            Maximum number of characters per printed line; longer lines are cut.
        """
        print('Shape: ', self.df.shape)

        for line in self._glimpse_lines(maxvals, maxlen):
            print(line)

    def _glimpse_lines(self, maxvals=10, maxlen=110) -> list:
        """Build (without printing) the per-column lines shown by ``glimpse``."""
        frame = self.df
        n_rows, n_cols = frame.shape
        # Positional access keeps duplicated column labels working.
        columns = [frame.iloc[:, i] for i in range(n_cols)]

        def pad(cells):
            # default=0 keeps a frame without columns from crashing max().
            width = max((len(cell) for cell in cells), default=0)
            return [cell.ljust(width) for cell in cells]

        # Column name; str() so that non-string labels (ints, tuples, ...) work.
        names = pad([str(label) for label in frame.columns])

        # Column type
        typed = pad([f'{name} {col.dtype}' for name, col in zip(names, columns)])

        # Number of missing values and their share of all rows.
        na_counts = [int(col.isnull().sum()) for col in columns]
        # A frame with zero rows has no meaningful ratio; report 0% instead of
        # dividing by zero.
        na_pcts = [int(round(count * 100 / n_rows)) if n_rows else 0
                   for count in na_counts]
        na_cells = [f'{count} ({pct}%)' for count, pct in zip(na_counts, na_pcts)]
        na_width = max((len(cell) for cell in na_cells), default=0)
        na_cells = [cell.rjust(na_width) for cell in na_cells]

        # First values of each column
        n_shown = min(maxvals, n_rows)
        previews = [', '.join(str(value) for value in col.iloc[:n_shown])
                    for col in columns]

        lines = [f'{head} {nas} NAs : {preview}'
                 for head, nas, preview in zip(typed, na_cells, previews)]

        # Trim to maxlen
        return [line[:maxlen] for line in lines]
        

    def _compute_outlier(self, column=None) -> list:
        """ 
        Compute the outliers of a numeric series with the 1.5 * IQR rule
        (iqr = interquartile range, q3 - q1).

        Parameters
        ----------
        column : label, optional
            Column of the wrapped DataFrame to analyse. When omitted, the
            wrapped object itself is analysed and must be one-dimensional
            (e.g. a Series).

        Returns
        -------
        list
            Values below ``q1 - 1.5 * iqr`` or above ``q3 + 1.5 * iqr``, in
            ascending order. Missing values are ignored; an empty list is
            returned when there is no non-missing data.

        Raises
        ------
        TypeError
            If the selected data is a whole DataFrame or is not numeric.
        """

        target = self.df if column is None else self.df[column]

        # A DataFrame has no single dtype, so it is rejected explicitly rather
        # than relying on how is_numeric_dtype treats it.
        if isinstance(target, pd.DataFrame) or not is_numeric_dtype(target):
            raise TypeError("Must pass a column with numeric data type")

        # NaNs would turn both percentiles into NaN and hide every outlier.
        values = pd.Series(target).dropna().sort_values()
        if values.empty:
            return []

        q1, q3 = np.percentile(values.to_numpy(dtype=float), [25, 75])
        iqr = q3 - q1

        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        is_outlier = (values < lower_bound) | (values > upper_bound)

        # Positional (numpy) mask: immune to duplicated index labels.
        return values.loc[is_outlier.to_numpy()].tolist()


    @staticmethod
    def _remove_outliers(df, column=None):
        """
        Return ``df`` without the entries flagged by ``_compute_outlier``.

        Parameters
        ----------
        df : pd.Series or pd.DataFrame
            Data to filter. A Series is filtered on its own values; a
            DataFrame requires ``column``.
        column : label, optional
            Column of ``df`` whose outliers decide which rows are dropped.

        Returns
        -------
        Same type as ``df``, with the original index labels of the kept rows.
        Rows whose value is missing are kept.

        Raises
        ------
        TypeError
            If the data to analyse is not numeric (see ``_compute_outlier``).
        """
        values = df if column is None else df[column]
        outliers = Lookdata(values)._compute_outlier()
        keep = ~values.isin(outliers)
        return df.loc[keep.to_numpy()]

In [27]:
# Wrap the penguins DataFrame in the EDA helper defined above.
eda_ins = Lookdata(data)

In [29]:
# Build the dtype table: one row per column, single 'dtype' column.
eda_ins_dtypes = eda_ins._dtypes()

In [30]:
# Display the dtype table built in the previous cell.
eda_ins_dtypes

Unnamed: 0,dtype
species,object
island,object
bill_length_mm,float64
bill_depth_mm,float64
flipper_length_mm,float64
body_mass_g,float64
sex,object


In [31]:
# Print the shape, then one line per column: dtype, NA count (%), first values.
eda_ins.glimpse()

Shape:  (344, 7)
species           object   0 (0%) NAs : Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adelie
island            object   0 (0%) NAs : Torgersen, Torgersen, Torgersen, Torgersen, Torgersen, Torgersen, Torg
bill_length_mm    float64  2 (1%) NAs : 39.1, 39.5, 40.3, nan, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0
bill_depth_mm     float64  2 (1%) NAs : 18.7, 17.4, 18.0, nan, 19.3, 20.6, 17.8, 19.6, 18.1, 20.2
flipper_length_mm float64  2 (1%) NAs : 181.0, 186.0, 195.0, nan, 193.0, 190.0, 181.0, 195.0, 193.0, 190.0
body_mass_g       float64  2 (1%) NAs : 3750.0, 3800.0, 3250.0, nan, 3450.0, 3650.0, 3625.0, 4675.0, 3475.0, 4
sex               object  11 (3%) NAs : Male, Female, Female, nan, Female, Male, Female, Male, nan, nan
