In [2]:
import pandas as pd 
import numpy as np 
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
%matplotlib inline  
%precision 3


# Visualization settings
# Dark2 is a qualitative colormap with a fixed number of entries (`Dark2.N`).
# Integer indices past the end are mapped to the colormap's "over" color
# (by default the last entry), so wrap the index to cycle through the palette
# instead of repeating the final color for every index beyond the palette size.
colors = [plt.cm.Dark2(i % plt.cm.Dark2.N) for i in range(20)]
mpl.rcParams.update({'font.size': 18})

In [3]:
# Read the train and test splits (in that order) from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# Preview the first rows of the training split.
train.head()

Unnamed: 0,id,title,odometer,location,isimported,engine,transmission,fuel,paint,year,target
0,0,Toyota RAV 4,18277,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,Red,2016,13665000
1,1,Toyota Land Cruiser,10,Lagos,New,4-cylinder(I4),automatic,petrol,Black,2019,33015000
2,2,Land Rover Range Rover Evoque,83091,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,Red,2012,9915000
3,3,Lexus ES 350,91524,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,Gray,2007,3815000
4,4,Toyota Venza,94177,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,Red,2010,7385000


In [4]:
# Isolate the `target` column of the training frame; the cells below analyze this Series.
target = train['target']
# Display pandas' default summary statistics (count, mean, std, min, quartiles, max).
target.describe()

count    1.015000e+03
mean     8.243204e+06
std      1.239567e+07
min      4.000000e+05
25%      2.535000e+06
50%      4.215000e+06
75%      8.927500e+06
max      1.500150e+08
Name: target, dtype: float64

In [5]:
# Location estimates: compute each one once, then report them and their gap.
target_mean = target.mean()
target_median = target.median()

print('평균값 : ', target_mean)
print('중간값 : ', target_median)
# A negative gap means the mean sits above the median.
print('중간값과 평균값의 차이 : ', target_median - target_mean)

평균값 :  8243204.450246305
중간값 :  4215000.0
중간값과 평균값의 차이 :  -4028204.450246305


In [7]:
from scipy.stats import trim_mean

# Trimmed means for increasing trim proportions (the fraction cut from each tail);
# a proportion of 0.0 is the plain mean.
trim_proportions = (0.0, 0.05, 0.1, 0.15, 0.2, 0.25)
for trim in trim_proportions:
    trimmed_value = trim_mean(target, trim)
    print(f'{trim*100}% 절사평균 : ', trimmed_value)

0.0% 절사평균 :  8243204.450246305
5.0% 절사평균 :  6250996.731147541
10.0% 절사평균 :  5653857.329643296
15.0% 절사평균 :  5252582.29113924
20.0% 절사평균 :  4967098.536945812
25.0% 절사평균 :  4764495.106090373


In [8]:
# Spread estimates: variance and standard deviation of the target.
for label, value in (
    ('분산 : ', target.var()),
    ('표준편차 : ', target.std()),
):
    print(label, value)


분산 :  153652724096224.1
표준편차 :  12395673.603972642


In [10]:
# The median absolute deviation (MAD) can be computed with a function from the statsmodels.api package.
# NOTE: with its default arguments `sm.robust.scale.mad` divides the raw MAD by a
# normal-consistency constant (~0.6745), so the printed value is the normalized MAD,
# not the raw median of |x - median(x)|.
import statsmodels.api as sm

print('중위절대편차 : ',sm.robust.scale.mad(target))

중위절대편차 :  3409985.1025628843


In [11]:
# Interval of one (normalized) MAD on either side of the median.
# Each statistic is computed once and reused for both bounds.
target_median = target.median()
target_mad = sm.robust.scale.mad(target)
(target_median - target_mad, target_median + target_mad)


(805014.897, 7624985.103)

In [12]:
# Extremes of the target and the distance between them.
target_max, target_min = target.max(), target.min()

print('최댓값 : ', target_max)
print('최솟값 : ', target_min)
print('범위 : ', target_max - target_min)

최댓값 :  150015008
최솟값 :  400000
범위 :  149615008


In [13]:
# Quartiles of the target and the interquartile range (Q3 - Q1).
upper_quartile = target.quantile(0.75)
lower_quartile = target.quantile(0.25)

print('상위 25% : ', upper_quartile)
print('하위 25% : ', lower_quartile)
print('사분위수 범위 : ', upper_quartile - lower_quartile)

상위 25% :  8927500.0
하위 25% :  2535000.0
사분위수 범위 :  6392500.0


In [14]:
# Skewness of the target; pandas' `Series.skew()` returns the unbiased (N-1 normalized) estimate.
print('왜도 : ',target.skew())

왜도 :  4.9552768084421


In [15]:
# Kurtosis of the target; pandas' `Series.kurt()` uses Fisher's definition
# (a normal distribution scores 0.0), i.e. this is the excess kurtosis.
print('첨도 : ', target.kurt())


첨도 :  35.55992197898911


In [16]:
# 수치형변수 분석기
from scipy.stats import trim_mean
import statsmodels.api as sm



def numerical_analysis(dataframe, trims=(0.1, 0.15, 0.2, 0.25)):
    """Collect location, spread, range and shape statistics for numeric data.

    Parameters
    ----------
    dataframe : pandas.Series or pandas.DataFrame
        Numeric data to summarize. For a Series, missing values are dropped
        up front so that every statistic is computed on the same
        observations. Any other input is used exactly as given.
    trims : iterable of float, optional
        Proportions cut from each tail for the trimmed means. Each entry
        produces one key of the form ``f'{trim*100}% 절사평균'``. The default
        reproduces the proportions that used to be hard-coded.

    Returns
    -------
    dict
        Statistics keyed by their labels, in insertion order: mean, median,
        the trimmed means, variance, standard deviation, MAD (as returned by
        ``sm.robust.scale.mad`` with its default arguments), mean +/- 1 std,
        median +/- 1 MAD, max, min, range, third and first quartile,
        interquartile range, skewness and kurtosis.

    Notes
    -----
    Relies on ``pd``, ``trim_mean`` (scipy.stats) and ``sm``
    (statsmodels.api) being imported at notebook level.
    """
    # pandas reducers (mean, median, var, ...) skip NaN by default, but
    # scipy's trim_mean and statsmodels' mad do not: a single missing value
    # would turn the MAD into NaN and skew the trimmed means. Dropping NaN once
    # keeps every statistic on the same sample. Series without NaN are
    # unaffected, and non-Series inputs keep their previous behavior.
    data = dataframe.dropna() if isinstance(dataframe, pd.Series) else dataframe

    # Results of the analysis are stored in a dictionary.
    analysis = {}

    # Location
    analysis['평균'] = data.mean()
    analysis['중위값'] = data.median()

    for trim in trims:
        analysis[f'{trim*100}% 절사평균'] = trim_mean(data, trim)

    # Variability
    analysis['분산'] = data.var()
    analysis['표준편차'] = data.std()
    analysis['중위절대편차'] = sm.robust.scale.mad(data)

    # One standard deviation on either side of the mean.
    analysis['-1sigma'] = analysis['평균'] - analysis['표준편차']
    analysis['+1sigma'] = analysis['평균'] + analysis['표준편차']

    # One MAD on either side of the median (the robust counterpart).
    analysis['-1MAD'] = analysis['중위값'] - analysis['중위절대편차']
    analysis['+1MAD'] = analysis['중위값'] + analysis['중위절대편차']

    # Range
    analysis['최댓값'] = data.max()
    analysis['최솟값'] = data.min()
    analysis['범위'] = analysis['최댓값'] - analysis['최솟값']

    analysis['삼분위수'] = data.quantile(0.75)
    analysis['일분위수'] = data.quantile(0.25)
    analysis['사분위수범위'] = analysis['삼분위수'] - analysis['일분위수']

    # Skewness and kurtosis
    analysis['왜도'] = data.skew()
    analysis['첨도'] = data.kurt()

    return analysis


In [17]:
# Run the full summary on the target column; as the cell's last expression, the returned dict is displayed.
numerical_analysis(target)


{'평균': 8243204.450,
 '중위값': 4215000.000,
 '10.0% 절사평균': 5653857.330,
 '15.0% 절사평균': 5252582.291,
 '20.0% 절사평균': 4967098.537,
 '25.0% 절사평균': 4764495.106,
 '분산': 153652724096224.094,
 '표준편차': 12395673.604,
 '중위절대편차': 3409985.103,
 '-1sigma': -4152469.154,
 '+1sigma': 20638878.054,
 '-1MAD': 805014.897,
 '+1MAD': 7624985.103,
 '최댓값': 150015008,
 '최솟값': 400000,
 '범위': 149615008,
 '삼분위수': 8927500.000,
 '일분위수': 2535000.000,
 '사분위수범위': 6392500.000,
 '왜도': 4.955,
 '첨도': 35.560}