In [None]:
import os
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels import robust

In [None]:
# Load the used-car price data set from the shortened URL into a DataFrame.
df = pd.read_csv(filepath_or_buffer='https://bit.ly/UsedCarsPrice')

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Print a per-column summary: non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum(axis=0)

In [None]:
# Change column types: store the two-valued MetColor / Automatic columns as
# strings so that they are summarised as categorical (object) columns later on.
# (The stray `df.columns` expression was dropped: a cell only displays the
# value of its last expression, so it produced no output.)
cols = ['MetColor', 'Automatic']
df[cols] = df[cols].astype(str)
df.dtypes

## 평균

In [None]:
# Arithmetic mean of Price (missing values are skipped, which is the default).
df['Price'].mean(skipna=True)
# np.float64(9690.232941176471)

In [None]:
# NaN entries are ignored by default, so the mean is (1 + 2) / 2.
nums = pd.Series(data=[1, np.nan, 2])
nums.mean(skipna=True)
# np.float64(1.5)

## 절사평균

In [None]:
# Trimmed mean: slice off 10% of the observations from each tail, then average.
stats.trim_mean(df['Price'], proportiontocut=0.1, axis=0)
# np.float64(9584.380019588638)

## 중위수

In [None]:
# Median of Price (missing values are skipped, which is the default).
df['Price'].median(skipna=True)
# np.float64(9450.0)

## 최빈값

In [None]:
# Most frequent fuel type; mode() returns a Series, here with a single entry.
df['FuelType'].mode(dropna=True)
# 0    Petrol
# Name: FuelType, dtype: object

In [None]:
# Absolute frequency of each fuel type, most common first.
df['FuelType'].value_counts(normalize=False)
# FuelType
# Petrol    1129
# Diesel     129
# CNG         17
# Name: count, dtype: int64

In [None]:
# Relative frequency (share of rows) of each fuel type.
(
    df['FuelType']
    .value_counts(normalize=True)
)
# FuelType
# Petrol    0.885490
# Diesel    0.101176
# CNG       0.013333
# Name: proportion, dtype: float64

## 분위수

In [None]:
# Quartiles: five evenly spaced probabilities from 0 to 1.
df['Price'].quantile(q=np.linspace(start=0, stop=1, num=5))
# 0.00     4350.0
# 0.25     8250.0
# 0.50     9450.0
# 0.75    10950.0
# 1.00    15950.0
# Name: Price, dtype: float64

In [None]:
# Deciles: eleven evenly spaced probabilities from 0 to 1.
df['Price'].quantile(q=np.linspace(start=0, stop=1, num=11))
# 0.0     4350.0
# 0.1     7250.0
# 0.2     7950.0
# 0.3     8500.0
# 0.4     8950.0
# 0.5     9450.0
# 0.6     9950.0
# 0.7    10500.0
# 0.8    11456.0
# 0.9    12500.0
# 1.0    15950.0
# Name: Price, dtype: float64

## 최솟값과 최댓값

In [None]:
# Minimum and maximum of Price, returned together as a tuple. A notebook cell
# only displays the value of its last expression, so two separate statements
# would silently hide the minimum.
df['Price'].min(), df['Price'].max()
# (np.int64(4350), np.int64(15950))

In [None]:
# The 0th and 100th percentiles are the minimum and maximum (returned as floats).
df['Price'].quantile(q=[0, 1])
# 0.0     4350.0
# 1.0    15950.0
# Name: Price, dtype: float64

In [None]:
# Compute several aggregates at once; the result is a Series indexed by name.
df['Price'].agg(['min', 'max'])
# min     4350
# max    15950
# Name: Price, dtype: int64

## 범위와 사분범위

In [None]:
# Range = maximum - minimum.
(
    df['Price'].max()
    - df['Price'].min()
)
# np.int64(11600)

In [None]:
# Same range via quantiles: diff() yields [NaN, max - min]; take the last entry.
(
    df['Price']
    .quantile([0, 1])
    .diff()
    .iloc[-1]
)
# np.float64(11600.0)

In [None]:
# Interquartile range: Q3 - Q1, taken as the last entry of the differenced quantiles.
(
    df['Price']
    .quantile([0.25, 0.75])
    .diff()
    .iloc[-1]
)
# np.float64(2700.0)

## 분산

- ddof는 Delta Degrees of Freedom(자유도 차감값)의 약자이며, 편차 제곱합을 n - ddof로 나눔. 판다스의 기본값은 ddof=1(n-1로 나누는 표본분산)이고, ddof=0을 지정하면 n으로 나눔(모분산)

In [None]:
# Sample variance (pandas default ddof=1, divisor n - 1) and population
# variance (ddof=0, divisor n), returned together as a tuple. A notebook cell
# only displays the value of its last expression, so two separate statements
# would silently hide the first result.
df['Price'].var(ddof=1), df['Price'].var(ddof=0)
# (np.float64(4120265.326386555), np.float64(4117033.7457384085))

In [None]:
# NumPy's default is ddof=0 (population variance); spelled out here for clarity.
np.var(df['Price'], ddof=0)
# np.float64(4117033.7457384085)

## 표준편차

In [None]:
# Sample standard deviation (ddof=1 is the pandas default).
df['Price'].std(ddof=1)
# np.float64(2029.8436704304484)

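## 변동계수

- 변동계수(CV)는 표준편차를 평균으로 나눈 값으로, 단위가 다른 변수의 산포를 비교할 때 사용함. 예: `df['Price'].std() / df['Price'].mean()`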

## 중위수절대편차

In [None]:
# Median absolute deviation of Price.
# NOTE: per the statsmodels documentation, mad() divides the raw median
# absolute deviation by a normalisation constant (default c ~= 0.6745) so that
# it estimates the standard deviation under normality; the value below is
# therefore larger than the raw MAD.
robust.mad(df['Price'])
# np.float64(2223.903327758403)

## 기술통계량 확인

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals.
(
    df
    .describe()
    .round(decimals=2)
)

In [None]:
# Inspect the five rows with the smallest KM values.
df.sort_values(by='KM').head(n=5)

In [None]:
# Keep only the rows whose KM value is greater than 1, then check the new shape.
# NOTE(review): presumably KM <= 1 is treated as an implausible reading - confirm.
df = df[df['KM'] > 1]
df.shape

In [None]:
# Summary statistics of the categorical (object) columns: number of elements,
# number of unique values, the most frequent value and its frequency.
df.describe(include=object)
#   FuelType	MetColor	Automatic
# count	1273	1273	1273
# unique	3	2	2
# top	Petrol	1	0
# freq	1128	842	1203

## 공분산

In [None]:
# Sample covariance between Age and Price.
df['Age'].cov(other=df['Price'])
# np.float64(-22136.617857831003)

In [None]:
# Covariance matrix of the numeric columns, rounded to two decimals.
(
    df
    .cov(numeric_only=True)
    .round(decimals=2)
)
#           Price	Age	KM	HP	CC	Doors	Weight
# Price	4117054.19	-22136.62	-3.747196e+07	5977.21	17623.05	325.35	15746.20
# Age	-22136.62	187.56	1.711083e+05	-8.46	-210.17	-1.22	-102.32
# KM	-37471955.30	171108.28	1.285867e+09	-156932.02	2588046.91	467.42	359285.14
# HP	5977.21	-8.46	-1.569320e+05	172.01	-53.44	1.58	-41.74
# CC	17623.05	-210.17	2.588047e+06	-53.44	34010.01	23.36	5010.95
# Doors	325.35	-1.22	4.674200e+02	1.58	23.36	0.90	13.10
# Weight	15746.20	-102.32	3.592851e+05	-41.74	5010.95	13.10	1553.06

## 상관계수

In [None]:
# Pearson correlation coefficient between Age and Price.
df['Age'].corr(other=df['Price'], method='pearson')
# np.float64(-0.7966182791038179)

In [None]:
# Pearson correlation matrix of the numeric columns.
df.corr(method='pearson', numeric_only=True)
#           Price	      Age	      KM	     HP	         CC	        Doors	  Weight
# Price	1.000000	-0.796618	-0.515009	0.224610	0.047096	0.168562	0.196920
# Age	-0.796618	1.000000	0.348422	-0.047075	-0.083213	-0.093670	-0.189588
# KM	-0.515009	0.348422	1.000000	-0.333686	0.391355	0.013703	0.254242
# HP	0.224610	-0.047075	-0.333686	1.000000	-0.022095	0.126925	-0.080759
# CC	0.047096	-0.083213	0.391355	-0.022095	1.000000	0.133181	0.689482
# Doors	0.168562	-0.093670	0.013703	0.126925	0.133181	1.000000	0.349477
# Weight	0.196920	-0.189588	0.254242	-0.080759	0.689482	0.349477	1.000000

In [None]:
# Show the current working directory before changing it.
os.getcwd()

In [None]:
# Switch the working directory to the sibling `data` folder; relative file
# names used after this point resolve against it.
# NOTE(review): this changes process-wide state and raises if ../data does not
# exist - consider passing explicit paths to the writers instead.
os.chdir('../data')

In [None]:
# List the entries of the current working directory in sorted order.
sorted(os.listdir('.'))

In [None]:
# Write the DataFrame to an Excel workbook without the index column.
# (pandas needs an Excel writer engine such as openpyxl to be installed.)
df.to_excel(excel_writer='Used_Cars.xlsx', index=False)

In [None]:
# Write the DataFrame to a CSV file without the index column.
df.to_csv(path_or_buf='Used_Cars.csv', index=False)

In [None]:
# Serialise the DataFrame, dtypes included, to a pickle file.
# Only load pickle files that come from a trusted source.
df.to_pickle(path='Used_Cars.pkl')