## 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('mtcars.csv')
df.head()

Unnamed: 0,car,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


## 1. 데이터 type

In [5]:
# 데이터 타입 확인
df.dtypes

car      object
mpg     float64
cyl       int64
disp    float64
hp        int64
drat    float64
wt      float64
qsec    float64
vs        int64
am        int64
gear      int64
carb      int64
dtype: object

In [6]:
# 데이터 타입 변경(1개)
df1 = df.copy()
df1 = df1.astype({'cyl':'object'})
df1.dtypes

car      object
mpg     float64
cyl      object
disp    float64
hp        int64
drat    float64
wt      float64
qsec    float64
vs        int64
am        int64
gear      int64
carb      int64
dtype: object

In [8]:
# 데이터 타입 변경(2개 이상)
df1 = df1.astype({'cyl':'int', 'gear':'object'})
df1.dtypes

car      object
mpg     float64
cyl       int64
disp    float64
hp        int64
drat    float64
wt      float64
qsec    float64
vs        int64
am        int64
gear     object
carb      int64
dtype: object

In [10]:
df1['cyl'].head(3)

0    6
1    6
2    4
Name: cyl, dtype: int64

In [14]:
df1['cyl'].value_counts()

8    14
4    11
6     7
Name: cyl, dtype: int64

## 2. 기초통계량(평균, 중앙값, IQR, 표준편차 등)

### 1) 중심측도를 나타내는 값(평균, 중앙값, 최빈값)

In [15]:
df = pd.read_csv('mtcars.csv')
df.head()

Unnamed: 0,car,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [16]:
df.shape

(32, 12)

In [21]:
#평균값
mpg_mean = df['mpg'].mean()
mpg_mean

20.090625000000003

In [28]:
#중앙값
mpg_median = df['mpg'].median()

In [26]:
#최빈값
mpg_mode = df['cyl'].mode()
mpg_mode

0    8
Name: cyl, dtype: int64

In [25]:
df['cyl'].mode()

0    8
Name: cyl, dtype: int64

### 2) 산포도를 나타내는 값(분산, 표준편차, IQR, 범위(최대-최소) 등)

In [29]:
import seaborn as sns

In [30]:
df = sns.load_dataset('titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [31]:
mean_age = df['age'].mean()
std_age = df['age'].std()

In [32]:
mean_age

29.69911764705882

In [33]:
std_age

14.526497332334042

In [35]:
znorm = (df['age']-mean_age)/std_age
znorm

0     -0.530005
1      0.571430
2     -0.254646
3      0.364911
4      0.364911
         ...   
886   -0.185807
887   -0.736524
888         NaN
889   -0.254646
890    0.158392
Name: age, Length: 891, dtype: float64

In [36]:
cond1 = (znorm>3)
cond2 = (znorm<-3)

In [40]:
len(df[cond1])

2

In [42]:
len(df[cond1|cond2])

2

In [43]:
df = pd.read_csv('mtcars.csv')
df.head()

Unnamed: 0,car,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [44]:
from sklearn.preprocessing import StandardScaler

std = StandardScaler()

In [50]:
std.fit_transform(df[['mpg']])

array([[ 0.15329914],
       [ 0.15329914],
       [ 0.4567366 ],
       [ 0.22072968],
       [-0.23442651],
       [-0.33557233],
       [-0.97616253],
       [ 0.72645879],
       [ 0.4567366 ],
       [-0.15013833],
       [-0.38614524],
       [-0.62215216],
       [-0.47043343],
       [-0.8244438 ],
       [-1.63361037],
       [-1.63361037],
       [-0.90873199],
       [ 2.07506974],
       [ 1.737917  ],
       [ 2.32793429],
       [ 0.23758732],
       [-0.77387089],
       [-0.8244438 ],
       [-1.1447389 ],
       [-0.15013833],
       [ 1.21533026],
       [ 0.99618098],
       [ 1.737917  ],
       [-0.72329798],
       [-0.06585014],
       [-0.85815908],
       [ 0.22072968]])

In [47]:
type(df[['mpg']])

pandas.core.frame.DataFrame

In [48]:
type(df['mpg'])

pandas.core.series.Series

In [51]:
type(df.mpg)

pandas.core.series.Series