### Python intro

## 드라이브 마운트

In [17]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be reached through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 마운트 잘 되었는지 확인

In [18]:
import os

# List the practice folder on the mounted Drive; seeing the notebook and the
# CSV files in the result confirms that the mount succeeded.
os.listdir('/content/drive/MyDrive/머신러닝 수업/4주차 실습수업')

['Ch01_01_Python_Pandas_practice.ipynb', 'College.csv', 'iris.csv']

In [19]:
# Silence all warning messages from this point on so that cell output
# stays uncluttered.
import warnings
warnings.simplefilter("ignore")

In [3]:
# Report the version of each library used in this notebook.
# Each import is immediately followed by its report line.
import pandas as pd
print(f"pandas 버전: {pd.__version__}")

import matplotlib
print(f"matplotlib 버전: {matplotlib.__version__}")

import numpy as np
print(f"NumPy 버전: {np.__version__}")

import sklearn
print(f"scikit-learn 버전: {sklearn.__version__}")

pandas 버전: 1.3.5
matplotlib 버전: 3.2.2
NumPy 버전: 1.21.6
scikit-learn 버전: 1.0.2


## Pandas

데이터분석을 위해 사용하는 패키지(Package).  
파이썬을 이용해 엑셀과 같은 역할을 수행한다고 생각하면 편하다.  
판다스는 대용량 데이터를 효율적으로 다룰 수 있기 때문에 빅데이터 분석에 유리하며  
여러가지 복잡한 기능을 구현할 수 있다.

### 1. Load Data

In [20]:
import pandas as pd

# Read the college dataset without naming an index column, so the rows are
# labelled with integers starting at 0.
dataframe_no_index = pd.read_csv("College.csv")
# .shape is (number of rows, number of columns) of the loaded table.
print(f"data의 shape {dataframe_no_index.shape}")
# Display the first 5 rows.
dataframe_no_index.head(5)

data의 shape (777, 19)


Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [21]:
# To use an existing column as the row index, pass index_col to read_csv.
# Here the "Unnamed: 0" column (the college names) becomes the index.
# NOTE: index_col=0 selects the first column by position as the index;
# index_col=False is the value that means "do not use any column as the index".

dataframe = pd.read_csv("College.csv", index_col = "Unnamed: 0")  # rows are now labelled by the "Unnamed: 0" values
dataframe.head(5)                                                 # display the first 5 rows

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [22]:
# Inspect the column labels of dataframe (the row labels are shown in the next cell).
dataframe.columns

Index(['Private', 'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc',
       'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books',
       'Personal', 'PhD', 'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend',
       'Grad.Rate'],
      dtype='object')

In [23]:
# Inspect the row labels of dataframe (the college names taken from "Unnamed: 0").
dataframe.index

Index(['Abilene Christian University', 'Adelphi University', 'Adrian College',
       'Agnes Scott College', 'Alaska Pacific University', 'Albertson College',
       'Albertus Magnus College', 'Albion College', 'Albright College',
       'Alderson-Broaddus College',
       ...
       'Winthrop University', 'Wisconsin Lutheran College',
       'Wittenberg University', 'Wofford College',
       'Worcester Polytechnic Institute', 'Worcester State College',
       'Xavier University', 'Xavier University of Louisiana',
       'Yale University', 'York College of Pennsylvania'],
      dtype='object', length=777)

### 2. Matrix, Row, Column

#### Column Slicing

In [24]:
# Select a single column by name (here the 'Private' column).
dataframe['Private']

Abilene Christian University      Yes
Adelphi University                Yes
Adrian College                    Yes
Agnes Scott College               Yes
Alaska Pacific University         Yes
                                 ... 
Worcester State College            No
Xavier University                 Yes
Xavier University of Louisiana    Yes
Yale University                   Yes
York College of Pennsylvania      Yes
Name: Private, Length: 777, dtype: object

특정 column 여러개 가져오기

In [29]:
# To select several columns at once, either (1) put the column names in a list
# and index with that list, or (2) write the names inside double brackets [[]].
# 1. Collect the column names in a list ("Private", "Apps", "Accept", "Enroll").
#    This cell only builds and displays the list of names.
col_list = ["Private", "Apps" ,"Accept", "Enroll"]
col_list

['Private', 'Apps', 'Accept', 'Enroll']

In [30]:
# 2. Use double brackets [[]]: the inner brackets are a list of column names,
#    the outer brackets index the DataFrame with that list.
df_double = dataframe[["Private", "Apps" ,"Accept", "Enroll"]]
df_double.head()

Unnamed: 0,Private,Apps,Accept,Enroll
Abilene Christian University,Yes,1660,1232,721
Adelphi University,Yes,2186,1924,512
Adrian College,Yes,1428,1097,336
Agnes Scott College,Yes,417,349,137
Alaska Pacific University,Yes,193,146,55


#### Row Slicing

In [31]:
# Use .loc to slice rows by their index labels.
# The row labels themselves are used as the slice bounds.

# Show the rows from 'Abilene Christian University' through 'Agnes Scott College'
# (positions 0 to 3). Unlike ordinary Python slices, a .loc label slice includes
# its end label, so four rows are returned.
dataframe_0_4 = dataframe.loc['Abilene Christian University':'Agnes Scott College']
dataframe_0_4

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59


In [33]:
## Intentional error!
# dataframe is indexed by college name (string labels), not by integers, so
# slicing .loc with integer labels raises a TypeError.
# For position-based slicing on such a frame, use .iloc instead (e.g. dataframe.iloc[0:4]).
dataframe.loc[0:3]

TypeError: ignored

In [34]:
# When the index consists of integers, integer labels work with .loc.
# The end label 3 is included, so this returns the rows labelled 0, 1, 2 and 3.
dataframe_no_index.loc[0:3]

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59


In [35]:
# To pick specific rows, pass a list of labels to .loc (double brackets, as with columns).
# - Show the rows labelled 0, 2 and 4.
dataframe_no_index.loc[[0, 2, 4]]

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


#### Extract Feature

In [39]:
# Boolean indexing
# Keep only the rows whose value in a given column matches a condition.
# - From dataframe, select the rows whose 'Private' column equals 'No', keep the
#   first 5 of them, store the result in dataframe_private and display it.
# NOTE: despite its name, dataframe_private holds the Private == 'No' rows.

dataframe_private = dataframe[dataframe['Private'] == 'No'].head(5)
dataframe_private

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Angelo State University,No,3540,2001,1016,24,54,4190,1512,5130,3592,500,2000,60,62,23.1,5,4010,34
Appalachian State University,No,7313,4664,1910,20,63,9940,1035,6806,2540,96,2000,83,96,18.3,14,5854,70
Arizona State University Main campus,No,12809,10308,3761,24,49,22593,7585,7434,4850,700,2100,88,93,18.9,5,4602,48
Arkansas Tech University,No,1734,1729,951,12,52,3602,939,3460,2650,450,1000,57,60,19.6,5,4739,48
Auburn University-Main Campus,No,7548,6791,3070,25,57,16262,1716,6300,3933,600,1908,85,91,16.7,18,6642,69


In [41]:
# Select the rows where Private is 'No' AND the PhD column is at least 95, and
# store them in dataframe_pri_phd. Each condition is wrapped in parentheses and
# the two boolean masks are combined element-wise with &.
dataframe_pri_phd = dataframe[(dataframe['Private'] == 'No') & (dataframe['PhD'] >= 95)]
dataframe_pri_phd

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
New Mexico Institute of Mining and Tech.,No,787,601,233,40,73,1017,411,5376,3214,600,1100,99,100,13.7,11,9241,34
Texas A&M University at Galveston,No,529,481,243,22,47,1206,134,4860,3122,600,650,103,88,17.4,16,6415,43
The Citadel,No,1500,1242,611,12,36,2024,292,7070,2439,400,779,95,94,17.1,17,7744,84
University of Alabama at Birmingham,No,1797,1260,938,24,35,6960,4698,4440,5175,750,2200,96,96,6.7,16,16352,33
University of California at Irvine,No,15698,10775,2478,85,100,12677,864,12024,5302,790,1818,96,96,16.1,11,15934,66
University of North Dakota,No,2777,2249,1652,20,54,8334,1435,5634,2703,450,1200,97,97,15.9,16,9424,49
University of Washington,No,12749,7025,3343,40,81,20356,4582,8199,4218,708,2172,96,94,9.0,10,16527,65


In [40]:
# Select the "Apps", "Accept", "Enroll" and "Top10perc" columns for the rows
# where Private is 'No'.
# A single .loc[row_mask, column_list] call filters rows and columns in one
# step, so the selection is evaluated once and chained indexing is avoided.
dataframe_pri_col = dataframe.loc[
    dataframe['Private'] == 'No',
    ['Apps', 'Accept', 'Enroll', 'Top10perc'],
]

# shape: (rows, columns) of the selected data
print(dataframe_pri_col.shape)

dataframe_pri_col.head()

(212, 4)


Unnamed: 0,Apps,Accept,Enroll,Top10perc
Angelo State University,3540,2001,1016,24
Appalachian State University,7313,4664,1910,20
Arizona State University Main campus,12809,10308,3761,24
Arkansas Tech University,1734,1729,951,12
Auburn University-Main Campus,7548,6791,3070,25


#### Add and remove columns

In [42]:
# add and remove columns

# Add a column with the form  DataFrame[new_column_name] = value.
# This modifies dataframe in place.
dataframe['master'] = 50

# Show only the master column:
# the single value 50 has been filled into every row of the column.
dataframe[['master']].head()

Unnamed: 0,master
Abilene Christian University,50
Adelphi University,50
Adrian College,50
Agnes Scott College,50
Alaska Pacific University,50


column을 제거하는 방법은 두 가지가 있다. 첫 번째는 남기고 싶은 컬럼만 골라 새 데이터프레임을 만드는 것이고, <br>
두 번째는 drop 메서드로 해당 컬럼을 직접 제거하는 것이다.<br>

다른 데이터를 불러와서 좀 더 빠른 이해를 해보자<br>
(기존에 쓰던 데이터는 column 명이 너무 복잡하다)

In [43]:
# Load the iris dataset (its column names are simpler than the college data)
# and display the first 5 rows.
iris = pd.read_csv('iris.csv')
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [45]:
# 1. Re-select only the columns to keep.
# Put the wanted column names in a list and index the DataFrame with it;
# any column left out of the list (here 'variety') is absent from the result.

print("variety 컬럼 삭제")  # the message reads "variety column removed"
new_col = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
new_iris = iris[new_col]
new_iris.head()

variety 컬럼 삭제


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [46]:
# 2. Use the drop method.
# drop works on rows by default (axis=0), so pass axis=1 to remove a column;
# drop(columns='sepal.length') is an equivalent spelling.
# The result is assigned to a new variable, new_iris1.

new_iris1 = iris.drop('sepal.length', axis = 1)
new_iris1.head()

Unnamed: 0,sepal.width,petal.length,petal.width,variety
0,3.5,1.4,0.2,Setosa
1,3.0,1.4,0.2,Setosa
2,3.2,1.3,0.2,Setosa
3,3.1,1.5,0.2,Setosa
4,3.6,1.4,0.2,Setosa
