# Pandas는 무엇인가요_20221109
- 데이터 분석 및 가공에 사용되는 파이썬 라이브러리

In [7]:
import pandas as pd

pd.__version__

'1.3.5'

In [18]:
data_frame = pd.read_csv('data/friend_list.csv')
data_frame

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


## 데이터 프레임_20221109
- 가로축과 세로축이 있는 엑셀과 유사한 데이터 구조
- 가로축은 row(행), 세로축은 column(열)
- 데이터베이스의 테이블 구조

In [22]:
# Example of a method that a DataFrame provides.
data_frame.head(3) # show the first 3 rows; with no argument head() shows 5 (R's head() shows 6)

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


## 시리즈(Series)_20221109
- 데이터 프레임의 컬럼(열)은 모두 시리즈.
- 단순히 파이썬 리스트를 간직한 오브젝트.
- 리스트를 파라미터로 주면 바로 시리즈가 생성.
- 데이터 가공 및 분석에 쓸 수 있는 기능(메서드)이 파이썬 리스트보다 훨씬 많다.

In [25]:
type(data_frame.job) # a single column is a pandas.core.series.Series (the capital-S Series class)

pandas.core.series.Series

In [27]:
# Example of a Series method: upper-case every value of the job column
# through the .str accessor and store the result back in the same column.
data_frame['job'] = data_frame['job'].str.upper()
data_frame.head()

Unnamed: 0,name,age,job
0,John,20,STUDENT
1,Jenny,30,DEVELOPER
2,Nate,30,TEACHER
3,Julia,40,DENTIST
4,Brian,45,MANAGER


In [30]:
# Build two Series directly from Python lists.
# pd.Series is the public constructor; pd.core.series.Series is the same class
# reached through pandas' internal module layout, which is not a stable API.
s1 = pd.Series(['one', 'two', 'three'])
s2 = pd.Series([1, 2, 3])

In [31]:
# Each Series becomes one column; the dict key is the column label.
pd.DataFrame({'word': s1, 'num': s2})

Unnamed: 0,word,num
0,one,1
1,two,2
2,three,3


In [33]:
df = pd.read_csv('data/friend_list.txt')
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [35]:
# read_csv handles any text file whose columns are separated by a delimiter,
# not only comma-separated ones. `delimiter` names the separator; here it is a tab.
# NOTE(review): the original note said R also uses a `delimiter` parameter; base R's
# read.table calls it `sep` (readr::read_delim uses `delim`), so confirm which was meant.
df = pd.read_csv('data/friend_list_tab.txt', delimiter='\t')
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [39]:
# header=None: the file has no header row, so the first line is read as data
# and the columns are labelled 0, 1, 2.
df = pd.read_csv('data/friend_list_no_head.csv', header = None) 
df.head()

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [40]:
# Give the header-less frame real column names after loading.
df.columns = ['name','age','job']
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [41]:
# names= sets the column labels while loading, so no separate df.columns assignment is needed.
df = pd.read_csv('data/friend_list_no_head.csv', header = None, names=['name','age','job']) 
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


# 데이터 프레임을 파이썬 코드로 생성하기


In [46]:
# Row-oriented input: one dict per row, keys become the column labels.
friend_dict_list = [
    dict(name='Jane', age=20, job='student'),
    dict(name='Jenny', age=30, job='developer'),
    dict(name='Nate', age=25, job='teacher'),
]

df = pd.DataFrame(data=friend_dict_list)
df.head()

Unnamed: 0,name,age,job
0,Jane,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [48]:
# If the columns come out in a different order, select them by name in the
# desired order and assign the result back to the variable.
df = df[['name','age','job']]
df.head()

Unnamed: 0,name,age,job
0,Jane,20,student
1,Jenny,30,developer
2,Nate,25,teacher


## OrderedDict으로 데이터 프레임 생성하기_20221109
- OrderedDict 자료구조로 데이터프레임을 생성하면, 컬럼의 순서가 뒤바뀌지 않음. (Python 3.7부터는 일반 dict도 삽입 순서를 보존하므로, 최신 환경에서는 일반 dict로도 같은 결과를 얻을 수 있음.)

In [2]:
from collections import OrderedDict

In [58]:
# Column label -> list of column values; the insertion order fixes the column order.
friend_ordered_dict = OrderedDict([('name',['John','Jenny','Nate']),
                                   ('age',[20, 30, 25]),
                                   ('job',['student','developer','teacher'])])

# from_dict is an alternate constructor called on the DataFrame class itself
# (this is not method chaining: no DataFrame instance exists before the call).
df = pd.DataFrame.from_dict(friend_ordered_dict)
df.tail(2)  # show the last two rows

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,25,teacher


### list로 데이터프레임 생성하기

In [60]:
# Column labels, then one inner list per row with the fields in the same order.
column_name = ['name', 'age', 'job']
friend_list = [
    ['John', 20, 'student'],
    ['Jenny', 30, 'developer'],
    ['Nate', 25, 'teacher'],
]

# from_records converts a list of row records into a DataFrame.
df = pd.DataFrame.from_records(data=friend_list, columns=column_name)
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [64]:
# Column-oriented input: each key is a column label, each list holds that column's values.
friend_dict = {'name':['John','Jenny','Nate'],
               'age': [20, 30, 25],
               'job': ['student','developer','teacher']
               }

# from_dict is an alternate constructor called on the DataFrame class itself
# (this is not method chaining: no DataFrame instance exists before the call).
df = pd.DataFrame.from_dict(friend_dict)
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [65]:
# Write the frame to a CSV file at the given path (the file is created).
df.to_csv('data/friend_list_from_df.csv')

In [67]:
# The extension is free-form: the same to_csv call also writes a .txt file.
# NOTE(review): the content is presumably still comma-separated text; only the file name differs.
df.to_csv('data/friend_list_from_df.txt')

In [68]:
# header=False leaves out the column-name row; index=False leaves out the row-label column.
df.to_csv('data/friend_list_from_df_header_index.csv', header=False, index=False)

In [69]:
friend_dict = {'name':['John','Jenny','Nate'],
               'age': [20, None, 25], # None is stored as NaN (Not a Number); the other ages are then shown as 20.0 / 25.0
               'job': ['student','developer','teacher']
               }

df = pd.DataFrame.from_dict(friend_dict) 
df.head()

Unnamed: 0,name,age,job
0,John,20.0,student
1,Jenny,,developer
2,Nate,25.0,teacher


In [73]:
# Write the frame that contains a missing age, using the default representation for missing values.
df.to_csv('data/friend_dict_from_df.csv')

In [74]:
# na_rep='-' writes '-' in the file wherever a value is missing.
df.to_csv('data/friend_dict_from_df_narep.csv', na_rep='-')

## 데이터 접근 방법_20221109
* 인덱스로 row 선택하기

In [9]:
# Column-oriented input: each key is a column label, each list holds that column's values.
friend_dict = dict(
    name=['John', 'Jenny', 'Nate'],
    age=[20, 30, 25],
    job=['student', 'developer', 'teacher'],
)

df = pd.DataFrame.from_dict(data=friend_dict)
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [79]:
df[1:3] # a slice inside [] selects consecutive rows: here rows 1 and 2 (the end, 3, is excluded)

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,25,teacher


In [None]:
df[0,2] # KeyError: (0, 2): [] receives the tuple (0, 2) as one key; it is not a (row, column) pair as in a matrix

In [10]:
# Select rows that are not consecutive.
df.loc[[0,2]] # loc selects by index label; a list of labels returns just those rows

Unnamed: 0,name,age,job
0,John,20,student
2,Nate,25,teacher


In [11]:
# SyntaxError: slice notation (0:2) is not allowed inside a list literal.
# NOTE(review): loc itself does accept slices when written without the inner
# brackets, e.g. df.loc[0:2] (label-based, end label included); the earlier
# note saying loc has no slicing support was inaccurate.
df.loc[[0:2]]

SyntaxError: invalid syntax (1573064818.py, line 1)

### 컬럼값에 따른 row 선택하기_20221109
- 마치 데이터베이스에 쿼리를 전달하듯, 특정한 컬럼값을 충족하는 row만 선택.

In [94]:
df_filtered = df[df.age > 25]
df_filtered

Unnamed: 0,name,age,job
1,Jenny,30,developer


In [96]:
df_query = df.query('age > 25')
df_query

Unnamed: 0,name,age,job
1,Jenny,30,developer


In [102]:
# Combine two conditions with & (element-wise AND); each condition is wrapped in its own parentheses.
df_filtered = df[(df.age >= 25) & (df.name == 'Nate')]
df_filtered

Unnamed: 0,name,age,job
2,Nate,25,teacher


# 컬럼 필터하기_20221109
### 인덱스로 필터하기

In [105]:
# Rows as plain lists with no column names given, so the columns get the labels 0, 1, 2.
friend_list = [
    ['John', 20, 'student'],
    ['Jenny', 30, 'developer'],
    ['Nate', 25, 'teacher'],
]

df = pd.DataFrame.from_records(data=friend_list)
df

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [111]:
# Show every row, but only the columns at positions 0 and 1.

df.iloc[:, :2] # iloc indexes by integer position as [rows, columns]; a bare ':' means "all"

Unnamed: 0,0,1
0,John,20
1,Jenny,30
2,Nate,25


In [112]:
# A list of positions picks non-adjacent columns: here columns 0 and 2, for all rows.
df.iloc[:, [0,2]]

Unnamed: 0,0,2
0,John,student
1,Jenny,developer
2,Nate,teacher


# 컬럼 이름으로 필터링하기_20221109

In [13]:
# Load the header-less file and label its columns via names=.
# NOTE(review): the original trailing note only said "pd.read_table()", presumably as a related reader; confirm intent.
df = pd.read_csv('data/friend_list_no_head.csv', header=None, names=['name','age','job'])
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [122]:
df_filtered = df[['name', 'age']]
df_filtered

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [123]:
df.filter(items=['age','job'])

Unnamed: 0,age,job
0,20,student
1,30,developer
2,30,teacher
3,40,dentist
4,45,manager
5,25,intern


In [125]:
df.filter(like='a', axis=1) # axis=1 searches the column labels: keep the columns whose name contains 'a' (here name and age)

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [127]:
# Filter column labels with a regular expression.
df.filter(regex='b$', axis=1) # 'b$' matches labels that END with 'b' (here only job); a plain 'b' would match 'b' anywhere in the label

Unnamed: 0,job
0,student
1,developer
2,teacher
3,dentist
4,manager
5,intern


# row 드롭하기_20221110
* row 인덱스로 row를 drop할 수 있습니다.

In [15]:
# One dict per row; the names are supplied separately as the row index labels.
friend_dict_list = [
    dict(age=20, job='student'),
    dict(age=30, job='developer'),
    dict(age=25, job='teacher'),
]

df = pd.DataFrame(data=friend_dict_list, index=['John', 'Jenny', 'Nate'])
df.head()

Unnamed: 0,age,job
John,20,student
Jenny,30,developer
Nate,25,teacher


In [19]:
df.drop(['John','Nate']) # returns a new frame without these rows; df itself is not modified
                         # to keep the result, assign it to a variable

Unnamed: 0,age,job
Jenny,30,developer


In [20]:
df

Unnamed: 0,age,job
John,20,student
Jenny,30,developer
Nate,25,teacher


In [21]:
df = df.drop(['John','Nate']) # assigning the result back to df makes the drop persist
df

Unnamed: 0,age,job
Jenny,30,developer


### drop된 결과를 바로 데이터프레임에 저장하는 방법
- inplace 키워드를 사용하면, 따로 저장할 필요없이 drop된 결과가 데이터프레임에 반영됨.

In [23]:
friend_dict_list = [{'age':20, 'job':'student'},
                    {'age':30, 'job':'developer'},
                    {'age':25, 'job':'teacher'}]

df = pd.DataFrame(friend_dict_list, index=['John', 'Jenny', 'Nate'])
df.head()

Unnamed: 0,age,job
John,20,student
Jenny,30,developer
Nate,25,teacher


In [24]:
df.drop(['John', 'Nate'], inplace=True) # inplace=True applies the drop to df itself; no reassignment needed
df

Unnamed: 0,age,job
Jenny,30,developer


In [31]:
friend_dict_list = [{'name':'Jane','age':20,'job':'student'},
                    {'name':'Jenny','age':30,'job':'developer'},
                    {'name':'Nate','age':25,'job':'teacher'}]

df = pd.DataFrame(friend_dict_list)
df.head()

Unnamed: 0,name,age,job
0,Jane,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [32]:
# Drop rows by their position in the index.
df = df.drop(df.index[[0,2]]) # df.index[[0, 2]] looks up the labels at positions 0 and 2; drop removes those rows
df

Unnamed: 0,name,age,job
1,Jenny,30,developer


## 컬럼값으로 row drop 하기

In [41]:
friend_dict_list = [{'name':'Jane','age':20,'job':'student'},
                    {'name':'Jenny','age':30,'job':'developer'},
                    {'name':'Nate','age':25,'job':'teacher'}]

df = pd.DataFrame(friend_dict_list)
df.head()

Unnamed: 0,name,age,job
0,Jane,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [42]:
# Keep only the rows whose age is not 30, i.e. drop the age == 30 row from the result (df is not reassigned).
df[df.age != 30]

Unnamed: 0,name,age,job
0,Jane,20,student
2,Nate,25,teacher


In [43]:
friend_dict_list = [{'name':'Jane','age':20,'job':'student'},
                    {'name':'Jenny','age':30,'job':'developer'},
                    {'name':'Nate','age':25,'job':'teacher'}]

df = pd.DataFrame(friend_dict_list)
df.head()

Unnamed: 0,name,age,job
0,Jane,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [44]:
df.drop('age', axis=1) # axis=1 targets a column; the default axis is 0 (rows)

Unnamed: 0,name,job
0,Jane,student
1,Jenny,developer
2,Nate,teacher


### 컬럼 추가 변경하기

In [46]:
df

Unnamed: 0,name,age,job
0,Jane,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [48]:
# Assigning a scalar to a new column label adds that column with the value in every row.
df['salary'] = 0
df

Unnamed: 0,name,age,job,salary
0,Jane,20,student,0
1,Jenny,30,developer,0
2,Nate,25,teacher,0


In [49]:
df = df.drop('salary', axis=1) # remove the salary column and keep the result
df

Unnamed: 0,name,age,job
0,Jane,20,student
1,Jenny,30,developer
2,Nate,25,teacher


In [50]:
# Use NumPy to build a new column in a single line.
import numpy as np

# np.where(condition, a, b): 'yes' where job is not 'student', 'no' elsewhere; the result becomes a derived column.
df['salary'] = np.where(df['job'] != 'student', 'yes', 'no')
df

Unnamed: 0,name,age,job,salary
0,Jane,20,student,no
1,Jenny,30,developer,yes
2,Nate,25,teacher,yes


In [66]:
# One dict per student; columns= pins the column order of the resulting frame.
friend_dict_list = [
    dict(name='John', midterm=95, final=85),
    dict(name='Jenny', midterm=85, final=80),
    dict(name='Nate', midterm=75, final=95),
    dict(name='Brian', midterm=55, final=55),
]

score_df = pd.DataFrame(data=friend_dict_list, columns=['name', 'midterm', 'final'])
score_df

Unnamed: 0,name,midterm,final
0,John,95,85
1,Jenny,85,80
2,Nate,75,95
3,Brian,55,55


In [67]:
# Derived column: element-wise sum of the midterm and final scores.
score_df['total'] = score_df['midterm'] + score_df['final']
score_df

Unnamed: 0,name,midterm,final,total
0,John,95,85,180
1,Jenny,85,80,165
2,Nate,75,95,170
3,Brian,55,55,110


In [68]:
# Average of the two exams (total divided by 2).
score_df['average'] = score_df['total'] / 2
score_df

Unnamed: 0,name,midterm,final,total,average
0,John,95,85,180,90.0
1,Jenny,85,80,165,82.5
2,Nate,75,95,170,85.0
3,Brian,55,55,110,55.0


In [69]:
# Build a Python list of letter grades (one per row, decided by the average)
# and attach that list as a new column.

grades = [
    'A' if average >= 90 else
    'B' if average >= 80 else
    'C' if average >= 70 else
    'D' if average >= 60 else
    'F'
    for average in score_df['average']
]

score_df['grade'] = grades
score_df

Unnamed: 0,name,midterm,final,total,average,grade
0,John,95,85,180,90.0,A
1,Jenny,85,80,165,82.5,B
2,Nate,75,95,170,85.0,B
3,Brian,55,55,110,55.0,F


### apply() 사용 예제
- 컬럼의 값을 변경하는 코드 구현.

In [73]:
# 값의 수정

def pass_or_fail(row):
    """Translate a letter grade into a pass/fail label.

    Args:
        row: A single grade value (one element of the grade column).

    Returns:
        'Pass' for any value other than 'F', otherwise 'Fail'.
    """
    return 'Pass' if row != 'F' else 'Fail'
    
score_df.grade = score_df.grade.apply(pass_or_fail) # pass the function itself (no call parentheses); apply runs it on every value
score_df

Unnamed: 0,name,midterm,final,total,average,grade
0,John,95,85,180,90.0,Pass
1,Jenny,85,80,165,82.5,Pass
2,Nate,75,95,170,85.0,Pass
3,Brian,55,55,110,55.0,Fail


In [116]:
# Example: extracting the year from year-month-day strings.
# One dict per row, all under the single column 'yyyy-mm-dd'.

date_list = [
    {'yyyy-mm-dd': '1984-06-21'},
    {'yyyy-mm-dd': '1982-03-18'},
    {'yyyy-mm-dd': '2015-11-16'},
]

date_df = pd.DataFrame(
    data=date_list,
    index=['민용기', '이수향', '결혼기념일'],
    columns=['yyyy-mm-dd'],
)
date_df

Unnamed: 0,yyyy-mm-dd
민용기,1984-06-21
이수향,1982-03-18
결혼기념일,2015-11-16


In [117]:
def extract_year(row):
    """Return the text before the first '-' of a 'yyyy-mm-dd' string (the year)."""
    fields = row.split('-')
    return fields[0]

def extract_month(row):
    """Return the second '-'-separated field of a 'yyyy-mm-dd' string (the month)."""
    fields = row.split('-')
    return fields[1]

def extract_day(row):
    """Return the third '-'-separated field of a 'yyyy-mm-dd' string (the day)."""
    fields = row.split('-')
    return fields[2]

In [118]:
# apply runs each helper on every date string; each result is the substring
# returned by split, stored as a new column.
date_df['year'] = date_df['yyyy-mm-dd'].apply(extract_year)
date_df['month'] = date_df['yyyy-mm-dd'].apply(extract_month)
date_df['day'] = date_df['yyyy-mm-dd'].apply(extract_day)
date_df

Unnamed: 0,yyyy-mm-dd,year,month,day
민용기,1984-06-21,1984,6,21
이수향,1982-03-18,1982,3,18
결혼기념일,2015-11-16,2015,11,16


### apply()에 파라미터 전달하기 
* 키워드 파라미터를 사용하면, apply가 적용된 함수에 파라미터를 전달할 수 있다.

In [119]:
def extract_year_age(year, current_year):
    """Return the age in years: current_year minus the birth year.

    Args:
        year: Birth year; converted with int(), so a numeric string is accepted.
        current_year: The reference year as a number.
    """
    birth_year = int(year)
    return current_year - birth_year

In [120]:
# The keyword argument given to apply is forwarded to the function:
# extract_year_age receives current_year=2022 for every value.
date_df['age'] = date_df['year'].apply(extract_year_age, current_year = 2022)
date_df

Unnamed: 0,yyyy-mm-dd,year,month,day,age
민용기,1984-06-21,1984,6,21,38
이수향,1982-03-18,1982,3,18,40
결혼기념일,2015-11-16,2015,11,16,7


In [121]:
def get_introduce(age, prefix, suffix):
    """Return prefix + str(age) + suffix as one sentence.

    Args:
        age: Value placed in the middle; converted with str().
        prefix: Text placed before the age.
        suffix: Text placed after the age.
    """
    age_text = str(age)
    return prefix + age_text + suffix

In [122]:
# Several keyword arguments are forwarded the same way: prefix and suffix
# reach get_introduce on every call.
date_df['introduce'] = date_df['age'].apply(get_introduce, 
                                           prefix='I am ',
                                           suffix=' years old')
date_df

Unnamed: 0,yyyy-mm-dd,year,month,day,age,introduce
민용기,1984-06-21,1984,6,21,38,I am 38 years old
이수향,1982-03-18,1982,3,18,40,I am 40 years old
결혼기념일,2015-11-16,2015,11,16,7,I am 7 years old


In [123]:
# 여러 개의 컬럼을 동시에 전달하기
# - axis=1 이라는 키워드 파라미터를 apply()에 전달해 주면, 모든 컬럼을 지정된 함수에서 사용 가능

def get_introduce2(row):
    """Build an introduction sentence from a row's year and age.

    Args:
        row: An object exposing `year` and `age` attributes (a DataFrame row
            when called through apply with axis=1).

    Returns:
        'I was born in <year> my age is <age>'.
    """
    born = str(row.year)
    years = str(row.age)
    return 'I was born in ' + born + ' my age is ' + years

# axis=1 makes apply hand each row to the function, so it can read several columns at once.
date_df.introduce = date_df.apply(get_introduce2, axis=1)
date_df

Unnamed: 0,yyyy-mm-dd,year,month,day,age,introduce
민용기,1984-06-21,1984,6,21,38,I was born in 1984 my age is 38
이수향,1982-03-18,1982,3,18,40,I was born in 1982 my age is 40
결혼기념일,2015-11-16,2015,11,16,7,I was born in 2015 my age is 7


### map()으로 컬럼 추가 및 변경하기

In [124]:
date_list = [{'yyyy-mm-dd':'1984-06-21'},
             {'yyyy-mm-dd':'1982-03-18'},
             {'yyyy-mm-dd':'2015-11-16'}]

date_df = pd.DataFrame(date_list, index=['민용기','이수향','결혼기념일'] ,columns=['yyyy-mm-dd'])
date_df

Unnamed: 0,yyyy-mm-dd
민용기,1984-06-21
이수향,1982-03-18
결혼기념일,2015-11-16


In [126]:
date_df['year'] = date_df['yyyy-mm-dd'].map(extract_year) # like apply, map runs the function on each value; the result is added as a column
date_df

Unnamed: 0,yyyy-mm-dd,year
민용기,1984-06-21,1984
이수향,1982-03-18,1982
결혼기념일,2015-11-16,2015


#### 파라미터로 딕셔너리를 전달하면 컬럼값을 쉽게 원하는 값으로 변경 가능.
#### 기존의 컬럼값은 딕셔너리의 Key로 사용되고, 해당되는 value의 값으로 컬럼값이 변경

In [129]:
# One dict per row with an age and a job title.
job_list = [
    dict(age=20, job='student'),
    dict(age=30, job='developer'),
    dict(age=35, job='teacher'),
]

df = pd.DataFrame(data=job_list)
df

Unnamed: 0,age,job
0,20,student
1,30,developer
2,35,teacher


In [133]:
# Given a dict, map looks each value up as a key and replaces it with the
# matching dict value: 'student' -> 1, 'developer' -> 2, 'teacher' -> 3.
df.job = df.job.map({'student':1,'developer':2, 'teacher':3})
df

Unnamed: 0,age,job
0,20,1
1,30,2
2,35,3


### applymap() 메서드
- 데이터 프레임 전체의 각각의  값을 한 번에 변경시킬 때 사용하면 유용한 메서드
- np.around(data) : 소수점 첫째 자리에서 반올림 (결과는 정수형이 아니라 6.0 같은 실수형). 정확히 .5인 값은 가장 가까운 짝수로 반올림됨 (예: 5.5 → 6.0, -4.5 → -4.0)

- np.round(data, decimals) : 원하는 소수점 자리수(decimals)까지 반올림

In [137]:
# Small all-float frame used to demonstrate element-wise rounding.
x_y = [
    dict(x=5.5, y=-5.6),
    dict(x=-5.2, y=5.5),
    dict(x=-1.6, y=-4.5),
]

df = pd.DataFrame(data=x_y)
df

Unnamed: 0,x,y
0,5.5,-5.6
1,-5.2,5.5
2,-1.6,-4.5


In [138]:
# applymap calls np.around on every cell, so the whole frame is rounded.
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favor of
# DataFrame.map; confirm against the installed version (this notebook shows 1.3.5).
df = df.applymap(np.around)
df

Unnamed: 0,x,y
0,6.0,-6.0
1,-5.0,6.0
2,-2.0,-4.0


# 데이터프레임에 row 추가하기

In [142]:
friend_dict_list = [{'name':'John', 'midterm':95, 'final':85},
                    {'name':'Jenny', 'midterm':85, 'final':80},
                    {'name':'Nate', 'midterm':75, 'final':95},
                    {'name':'Brian', 'midterm':55, 'final':55}]

score_df = pd.DataFrame(friend_dict_list, columns=['name','midterm','final'])
score_df

Unnamed: 0,name,midterm,final
0,John,95,85
1,Jenny,85,80
2,Nate,75,95
3,Brian,55,55


In [143]:
# One-row frame with the same three columns, used as the row to add.
df2 = pd.DataFrame([['Ben', 50, 50]], columns=['name','midterm','final'])
df2

Unnamed: 0,name,midterm,final
0,Ben,50,50


In [144]:
# Return a new frame with df2's row placed after score_df's rows (score_df is not reassigned).
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# ignore_index=True discards the existing labels and renumbers the rows, so the new row gets index 4.
pd.concat([score_df, df2], ignore_index=True)

Unnamed: 0,name,midterm,final
0,John,95,85
1,Jenny,85,80
2,Nate,75,95
3,Brian,55,55
4,Ben,50,50


### groupby() 함수
* 데이터에서 정보를 취하기 위해서 그룹별로 묶는 방법.

In [175]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Sera', 'major': "Psychology", 'sex': "female"}
                ]

df = pd.DataFrame(student_list, columns=['name','major','sex'])
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [151]:
groupby_major = df.groupby('major')
groupby_major

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000025631316E88>

In [152]:
groupby_major.groups # maps each group key (major) to the index labels of the rows in that group

{'Computer Science': [0, 1, 6, 7], 'Economics': [4, 5, 9], 'Physics': [2], 'Psychology': [3, 8, 10]}

In [154]:
# Iterating the GroupBy object yields (group key, sub-frame) pairs.
for name, group in groupby_major:
    print(name + ':' + str(len(group)))  # group key and its number of rows
    print(group)
    print()

Computer Science:4
       name             major     sex
0      John  Computer Science    male
1      Nate  Computer Science    male
6  Jeniffer  Computer Science  female
7    Edward  Computer Science    male

Economics:3
    name      major     sex
4  Janny  Economics  female
5   Yuna  Economics  female
9  Wendy  Economics  female

Physics:1
      name    major   sex
2  Abraham  Physics  male

Psychology:3
     name       major     sex
3   Brian  Psychology    male
8    Zara  Psychology  female
10   Sera  Psychology  female



In [159]:
# Turn the per-major group sizes back into a regular DataFrame
# with the columns major and count.

df_major_cnt = groupby_major.size().reset_index(name='count')
df_major_cnt

Unnamed: 0,major,count
0,Computer Science,4
1,Economics,3
2,Physics,1
3,Psychology,3


In [164]:
groupby_sex = df.groupby('sex')
groupby_sex

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002563117F5C8>

In [165]:
for name, group in groupby_sex:
    print(name + ':' + str(len(group)))
    print(group)
    print()

female:6
        name             major     sex
4      Janny         Economics  female
5       Yuna         Economics  female
6   Jeniffer  Computer Science  female
8       Zara        Psychology  female
9      Wendy         Economics  female
10      Sera        Psychology  female

male:5
      name             major   sex
0     John  Computer Science  male
1     Nate  Computer Science  male
2  Abraham           Physics  male
3    Brian        Psychology  male
7   Edward  Computer Science  male



In [166]:
# Count-per-group frame again, this time built from the sex grouping.
# NOTE(review): the result is stored in df_major_cnt, overwriting the per-major
# counts; a name such as df_sex_cnt was presumably intended.
df_major_cnt = pd.DataFrame({'count':groupby_sex.size()}).reset_index()
df_major_cnt

Unnamed: 0,sex,count
0,female,6
1,male,5


# 중복 데이터 drop 하기

In [176]:
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [177]:
# Insert a duplicate row: a 'Zara' / 'Psychology' / 'female' row is already in df.
df2 = pd.DataFrame([['Zara','Psychology','female']], columns=['name','major','sex'])
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0);
# ignore_index=True renumbers the combined rows from 0.
df = pd.concat([df, df2], ignore_index=True)
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [182]:
# 중복 데이터 확인하기
df.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
dtype: bool

In [183]:
# 중복 데이터를 삭제

df = df.drop_duplicates()
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [186]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': None, 'sex': "female"},
                {'name': 'Sera', 'major': "Psychology", 'sex': None},
                {'name': 'John', 'major': "Computer Science", 'sex': None},
                {'name': 'Nate', 'major': None, 'sex': "male"}
                ]

df = pd.DataFrame(student_list, columns=['name','major','sex'])
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,,female


In [187]:
df.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
dtype: bool

In [191]:
# Treat rows as duplicates when only the name column matches.
# keep=False flags every member of a duplicate set, including the first occurrence.
df.duplicated(['name'], keep=False)

0      True
1      True
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11     True
12     True
dtype: bool

In [190]:
# keep decides which row of each duplicate set survives: 'first' or 'last'.

df.drop_duplicates(['name'], keep = 'last') # keep='last' retains the last occurrence and drops the earlier ones
# The default is keep='first'.

Unnamed: 0,name,major,sex
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,,female
10,Sera,Psychology,
11,John,Computer Science,
