* Contents🔍
> 1. 판다스 시리즈에서 값 얻기
> 2. 판다스 시리즈 요약통계
> 3. 시리즈 값 변경
> 4. (조건에 따라) 시리즈 값 변경⭐
> 5. 문자열 시리즈⭐
> 6. 날짜 다루기⭐
> 7. 누락 데이터 다루기⭐

#  판다스 시리즈에서 값 얻기

In [2]:
import pandas as pd

pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 35)
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.2f}'.format

nls97 = pd.read_csv("data/nls97b.csv")
nls97.set_index("personid", inplace=True)

In [99]:
nls97.gpaoverall

personid
100061     3
100139   nan
100284   nan
100292     3
100583     3
          ..
999291     3
999406     2
999543   nan
999698   nan
999963     4
Name: gpaoverall, Length: 8984, dtype: float64

In [3]:
# 1. create a series from the GPA column
gpaoverall = nls97.gpaoverall
type(gpaoverall), gpaoverall.index

(pandas.core.series.Series,
 Int64Index([100061, 100139, 100284, 100292, 100583, 100833, 100931, 101089, 101122, 101132,
             ...
             998997, 999031, 999053, 999087, 999103, 999291, 999406, 999543, 999698, 999963],
            dtype='int64', name='personid', length=8984))

In [4]:
gpaoverall.head(2)

personid
100061   3.06
100139    nan
Name: gpaoverall, dtype: float64

In [5]:
# 2. select gpa values using bracket notation
gpaoverall[:5]

personid
100061   3.06
100139    nan
100284    nan
100292   3.45
100583   2.91
Name: gpaoverall, dtype: float64

In [6]:
gpaoverall.tail()

personid
999291   3.11
999406   2.17
999543    nan
999698    nan
999963   3.78
Name: gpaoverall, dtype: float64

In [7]:
gpaoverall[-5:]

personid
999291   3.11
999406   2.17
999543    nan
999698    nan
999963   3.78
Name: gpaoverall, dtype: float64

In [8]:
# 3. select values using loc (이름)
gpaoverall.loc[100061]

3.06

In [9]:
gpaoverall.loc[[100061]]

personid
100061   3.06
Name: gpaoverall, dtype: float64

In [10]:
gpaoverall.loc[[100061,100139,100284]]

personid
100061   3.06
100139    nan
100284    nan
Name: gpaoverall, dtype: float64

In [11]:
gpaoverall.loc[100061:100833]

personid
100061   3.06
100139    nan
100284    nan
100292   3.45
100583   2.91
100833   2.46
Name: gpaoverall, dtype: float64

In [12]:
# 4. select values using iloc (위치)
gpaoverall.iloc[[0]]

personid
100061   3.06
Name: gpaoverall, dtype: float64

In [13]:
gpaoverall.iloc[[0,1,2,3,4]]

personid
100061   3.06
100139    nan
100284    nan
100292   3.45
100583   2.91
Name: gpaoverall, dtype: float64

In [14]:
gpaoverall.iloc[:5]

personid
100061   3.06
100139    nan
100284    nan
100292   3.45
100583   2.91
Name: gpaoverall, dtype: float64

In [15]:
gpaoverall.iloc[-5:]

personid
999291   3.11
999406   2.17
999543    nan
999698    nan
999963   3.78
Name: gpaoverall, dtype: float64

# 판다스 시리즈 요약통계

In [16]:
import pandas as pd
import numpy as np
nls97 = pd.read_csv("data/nls97b.csv")
nls97.set_index("personid", inplace=True)

In [17]:
# 1. descriptive statistics
gpaoverall = nls97.gpaoverall
gpaoverall.mean()

2.8184077281812145

In [18]:
gpaoverall.describe()

count   6,004.00
mean        2.82
std         0.62
min         0.10
25%         2.43
50%         2.86
75%         3.26
max         4.17
Name: gpaoverall, dtype: float64

In [19]:
gpaoverall.quantile(np.arange(0.1,1.1,0.1))

0.10   2.02
0.20   2.31
0.30   2.52
0.40   2.70
0.50   2.86
0.60   3.01
0.70   3.17
0.80   3.36
0.90   3.60
1.00   4.17
Name: gpaoverall, dtype: float64

In [20]:
# 2. subset based on values
gpaoverall.loc[gpaoverall.between(3,3.5)].head(3)

personid
100061   3.06
100292   3.45
101526   3.37
Name: gpaoverall, dtype: float64

In [21]:
gpaoverall.loc[gpaoverall.between(3,3.5)].count()

1679

In [22]:
gpaoverall.loc[(gpaoverall<2) | (gpaoverall>4)].sample(5, random_state=2)

personid
932782   1.90
561335   1.82
850001   4.10
292455   1.97
644271   1.97
Name: gpaoverall, dtype: float64

In [23]:
gpaoverall.loc[gpaoverall>gpaoverall.quantile(0.99)].agg(['count','min','max']) # 상위 1%의 빈도 및 최소/최댓값

count   60.00
min      3.98
max      4.17
Name: gpaoverall, dtype: float64

In [100]:
gpaoverall

personid
100061     3
100139   nan
100284   nan
100292     3
100583     3
          ..
999291     3
999406     2
999543   nan
999698   nan
999963     4
Name: gpaoverall, Length: 8984, dtype: float64

In [24]:
# 3. run tests across all values
(gpaoverall>4).any() # any person has GPA greater than 4

True

In [25]:
(gpaoverall>=0).all() # all people have GPA greater than 0

False

In [26]:
(gpaoverall>=0).sum() # of people with GPA greater than 0

6004

In [27]:
(gpaoverall.isnull()==True).sum() # data length=8984

2980

In [28]:
(gpaoverall==0).sum() # of people with GPA equal to 0

0

In [29]:
gpaoverall.isnull().sum() # of people with missing value for GPA

2980

📍 True 조건을 걸지 않아도, True인 것에 대해서만 요약통계를 보여줌  > e.g. (gpaoverall.isnull()==True).sum()

    참고로 데이터프레임의 length=8984

In [30]:
# 4. GPA for high/low wage income earners
nls97.loc[nls97.wageincome > nls97.wageincome.quantile(0.75),'gpaoverall'].mean()

3.0804171011470256

In [31]:
nls97.loc[nls97.wageincome < nls97.wageincome.quantile(0.25),'gpaoverall'].mean()

2.720143415906124

In [32]:
# 5. counts for series with categorical data
nls97.maritalstatus.describe()

count        6672
unique          5
top       Married
freq         3066
Name: maritalstatus, dtype: object

📍 top은 최빈값

In [33]:
nls97.maritalstatus.value_counts()

Married          3066
Never-married    2766
Divorced          663
Separated         154
Widowed            23
Name: maritalstatus, dtype: int64

# 시리즈 값 변경

In [34]:
import pandas as pd
nls97 = pd.read_csv("data/nls97b.csv")
nls97.set_index("personid", inplace=True)

In [35]:
# 1. multiply all values of a series by scalar
nls97.gpaoverall.head()

personid
100061   3.06
100139    nan
100284    nan
100292   3.45
100583   2.91
Name: gpaoverall, dtype: float64

In [36]:
gpaoverall100 = nls97['gpaoverall'] * 100
gpaoverall100.head()

personid
100061   306.00
100139      nan
100284      nan
100292   345.00
100583   291.00
Name: gpaoverall, dtype: float64

In [37]:
# 2. use loc accessor to apply a scalar to selected rows
nls97.loc[[100061], 'gpaoverall'] = 3
nls97.loc[[100139,100284,100292],'gpaoverall'] = 0
nls97.gpaoverall.head()

personid
100061   3.00
100139   0.00
100284   0.00
100292   0.00
100583   2.91
Name: gpaoverall, dtype: float64

In [38]:
# 3. set values using more than one series
nls97[['childathome', 'childnotathome']].head(3)

Unnamed: 0_level_0,childathome,childnotathome
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
100061,4.0,0.0
100139,2.0,0.0
100284,1.0,0.0


In [39]:
nls97['childnum'] = nls97.childathome + nls97.childnotathome
nls97.childnum.value_counts().sort_index()

0.00       23
1.00     1364
2.00     1729
3.00     1020
4.00      420
5.00      149
6.00       55
7.00       21
8.00        7
9.00        1
12.00       2
Name: childnum, dtype: int64

In [40]:
# 4. apply a summary value to selected rows
nls97.loc[100061:100292,'gpaoverall'] = nls97.gpaoverall.mean()
nls97.gpaoverall.head()

personid
100061   2.82
100139   2.82
100284   2.82
100292   2.82
100583   2.91
Name: gpaoverall, dtype: float64

In [41]:
# 5. use iloc accessor to apply a scalar to selected rows
nls97.iloc[0, 13] = 2
nls97.iloc[1:4, 13] = 1
nls97.gpaoverall.head()

personid
100061   2.00
100139   1.00
100284   1.00
100292   1.00
100583   2.91
Name: gpaoverall, dtype: float64

In [42]:
nls97.gpaoverall

personid
100061   2.00
100139   1.00
100284   1.00
100292   1.00
100583   2.91
         ... 
999291   3.11
999406   2.17
999543    nan
999698    nan
999963   3.78
Name: gpaoverall, Length: 8984, dtype: float64

In [105]:
# 6. set values after filtering
nls97.gpaoverall.nlargest()

personid
312410   4
639701   4
850001   4
279096   4
620216   4
Name: gpaoverall, dtype: float64

📍 `.nlargest()`는 값이 높은 n번째까지 출력

In [44]:
nls97.loc[nls97.gpaoverall>4, 'gpaoverall'] = 4
nls97.gpaoverall.nlargest()

personid
112756   4.00
119784   4.00
160193   4.00
250666   4.00
271961   4.00
Name: gpaoverall, dtype: float64

# (조건에 따라) 시리즈 값 변경

In [45]:
import pandas as pd
import numpy as np

nls97 = pd.read_csv("data/nls97b.csv")
nls97.set_index("personid", inplace=True)
landtemps = pd.read_csv("data/landtemps2019avgs.csv")

In [46]:
# 1. use the numpy where function to create a categorical series with 2 values
landtemps.elevation.quantile(np.arange(0.2,1.1,0.2))

0.20      48.00
0.40     190.50
0.60     393.20
0.80   1,066.80
1.00   9,999.00
Name: elevation, dtype: float64

In [47]:
landtemps['elevation_group'] = np.where(landtemps.elevation>landtemps.elevation.quantile(0.8),'High','Low')
landtemps.elevation_group = landtemps.elevation_group.astype('category')
landtemps.groupby(['elevation_group'])['elevation'].agg(['count','min','max'])

Unnamed: 0_level_0,count,min,max
elevation_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,2409,1067.0,9999.0
Low,9686,-350.0,1066.8


In [48]:
# 2. use the numpy where function to create a categorical series with 3 values
landtemps.elevation.median()

271.3

In [128]:
landtemps.elevation

0          34
1          10
2          27
3         265
4         977
         ... 
12090      36
12091      10
12092       4
12093     986
12094   1,480
Name: elevation, Length: 12095, dtype: float64

In [49]:
landtemps['elevation_group'] = np.where(landtemps.elevation>landtemps.elevation.quantile(0.8),'High',\
                                        np.where(landtemps.elevation>landtemps.elevation.median(),'Medium','Low'))
landtemps.elevation_group = landtemps.elevation_group.astype('category')
landtemps.groupby(['elevation_group'])['elevation'].agg(['count','min','max'])

Unnamed: 0_level_0,count,min,max
elevation_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,2409,1067.0,9999.0
Low,6056,-350.0,271.3
Medium,3630,271.4,1066.8


In [121]:
# 3. use numpy select to evaluate a list of conditions
test = [(nls97.gpaoverall<2) & (nls97.highestdegree=='0. None'), nls97.highestdegree=='0. None', nls97.gpaoverall<2]
result = ['1. Low GPA and No Diploma','2. No Diploma','3. Low GPA']
nls97['hsachieve'] = np.select(test, result, '4. Did Okay')
nls97[['hsachieve','gpaoverall','highestdegree']].head()
nls97.hsachieve.value_counts().sort_index()

1. Low GPA and No Diploma      95
2. No Diploma                 858
3. Low GPA                    459
4. Did Okay                  7572
Name: hsachieve, dtype: int64

In [51]:
# 4. create a flag if individual ever had bachelor degree enrollment
nls97.loc[[100292,100583,100139], 'colenrfeb00':'colenroct04'].T
nls97['baenrollment'] = nls97.filter(like="colenr").\
  apply(lambda x: x.str[0:1]=='3').\
  any(axis=1)

In [52]:
# 5. create a flag if individual ever had bachelor degree enrollment
nls97.loc[[100292,100583,100139], 'colenrfeb00':'colenroct04'].T

personid,100292,100583,100139
colenrfeb00,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct00,3. 4-year college,1. Not enrolled,1. Not enrolled
colenrfeb01,3. 4-year college,1. Not enrolled,1. Not enrolled
colenroct01,3. 4-year college,3. 4-year college,1. Not enrolled
colenrfeb02,3. 4-year college,3. 4-year college,1. Not enrolled
colenroct02,3. 4-year college,1. Not enrolled,1. Not enrolled
colenrfeb03,3. 4-year college,1. Not enrolled,1. Not enrolled
colenroct03,3. 4-year college,1. Not enrolled,1. Not enrolled
colenrfeb04,3. 4-year college,1. Not enrolled,1. Not enrolled
colenroct04,1. Not enrolled,1. Not enrolled,1. Not enrolled


In [53]:
nls97['baenrollment'] = nls97.filter(like="colenr").apply(lambda x: x.str[0:1]=='3').any(axis=1)

In [54]:
nls97.loc[[100292,100583,100139], ['baenrollment']].T

personid,100292,100583,100139
baenrollment,True,True,False


In [55]:
nls97.baenrollment.value_counts()

False    5085
True     3899
Name: baenrollment, dtype: int64

# 문자열 시리즈
* **문자열 시리즈 데이터**를 평가하고 대치하는 작업에 빈번히 사용하는 다양한 메서드 익히기  

In [56]:
import pandas as pd
import numpy as np
nls97 = pd.read_csv('data/nls97c.csv') # 미국 종단 조사(NLS) 데이터
nls97.set_index('personid', inplace=True)

pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 35)
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.2f}'.format

* **summary📌**
        - 문자열의 일부를 위치에 따라 취하기
          ⭐df.{column name}.str.findall('\d+') - 문자열에서 숫자만 추출 > list로 저장
          ⭐df['new column name'] = np.where(df.{column name}.isnull(), np.nan, np.where(df.{column name}.조건메서드, 'True 출력값', 'False 출력값'))
        - 문자열에 특정 패턴이 있는지 확인하기  
          ⭐df.{column name}.str.contains('a string') - 특정 문자를 포함하는지?
          ⭐df.{column name}.str[i:i+j].isin(list) - 리스트에 있는지?
        - 문자열에 있는 공백 찾기
          ⭐df.{column name}.str.startswith(' ') - 공백으로 시작
          ⭐df.{column name}.str.endswith(' ') - 공백으로 끝
        - 문자열에 있는 공백 제거하기
          ⭐df.{column name}.str.strip() - 둘 다 제거
          ⭐df.{column name}.str.lstrip() - 시작 공백 제거
          ⭐df.{column name}.str.rstrip() - 끝 공백 제거
        - 문자열을 분할하기
        - 문자열 길이를 구하기  
        - 둘 이상의 문자열을 연결하기  
        - 대소문자를 변경하기 

In [57]:
# 1. test whether a str pattern exists 
nls97.govprovidejobs.value_counts()

2. Probably          617
3. Probably not      462
1. Definitely        454
4. Definitely not    300
Name: govprovidejobs, dtype: int64

In [119]:
nls97['govprovidejobsdefprob'] = np.where(nls97.govprovidejobs.isnull(), np.nan, np.where(nls97.govprovidejobs.str.contains('not'), 'No', 'Yes'))
nls97['govprovidejobsdefprob'] = np.where(nls97.govprovidejobs.isnull(), np.nan, np.where(nls97.govprovidejobs.str.contains('Pro|not'), 'No', 'Yes'))
nls97[['govprovidejobsdefprob']].sample(3)

Unnamed: 0_level_0,govprovidejobsdefprob
personid,Unnamed: 1_level_1
217031,
479356,
763745,


📍 govprovidejob(정부에서 일자리를 제공해야 하는지)에 대한 응답에 'not'이 포함되는지를 **contain**으로 조사한 결과(Yes or No)를 새로운 열로 생성함

In [120]:
pd.crosstab(nls97.govprovidejobs, nls97.govprovidejobsdefprob)

govprovidejobsdefprob,No,Yes
govprovidejobs,Unnamed: 1_level_1,Unnamed: 2_level_1
1. Definitely,0,454
2. Probably,617,0
3. Probably not,462,0
4. Definitely not,300,0


In [60]:
# 2. handle spaces in a string
nls97.maritalstatus.value_counts()

Married          3064
Never-married    2766
Divorced          663
Separated         154
Widowed            23
Married             2
Name: maritalstatus, dtype: int64

In [61]:
nls97.maritalstatus.str.endswith(' ').any() # ' '으로 끝나는 값이 한 개라도 있는가

True

In [62]:
nls97.maritalstatus.str.startswith(' ').any() # ' '으로 시작하는 값이 한 개라도 있는가

False

In [63]:
nls97['maritalstatus'] = nls97.copy().maritalstatus.str.strip() # 시작과 끝의 공백을 제거 
nls97.maritalstatus.value_counts()

Married          3066
Never-married    2766
Divorced          663
Separated         154
Widowed            23
Name: maritalstatus, dtype: int64

In [64]:
nls97['evermarried'] = np.where(nls97.maritalstatus.isnull(), np.nan, np.where(nls97.maritalstatus.str.strip()=='Never-married', 'No', 'Yes'))
pd.crosstab(nls97.maritalstatus, nls97.evermarried)

evermarried,No,Yes
maritalstatus,Unnamed: 1_level_1,Unnamed: 2_level_1
Divorced,0,663
Married,0,3066
Never-married,2766,0
Separated,0,154
Widowed,0,23


In [65]:
# 3. compare a string value to a list
nls97['receivedba'] = np.where(nls97.highestdegree.isnull(), np.nan, np.where(nls97.highestdegree.str[0:1].isin(['4','5','6','7']),'Yse','No'))
pd.crosstab(nls97.highestdegree, nls97.receivedba)

receivedba,No,Yse
highestdegree,Unnamed: 1_level_1,Unnamed: 2_level_1
0. None,953,0
1. GED,1146,0
2. High School,3667,0
3. Associates,737,0
4. Bachelors,0,1673
5. Masters,0,603
6. PhD,0,54
7. Professional,0,120


In [None]:
# 4. convert a text response to numeric using numbers in the text
pd.concat([nls97.weeklyhrstv, nls97.weeklyhrstv.str.findall('\d+')], axis=1).sample(5)

📍 weeklyhrstv(일주일동안 TV시간에 보낸 시간) 문자열에 포함된 숫자만 얻어내기 위해서 findall에 정규표현식 `'\d+'`를 전달함

In [67]:
def getnum(numlist):
    highval = 0
    if (type(numlist) is list):
        lastval = int(numlist[-1]) # list의 마지막 원소를 int로 변환하여 객체에 저장
        if (numlist[0]=='40'): # list의 첫번째 값이 '40'이면 (='More than 40 hours a week')
            highval = 45
        elif (lastval==2): # list의 마지막 원소를 int로 바꾼 값이 2이면 (='Less than 2 hours per week')
            highval = 1
        else:
            highval = lastval - 5
    else:
        highval = np.nan
    return highval

📍 **getnum 함수**는 둘 이상의 수가 있는 경우, 두 수의 중간에 가깝도록 숫자를 조정해준다

In [68]:
nls97['weeklyhrstvnum'] = nls97.weeklyhrstv.str.findall('\d+').apply(getnum)
pd.crosstab(nls97.weeklyhrstv, nls97.weeklyhrstvnum)            

weeklyhrstvnum,1.00,5.00,15.00,25.00,35.00,45.00
weeklyhrstv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
11 to 20 hours a week,0,0,1145,0,0,0
21 to 30 hours a week,0,0,0,299,0,0
3 to 10 hours a week,0,3625,0,0,0,0
31 to 40 hours a week,0,0,0,0,116,0
Less than 2 hours per week,1350,0,0,0,0,0
More than 40 hours a week,0,0,0,0,0,176


# 날짜

In [69]:
import pandas as pd
import numpy as np
from datetime import datetime

covidcases = pd.read_csv("data/covidcases720.csv")
nls97 = pd.read_csv("data/nls97c.csv")
nls97.set_index("personid", inplace=True)

pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 35)
pd.set_option('display.max_rows', 220)
pd.options.display.float_format = '{:,.0f}'.format

In [71]:
# 1. show the birth month and year values
nls97[['birthmonth','birthyear']].head()

Unnamed: 0_level_0,birthmonth,birthyear
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
100061,5.0,1980
100139,9.0,1983
100284,11.0,1984
100292,4.0,1982
100583,,1980


In [72]:
nls97[['birthmonth','birthyear']].isnull().sum()

birthmonth    1
birthyear     0
dtype: int64

In [73]:
nls97.birthmonth.value_counts().sort_index()

1     815
2     693
3     760
4     659
5     689
6     720
7     762
8     782
9     839
10    765
11    763
12    736
Name: birthmonth, dtype: int64

In [74]:
nls97.birthyear.value_counts().sort_index()

1980    1691
1981    1874
1982    1841
1983    1807
1984    1771
Name: birthyear, dtype: int64

In [75]:
# 2. use fillna to fix missing value
nls97.birthmonth.fillna(int(nls97.birthmonth.mean()), inplace=True)

In [77]:
nls97[['birthmonth','birthyear']].isnull().sum()

birthmonth    0
birthyear     0
dtype: int64

In [78]:
# 3. use month and date integers to create a datetime column
nls97['birthdate'] = pd.to_datetime(dict(year=nls97.birthyear, month=nls97.birthmonth, day=15))

In [79]:
nls97.birthdate.describe()

  nls97.birthdate.describe()


count                    8984
unique                     60
top       1981-09-15 00:00:00
freq                      190
first     1980-01-15 00:00:00
last      1984-12-15 00:00:00
Name: birthdate, dtype: object

In [80]:
nls97[['birthmonth','birthyear','birthdate']].head()

Unnamed: 0_level_0,birthmonth,birthyear,birthdate
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,5,1980,1980-05-15
100139,9,1983,1983-09-15
100284,11,1984,1984-11-15
100292,4,1982,1982-04-15
100583,6,1980,1980-06-15


In [81]:
nls97[['birthmonth','birthyear','birthdate']].isnull().sum()

birthmonth    0
birthyear     0
birthdate     0
dtype: int64

In [85]:
# 4. define a function for calculating given start and end date
def calcage(startdate, enddate):
    age = enddate.year - startdate.year
    if (enddate.month<startdate.month or (enddate.month==startdate.month and enddate.day<startdate.day)):
        age = age -1
        # 끝날짜의 월이 시작날짜의 월보다 작으면 e.g. 2020-02-01 ~ 2022-01-01 => (2-1)살 => 1살
        # 또는 끝날짜와 시작날짜의 월이 동일하고, 끝날짜의 일자가 시작날짜보다 작으면 e.g. 2021-02-28 ~ 2020-02-01 => (1-1)살 => 0살
    return age

📍 `{date type}.year`, `{date type}.month`, `{date type}.day` 와 같이 어트리뷰트를 뽑아서 연산에 사용할 수 있다

In [84]:
# 5. calculate age
rundate = pd.to_datetime('2022-02-06') # 현재를 run date라고 치면, 현재 기준 몇살일까?
nls97["age"] = nls97.apply(lambda x: calcage(x.birthdate, rundate), axis=1)
nls97.loc[100061:100583, ['age','birthdate']]

Unnamed: 0_level_0,age,birthdate
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
100061,41,1980-05-15
100139,38,1983-09-15
100284,37,1984-11-15
100292,39,1982-04-15
100583,41,1980-06-15


In [86]:
covidcases.head(2)

Unnamed: 0,iso_code,continent,location,casedate,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,tests_units,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cvd_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,region
0,AFG,Asia,Afghanistan,2019-12-31,0,0,0,0,0,0,0,0,,,,,,,,,38928341,54,19,3,1,1804,,597,10,,,38,0,65,South Asia
1,AFG,Asia,Afghanistan,2020-01-01,0,0,0,0,0,0,0,0,,,,,,,,0.0,38928341,54,19,3,1,1804,,597,10,,,38,0,65,South Asia


In [87]:
# 6. convert a string column to a datetime column
covidcases.iloc[:, 0:6].dtypes

iso_code        object
continent       object
location        object
casedate        object
total_cases    float64
new_cases      float64
dtype: object

💡 casedate가 `object`인 상태

In [88]:
covidcases.iloc[:, 0:6].sample(2, random_state=1).T

Unnamed: 0,13482,2445
iso_code,IMN,BRB
continent,Europe,North America
location,Isle of Man,Barbados
casedate,2020-06-20,2020-04-28
total_cases,336,80
new_cases,0,1


In [90]:
covidcases['casedate'] = pd.to_datetime(covidcases.casedate, format='%Y-%m-%d')
covidcases.iloc[:, 0:6].dtypes

iso_code               object
continent              object
location               object
casedate       datetime64[ns]
total_cases           float64
new_cases             float64
dtype: object

💡 casedate를 `datetime`으로 변환됨
> pd.to_datetime(df.{column name}, format='%Y-%m-%d')

In [91]:
# 7. get descriptive statistics on datetime column
covidcases.casedate.describe()

  covidcases.casedate.describe()


count                   29529
unique                    195
top       2020-05-23 00:00:00
freq                      209
first     2019-12-31 00:00:00
last      2020-07-12 00:00:00
Name: casedate, dtype: object

📍 날짜 데이터의 시작, 끝, 최빈값 등을 구할 때 **.describe()**를 사용할 수 있다

In [92]:
# 8. calculate days since first case by country
firstcase = covidcases.loc[covidcases.new_cases>0,['location','casedate']].sort_values(['location','casedate']).\
            drop_duplicates(['location'], keep='first').rename(columns={'casedate':'firstcasedate'})
covidcases = pd.merge(covidcases, firstcase, left_on=['location'], right_on=['location'], how="left") # '최초 발생일'을 기존 데이터프레임에 갖다 붙이기 
covidcases['dayssincefirstcase'] = covidcases.casedate - covidcases.firstcasedate # '해당 발생일' - '최초 발생일'
covidcases.dayssincefirstcase.describe()

count                         29529
mean     56 days 00:15:12.892410850
std      47 days 00:35:41.813685246
min              -62 days +00:00:00
25%                21 days 00:00:00
50%                57 days 00:00:00
75%                92 days 00:00:00
max               194 days 00:00:00
Name: dayssincefirstcase, dtype: object

* `최초 발생일`을 찾아가는 과정 - **sort_values(), drop_duplicates()** 등을 활용

In [96]:
covidcases.loc[covidcases.new_cases>0,['location','casedate']].head(8)

Unnamed: 0,location,casedate
56,Afghanistan,2020-02-25
63,Afghanistan,2020-03-08
64,Afghanistan,2020-03-11
65,Afghanistan,2020-03-15
66,Afghanistan,2020-03-16
67,Afghanistan,2020-03-17
68,Afghanistan,2020-03-18
71,Afghanistan,2020-03-21


In [97]:
covidcases.loc[covidcases.new_cases>0,['location','casedate']].sort_values(['location','casedate']).drop_duplicates(['location'], keep='first').head(8)

Unnamed: 0,location,casedate
56,Afghanistan,2020-02-25
185,Albania,2020-03-09
368,Algeria,2020-02-26
501,Andorra,2020-03-03
622,Angola,2020-03-22
735,Anguilla,2020-03-27
843,Antigua and Barbuda,2020-03-15
958,Argentina,2020-03-04


In [98]:
covidcases.loc[covidcases.new_cases>0,['location','casedate']].sort_values(['location','casedate']).drop_duplicates(['location'], keep='first').rename(columns={'casedate':'firstcasedate'}).head(8)

Unnamed: 0,location,firstcasedate
56,Afghanistan,2020-02-25
185,Albania,2020-03-09
368,Algeria,2020-02-26
501,Andorra,2020-03-03
622,Angola,2020-03-22
735,Anguilla,2020-03-27
843,Antigua and Barbuda,2020-03-15
958,Argentina,2020-03-04


# 누락 데이터

In [142]:
import pandas as pd

pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 12)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.0f}'.format

nls97 = pd.read_csv("data/nls97c.csv")
nls97.set_index("personid", inplace=True)

In [143]:
# 1. set up school record and demographic data frames from the NLS data
schoolrecordlist = ['satverbal','satmath','gpaoverall','gpaenglish','gpamath','gpascience','highestdegree','highestgradecompleted']
demolist = ['maritalstatus','childathome','childnotathome', 'wageincome','weeklyhrscomputer','weeklyhrstv','nightlyhrssleep']
schoolrecord = nls97[schoolrecordlist]
demo = nls97[demolist]
schoolrecord.shape, demo.shape

((8984, 8), (8984, 7))

In [144]:
# 2. check the school record data for missings
schoolrecord.isnull().sum(axis=0) # 컬럼별

satverbal                7578
satmath                  7577
gpaoverall               2980
gpaenglish               3186
gpamath                  3218
gpascience               3300
highestdegree              31
highestgradecompleted    2321
dtype: int64

In [147]:
schoolrecord.tail(15)

Unnamed: 0_level_0,satverbal,satmath,gpaoverall,gpaenglish,gpamath,gpascience,highestdegree,highestgradecompleted
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
998358,,,4.0,370.0,370.0,355.0,6. PhD,20.0
998388,,,,,,,2. High School,
998472,,,3.0,345.0,360.0,267.0,2. High School,12.0
998556,,,3.0,316.0,218.0,274.0,2. High School,12.0
998725,,,,,,,1. GED,11.0
998997,,,,,,,0. None,9.0
999031,,,3.0,298.0,298.0,325.0,5. Masters,
999053,,,,,,,0. None,
999087,,,,,,,2. High School,12.0
999103,,,3.0,283.0,345.0,218.0,2. High School,12.0


In [145]:
misscnt = schoolrecord.isnull().sum(axis=1) # 로우별
misscnt.value_counts().sort_index()

0    1087
1     312
2    3210
3    1102
4     176
5     101
6    2039
7     946
8      11
dtype: int64

In [121]:
schoolrecord.loc[misscnt>=7].head(4).T # 로우(row)기준 결측값이 6개가 넘는 데이터

personid,101705,102061,102648,104627
satverbal,,,,
satmath,,,,
gpaoverall,,,,
gpaenglish,,,,
gpamath,,,,
gpascience,,,,
highestdegree,1. GED,0. None,1. GED,0. None
highestgradecompleted,,,,


📍 `로우(row)별 누락 데이터의 개수를 평가하는 misscnt 객체`를 **loc**에 활용해서 쓸 수 있다.

In [148]:
# 3. remove rows with almost all missing data
schoolrecord = schoolrecord.dropna(thresh=2)
schoolrecord

Unnamed: 0_level_0,satverbal,satmath,gpaoverall,gpaenglish,gpamath,gpascience,highestdegree,highestgradecompleted
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100061,,,3,350,280,315,2. High School,13
100139,,,,,,,2. High School,12
100284,,,,,,,0. None,7
100292,,,3,345,370,300,4. Bachelors,
100583,,,3,283,285,240,2. High School,13
...,...,...,...,...,...,...,...,...
999291,440,350,3,323,280,305,4. Bachelors,16
999406,460,440,2,280,114,143,2. High School,14
999543,,,,,,,2. High School,12
999698,,,,,,,2. High School,12


📍 dropna 메서드의 **thresh** 옵션을 2로 설정하면 > 비누락값이 2개 미만인 행(=누락값이 7~8개인 행) 삭제됨

In [123]:
schoolrecord.isnull().sum(axis=1).value_counts().sort_index()

0    1087
1     312
2    3210
3    1102
4     176
5     101
6    2039
dtype: int64

In [124]:
# 4. assign mean values to missings
int(schoolrecord.gpaoverall.mean())

2

In [125]:
schoolrecord.gpaoverall.isnull().sum()

2023

In [126]:
schoolrecord.gpaoverall.fillna(int(schoolrecord.gpaoverall.mean()), inplace=True)
schoolrecord.gpaoverall.isnull().sum()

0

In [127]:
# 5. use forward fill
demo.wageincome.head().T

personid
100061    12,500
100139   120,000
100284    58,000
100292       nan
100583    30,000
Name: wageincome, dtype: float64

In [128]:
demo.wageincome.isnull().sum()

3893

In [129]:
nls97.wageincome.fillna(method='ffill', inplace=True)

📍 fillna의 `ffill` 옵션은 누락값을 이전에 나온 가까운 비누락값으로 대체해준다

In [131]:
demo = nls97[demolist]
demo.wageincome.head().T

personid
100061    12,500
100139   120,000
100284    58,000
100292    58,000
100583    30,000
Name: wageincome, dtype: float64

In [132]:
demo.wageincome.isnull().sum()

0

In [133]:
# 6. fill missings with the average by group
nls97[['highestdegree','weeksworked17']].head()

Unnamed: 0_level_0,highestdegree,weeksworked17
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
100061,2. High School,48.0
100139,2. High School,52.0
100284,0. None,0.0
100292,4. Bachelors,
100583,2. High School,52.0


In [140]:
workbydegree = nls97.groupby(['highestdegree'])['weeksworked17'].mean().reset_index().rename(columns={'weeksworked17':'meanweeksworked17'})
# 즉, '최종학력' 기준으로 17년도 주당 근로시간을 묶은 뒤, 그룹화한 범위 별로 평균을 계산함

* 중간과정

In [141]:
nls97.groupby(['highestdegree'])['weeksworked17'].mean()

highestdegree
0. None           29
1. GED            35
2. High School    38
3. Associates     40
4. Bachelors      44
5. Masters        45
6. PhD            44
7. Professional   47
Name: weeksworked17, dtype: float64

In [142]:
nls97.groupby(['highestdegree'])['weeksworked17'].mean().reset_index().rename(columns={'weeksworked17':'meanweeksworked17'})

Unnamed: 0,highestdegree,meanweeksworked17
0,0. None,29
1,1. GED,35
2,2. High School,38
3,3. Associates,40
4,4. Bachelors,44
5,5. Masters,45
6,6. PhD,44
7,7. Professional,47


In [143]:
workbydegree

Unnamed: 0,highestdegree,meanweeksworked17
0,0. None,29
1,1. GED,35
2,2. High School,38
3,3. Associates,40
4,4. Bachelors,44
5,5. Masters,45
6,6. PhD,44
7,7. Professional,47


In [153]:
nls97 = nls97.reset_index().merge(workbydegree, left_on=['highestdegree'], right_on=['highestdegree'], how='left').set_index('personid')
# 즉, nls97 데이터와 workbydegree 데이터를 'highestdegree'기준으로 left join 
# => 그 결과로 nls97 데이터에 최종학력별 평균 주당근로시간 칼럼이 추가됨

In [154]:
nls97.weeksworked17.fillna(nls97.meanweeksworked17, inplace=True) # 속한 최종학력 집단의 평균 주당근로시간으로 결측값이 대체됨
nls97[['highestdegree','weeksworked17','meanweeksworked17']].head()

Unnamed: 0_level_0,highestdegree,weeksworked17,meanweeksworked17
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,2. High School,48,38
100139,2. High School,52,38
100284,0. None,0,29
100292,4. Bachelors,44,44
100583,2. High School,52,38
