#### Logic in Python (and pandas)
- <, >, ==, <=, >=
- !=
- df.컬럼명.isin()
- pd.isnull()
- pd.notnull()
- &(and), |(or), ~(not), ^(xor), df.any(), df.all()

In [2]:
import pandas as pd

In [3]:
# Small demo frame: three integer columns, rows labelled "a" through "d"
df = pd.DataFrame(
    data={
        "a": [4, 5, 6, 6],
        "b": [7, 8, 9, 9],
        "c": [10, 11, 12, 12],
    },
    index=["a", "b", "c", "d"],
)
df

Unnamed: 0,a,b,c
a,4,7,10
b,5,8,11
c,6,9,12
d,6,9,12


In [6]:
# Keep only the rows whose "b" value differs from 7
df.loc[df["b"] != 7]

Unnamed: 0,a,b,c
b,5,8,11
c,6,9,12
d,6,9,12


In [32]:
# Rows whose "b" value is not 7, restricted to the "a" and "b" columns.
# Chained-indexing form of the same query:
# df[df["b"] != 7][["a","b"]]

# loc: boolean row mask and column labels in a single call
df.loc[df["b"]!=7,["a","b"]]

# iloc: takes a positional boolean array and integer column positions
# df.iloc[(df["b"]!=7).to_numpy(),[0,1]]

Unnamed: 0,a,b
b,5,8
c,6,9
d,6,9


In [35]:
# Boolean mask: True where column "a" equals 5
df["a"].isin([5])

# Rows whose "a" value is not 5 (mask inverted with ~)
df.loc[~df["a"].isin([5])]

Unnamed: 0,a,b,c
a,4,7,10
c,6,9,12
d,6,9,12


In [76]:
import numpy as np

# Demo frame whose column "a" is missing (NaN) in rows "c" and "d"
df = pd.DataFrame(
    data={
        "a": [4, 5, np.nan, np.nan],
        "b": [7, 8, 9, 9],
        "c": [10, 11, 12, 12],
    },
    index=["a", "b", "c", "d"],
)

df

Unnamed: 0,a,b,c
a,4.0,7,10
b,5.0,8,11
c,,9,12
d,,9,12


In [37]:
# Null check for every cell of the frame

df.isnull()

# Null check for a single column (attribute access and bracket access select the same column)
df.a.isnull()
df["a"].isnull()

a    False
b    False
c    False
d    False
Name: a, dtype: bool

In [52]:
# Number of null values in each column

df.isnull().sum()

a    4
b    0
c    0
dtype: int64

In [47]:
# Number of non-null values in each column
df.notnull().sum()

a    0
b    4
c    4
dtype: int64

In [61]:
# Boolean Series: True where column "a" holds a value (is not null)
df["a"].notnull()

a     True
b    False
c    False
d    False
Name: a, dtype: bool

In [54]:
# Per column: True when at least one element of the column is truthy
df.any()

a    False
b     True
c     True
dtype: bool

In [81]:
# Plain Python `and` works on whole objects and yields the last operand
# it evaluated (5 for the first line, False for the second)
4 and 5
True and False

# Element-wise logic on Series uses & / |, with each comparison parenthesised
df.loc[(df["b"] == 7) & (df["a"] == 4)]
df.loc[(df["b"] == 7) | (df["a"] == 5)]

Unnamed: 0,a,b,c
a,4.0,7,10
b,5.0,8,11


#### 통계

In [83]:
import seaborn as sns

# Load the "iris" sample dataset through seaborn and preview the first rows
df = sns.load_dataset("iris")
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [94]:
# Frequency of each distinct value in the column
df.species.value_counts()

# Number of distinct values in the column (duplicates removed)
df.species.nunique()

3

In [90]:
# shape: returned as a tuple - (number of rows, number of columns)
df.shape

# Number of rows
df.shape[0]
len(df)

150

In [95]:
# Descriptive statistics
# count, mean, std (standard deviation), min, quartiles (25%, 50%, 75%), max
# object columns are excluded by default
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [96]:
# Descriptive summary of the categorical (object) columns

df.describe(include=['object'])

Unnamed: 0,species
count,150
unique,3
top,setosa
freq,50


In [109]:
# Aggregations: sum(), min(), max(), mean(), median(), std(), quantile([0.25, 0.75])

# Sum of one column, then of every numeric column. numeric_only=True keeps the
# string column "species" out of the aggregation instead of concatenating its
# labels into one long string.
df['petal_length'].sum()
df.sum(numeric_only=True)

# Non-null counts work for every dtype, so no filter is needed here
df['petal_length'].count()
df.count()

# Mean of one column, then of every numeric column; without numeric_only=True
# the frame-wide mean fails on the non-numeric "species" column.
df['petal_length'].mean()
df.mean(numeric_only=True)

# First and third quartile of petal_length
df['petal_length'].quantile([0.25, 0.75])

0.25    1.6
0.75    5.1
Name: petal_length, dtype: float64

In [111]:
# Load train.xlsx, then compute descriptive statistics on it
train = pd.read_excel("./resources/train.xlsx")
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [115]:
# Descriptive statistics of train as a whole: no Survived filter is applied,
# so the figures cover every passenger, not only the survivors
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [166]:
# Mean age of the survivors: mean()
# Alternatives that return the mean as a scalar:
# train[train["Survived"]==True]['Age'].mean()
# train[train.Survived==True]['Age'].mean()
train.loc[train["Survived"] == 1, ["Age"]].mean()

# Ages of the survivors whose Pclass is below 3 (append .mean() for their mean age)
# train[(train.Survived==True)&(train.Pclass<3)]['Age']

Age    28.34369
dtype: float64

In [168]:
# Age of the oldest survivor
train.loc[train["Survived"] == True, "Age"].max()

np.float64(80.0)

#### NaN 처리

In [175]:
# Demo frame with missing entries: NaN in "toy", NaN and NaT in "born"
df = pd.DataFrame(
    data={
        "name": ["Alfred", "Batman", "Catwoman"],
        "toy": [np.nan, "Batmobile", "Bullwhip"],
        "born": [np.nan, "1940-04-25", pd.NaT],
    }
)

df

Unnamed: 0,name,toy,born
0,Alfred,,
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [176]:
# Boolean frame marking the missing cells
df.isna()

# Drop every row that contains at least one null
df.dropna()

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


In [177]:
# Drop every column that contains at least one null
df.dropna(axis="columns")

Unnamed: 0,name
0,Alfred
1,Batman
2,Catwoman


In [173]:
# how='any': drop a row (or column) as soon as it contains one NA
# how='all': drop a row (or column) only when every value in it is NA

df.dropna(how='all')

Unnamed: 0,name,toy
1,1940-04-25,Batmobile
2,NaT,Bullwhip


In [195]:
# Column-oriented records; the two None ages become NaN in the frame
student_list = dict(
    name=["John", "Nate", "Edward", "Zara", "Wendy", "Nate", "John"],
    job=["teacher", "teacher", "teacher", "student", "student", "teacher", "student"],
    age=[40, 35, 37, 15, 12, None, None],
)

df = pd.DataFrame(data=student_list)
df

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Edward,teacher,37.0
3,Zara,student,15.0
4,Wendy,student,12.0
5,Nate,teacher,
6,John,student,


In [185]:
# Print the column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    7 non-null      object 
 1   job     7 non-null      object 
 2   age     7 non-null      float64
dtypes: float64(1), object(2)
memory usage: 300.0+ bytes


In [197]:
# Fill the missing ages with 0 and display the result. fillna returns a new
# Series, so df keeps its NaN values for the mean-based fills further down.
# fillna(..., inplace=True) on df['age'] would return None (nothing to display)
# and mutate a column selection, which pandas flags as chained assignment.
df['age'].fillna(0)

0    40.0
1    35.0
2    37.0
3    15.0
4    12.0
5     0.0
6     0.0
Name: age, dtype: float64

In [207]:
# Fill NaN with the overall mean age
# (mean(): average / median(): middle value)
df["age"].fillna(value=df["age"].mean())

# Fill NaN per job: teachers get the teacher mean, students the student mean
df["age"].fillna(value=df.groupby("job")["age"].transform("mean"))

0    40.000000
1    35.000000
2    37.000000
3    15.000000
4    12.000000
5    37.333333
6    13.500000
Name: age, dtype: float64

In [213]:
# Demo frame: column "A" is missing (NaN) in rows "a" and "d"; B, C, D are complete
df = pd.DataFrame(
    data={
        "A": [np.nan, 2, 2, np.nan],
        "B": [7, 8, 9, 9],
        "C": [10, 11, 12, 12],
        "D": [10, 11, 12, 12],
    },
    index=["a", "b", "c", "d"],
)

df

Unnamed: 0,A,B,C,D
a,,7,10,10
b,2.0,8,11,11
c,2.0,9,12,12
d,,9,12,12


In [202]:
# NaN: inspect missing values (info prints non-null counts; isna and isnull give the same boolean mask)
df.info()
df.isna()
df.isnull()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    7 non-null      object 
 1   job     7 non-null      object 
 2   age     5 non-null      float64
dtypes: float64(1), object(2)
memory usage: 300.0+ bytes


Unnamed: 0,name,job,age
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,True
6,False,False,True


In [204]:
# Replace every missing value with 0
df.fillna(value=0)

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Edward,teacher,37.0
3,Zara,student,15.0
4,Wendy,student,12.0
5,Nate,teacher,0.0
6,John,student,0.0


In [214]:
# Per-column replacement for missing values
# A -> 0, B -> 1, C -> 2, D -> 3
values = dict(A=0, B=1, C=2, D=3)
df.fillna(value=values)

Unnamed: 0,A,B,C,D
a,0.0,7,10,10
b,2.0,8,11,11
c,2.0,9,12,12
d,0.0,9,12,12


In [216]:
# Fill missing values with the median of column D
df.fillna(value=df["D"].median())

Unnamed: 0,A,B,C,D
a,11.5,7,10,10
b,2.0,8,11,11
c,2.0,9,12,12
d,11.5,9,12,12
