#### Logic in Python(and pandas)
- <, >, ==, <=, >=
- !=
- df.컬럼명.isin()
- pd.isnull()
- pd.notnull()
- &(and), |(or), ~(not), ^(xor), df.any(), df.all()

In [1]:
import pandas as pd

In [2]:
df = pd.DataFrame({
    "a":[4,5,6,6],
    "b":[7,8,9,9],
    "c":[10,11,12,12]
}, index=list("abcd"))
df

Unnamed: 0,a,b,c
a,4,7,10
b,5,8,11
c,6,9,12
d,6,9,12


In [5]:
# b 컬럼의 값 중 7 이 아닌 values 추출

df[df["b"] != 7]

Unnamed: 0,a,b,c
b,5,8,11
c,6,9,12
d,6,9,12


In [8]:
# b 컬럼의 값 중 7 이 아닌 values 추출 + a,b 컬럼만 추출

# df[df["b"] != 7][["a","b"]]

# loc
df.loc[df["b"] != 7,["a","b"]]

Unnamed: 0,a,b
b,5,8
c,6,9
d,6,9


In [12]:
# a 컬럼에 5를 포함하고 있느냐?
df[df.a.isin([5])]

# a 컬럼에 5를 포함하지 않는
df[~df.a.isin([5])]

Unnamed: 0,a,b,c
a,4,7,10
c,6,9,12
d,6,9,12


In [13]:
import numpy as np

df = pd.DataFrame({
    "a":[4,5,6,6,np.nan],
    "b":[7,8,np.nan,9,9],
    "c":[10,11,12,np.nan,12]
}, index=list(range(1,6)))

df

Unnamed: 0,a,b,c
1,4.0,7.0,10.0
2,5.0,8.0,11.0
3,6.0,,12.0
4,6.0,9.0,
5,,9.0,12.0


In [16]:
# 널 값
df.isnull()

# 특정 컬럼 널 값
df.a.isnull()
df["a"].isnull()

1    False
2    False
3    False
4    False
5     True
Name: a, dtype: bool

In [18]:
df.isnull()
df.isna()

Unnamed: 0,a,b,c
1,False,False,False
2,False,False,False
3,False,True,False
4,False,False,True
5,True,False,False


In [19]:
# 각 컬럼에 null 의 값이 몇개 존재하느냐?

df.isnull().sum()

a    1
b    1
c    1
dtype: int64

In [20]:
# 널이 아닌

df.notnull()

Unnamed: 0,a,b,c
1,True,True,True
2,True,True,True
3,True,False,True
4,True,True,False
5,False,True,True


In [21]:
df.notnull().sum()

a    4
b    4
c    4
dtype: int64

In [22]:
df["a"].notnull()

1     True
2     True
3     True
4     True
5    False
Name: a, dtype: bool

In [23]:
df.any()

a    True
b    True
c    True
dtype: bool

In [31]:
# python and, or, not
4 and 5
True and False

# df[df.b == 7] and df[df.a == 5]

# pandas & | ~
df[(df.b == 7) & (df.a == 5)]
df[(df.b == 7) | (df.a == 5)]

Unnamed: 0,a,b,c
1,4.0,7.0,10.0
2,5.0,8.0,11.0


#### 통계

In [32]:
import seaborn as sns

df = sns.load_dataset("iris")
df.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


In [37]:
# 특정 컬럼의 unique 값 count
df['species'].value_counts()

# 컬럼의 값 중 중복제거한 후 개수
df['species'].nunique()

3

In [36]:
# 튜플로 전체 행, 컬럼 개수
df.shape
# 행
df.shape[0]
len(df)

150

In [38]:
# 수치형 컬럼에 대한 기술적 통계 정보
# count,mean(평균),std(표준편차),min,사분위수(25%,50%,75%),max
# object 컬럼 제외(기본)
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [39]:
# 범주형(object) 기술통계요약

df.describe(include=['object'])

Unnamed: 0,species
count,150
unique,3
top,setosa
freq,50


In [48]:
# sum(), min(), max(), mean(), median(), std(), quantile([0.25,0.75])

df['petal_length'].sum()
df.sum()

df['petal_length'].count()
df.count()

df['petal_length'].mean()
# df.mean()

df['petal_length'].quantile([0.25,0.75])

0.25    1.6
0.75    5.1
Name: petal_length, dtype: float64

In [53]:
# train.xlsx 가져온 후 기술적통계 

df = pd.read_excel("./resources/train.xlsx")
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [55]:
# 생존자들 기술적 통계

df[df["Survived"] == 1].describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,342.0,342.0,342.0,290.0,342.0,342.0,342.0
mean,444.368421,1.0,1.950292,28.34369,0.473684,0.464912,48.395408
std,252.35884,0.0,0.863321,14.950952,0.708688,0.771712,66.596998
min,2.0,1.0,1.0,0.42,0.0,0.0,0.0
25%,250.75,1.0,1.0,19.0,0.0,0.0,12.475
50%,439.5,1.0,2.0,28.0,0.0,0.0,26.0
75%,651.5,1.0,3.0,36.0,1.0,1.0,57.0
max,890.0,1.0,3.0,80.0,4.0,5.0,512.3292


In [56]:
# 생존자들 나이 평균 : mean()

df[df["Survived"] == 1]["Age"].mean()

np.float64(28.343689655172415)

In [57]:
# 생존자들 중 가장 연장자

df[df["Survived"] == 1]["Age"].max()

np.float64(80.0)

#### NaN 처리
- 1) 제거 : dropna()
- 2) 특정한 값으로 대체 : fillna()

In [59]:
df = pd.DataFrame(
    {
        "name":["Alfred","Batman","Catwoman"],
        "toy":[np.nan,"Batmobile","Bullwhip"],
        "born":[np.nan,'1940-04-25',pd.NaT]
    }
)
df

Unnamed: 0,name,toy,born
0,Alfred,,
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [61]:
df.isna()

# 널이 존재 시 제거(행)
df.dropna()

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


In [63]:
# 열기준
df.dropna(axis=1)

Unnamed: 0,name
0,Alfred
1,Batman
2,Catwoman


In [64]:
# any : 컬럼안에 na 하나라도 존재하면 해당하는 행 or 열 제거
# all : na가 모든 행 or 열에 존재해야 제거

df.dropna(how='all')

Unnamed: 0,name,toy,born
0,Alfred,,
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [65]:
value = {"name":'noname', "toy":"Bat", "born":"1900-01-01"}

df.fillna(value=value)

Unnamed: 0,name,toy,born
0,Alfred,Bat,1900-01-01
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,1900-01-01


In [74]:
student_list = {
    "name" : ["John","Nate","Edward","Zara","Wendy","Nate","John"],
    "job" : [
        "teacher","teacher","teacher","student","student","teacher","student"
    ],
    "age":[40,35,37,15,12,None,None]
}

df = pd.DataFrame(student_list)
df

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Edward,teacher,37.0
3,Zara,student,15.0
4,Wendy,student,12.0
5,Nate,teacher,
6,John,student,


In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    7 non-null      object 
 1   job     7 non-null      object 
 2   age     5 non-null      float64
dtypes: float64(1), object(2)
memory usage: 300.0+ bytes


In [69]:
# 
df["age"].fillna(0,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(0,inplace=True)


In [78]:
# NaN 을 나이의 평균으로 채우기
# mean : 평균 / median : 중앙값

# df["age"].fillna(df["age"].mean())

# tearcher 는 teacher평균으로, student는 student평균으로 채우기
df["age"].fillna(df.groupby("job")['age'].transform("mean"))

0    40.000000
1    35.000000
2    37.000000
3    15.000000
4    12.000000
5    37.333333
6    13.500000
Name: age, dtype: float64

In [79]:
data = [
    [np.nan, 2, np.nan, 0],
    [3, 4, np.nan, 1],
    [np.nan, np.nan, np.nan, 5]
]


df = pd.DataFrame(data, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


In [85]:
# NaN : 결측치

# 결측치 확인
df.info()
df.isna()
df.isnull()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       1 non-null      float64
 1   B       2 non-null      float64
 2   C       0 non-null      float64
 3   D       3 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 228.0 bytes


Unnamed: 0,A,B,C,D
0,True,False,True,False
1,False,False,True,False
2,True,True,True,False


In [86]:
# 결측치 0으로 채우기

df.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5


In [87]:
# 특정 값으로 채우기
values = {"A":0, "B":1, "C":2, "D":3}
df.fillna(value=values)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,1.0,2.0,5


In [89]:
# 결측치를 D열의 중앙값으로 채우기

df.fillna(df["D"].median())

Unnamed: 0,A,B,C,D
0,1.0,2.0,1.0,0
1,3.0,4.0,1.0,1
2,1.0,1.0,1.0,5
