### Py) 통계 - 상관분석
- https://datadoctorblog.com/2023/08/08/Py-Stat-correlation-analysis/

In [6]:
# Optional (Google Colab only): uncomment the two lines below to mount Google Drive.
#from google.colab import drive
#drive.mount('/content/drive')

In [7]:
import pandas as pd

In [8]:
# Read the bike CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/bike.csv")
df.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [9]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [10]:
# Keep every row and only the columns from `weather` through `count`.
# Selecting by label instead of by position (`iloc[:, 4:]`) keeps this cell
# idempotent: re-running it yields the same frame instead of slicing four
# more columns off the already-reduced `df`. A missing label raises KeyError
# rather than silently returning the wrong columns.
df = df.loc[:, "weather":"count"].copy()
df.head()

Unnamed: 0,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,1,9.84,14.395,81,0.0,3,13,16
1,1,9.02,13.635,80,0.0,8,32,40
2,1,9.02,13.635,80,0.0,5,27,32
3,1,9.84,14.395,75,0.0,3,10,13
4,1,9.84,14.395,75,0.0,0,1,1


### 상관계수 확인하기
- temp 기온
- atemp 체감온도
- 0.98 매우 높은 양의 상관관계를 가짐

In [11]:
# Pearson correlation between temperature and "feels-like" temperature
# (Pearson is the default method of DataFrame.corr).
df.loc[:, ["temp", "atemp"]].corr(method="pearson")

Unnamed: 0,temp,atemp
temp,1.0,0.984948
atemp,0.984948,1.0


In [12]:
# Smallest and largest values found in the weather column.
[df["weather"].agg(stat) for stat in ("min", "max")]

[1, 4]

In [13]:
# Distinct values of the weather column.
pd.unique(df["weather"])

array([1, 2, 3, 4])

### kendall 방법으로 상관 분석
- 날씨 - 순서형
- 상대습도 -  숫자형
- 0.3 양의 상관관계가 있음

In [14]:
# Kendall rank correlation between weather (ordinal) and humidity (numeric).
df.loc[:, ["weather", "humidity"]].corr(method="kendall")

Unnamed: 0,weather,humidity
weather,1.0,0.324066
humidity,0.324066,1.0


### spearman 방법으로 상관 분석
- 날씨 - 순서형
- 상대습도 -  숫자형
- 약 0.4(0.399)의 양의 상관관계가 있음

In [15]:
# Spearman rank correlation between weather (ordinal) and humidity (numeric).
df.loc[:, ["weather", "humidity"]].corr(method="spearman")

Unnamed: 0,weather,humidity
weather,1.0,0.399492
humidity,0.399492,1.0


In [16]:
# With several variables, corr() builds the full pairwise correlation matrix;
# the coefficients are rounded to two decimal places for display.
df.corr(method="pearson").round(decimals=2)

Unnamed: 0,weather,temp,atemp,humidity,windspeed,casual,registered,count
weather,1.0,-0.06,-0.06,0.41,0.01,-0.14,-0.11,-0.13
temp,-0.06,1.0,0.98,-0.06,-0.02,0.47,0.32,0.39
atemp,-0.06,0.98,1.0,-0.04,-0.06,0.46,0.31,0.39
humidity,0.41,-0.06,-0.04,1.0,-0.32,-0.35,-0.27,-0.32
windspeed,0.01,-0.02,-0.06,-0.32,1.0,0.09,0.09,0.1
casual,-0.14,0.47,0.46,-0.35,0.09,1.0,0.5,0.69
registered,-0.11,0.32,0.31,-0.27,0.09,0.5,1.0,0.97
count,-0.13,0.39,0.39,-0.32,0.1,0.69,0.97,1.0


In [17]:
# When only two variables are involved, Series.corr returns the single coefficient.
df["temp"].corr(df["atemp"], method="pearson")

0.9849481104817069

### scipy library로 상관분석

In [18]:
# scipy.stats is needed to get the correlation coefficient together with the
# hypothesis-test p-value.
from scipy.stats import kendalltau, pearsonr, spearmanr

In [19]:
# Pearson correlation coefficient and its p-value for temp vs. atemp.
pearsonr(x=df["temp"], y=df["atemp"])

(0.9849481104817065, 0.0)

- p-value ≈ 0 (출력된 0.0은 값이 너무 작아 부동소수점으로 0으로 표시된 것)
- 유의수준 5%로 검정할 경우 귀무가설(두 변수 사이에 상관관계가 없다)을 기각하고 대립가설을 채택
- 즉, 산출된 피어슨 상관계수 0.9849...가 통계적으로 유의미함을 의미

In [21]:
# Unpack the result: `stat` receives the correlation coefficient, `p` the p-value.
stat, p = pearsonr(x=df["temp"], y=df["atemp"])

In [22]:
# Display the stored coefficient and p-value as a tuple.
(stat, p)

(0.9849481104817065, 0.0)

- 상관계수(stat)와 p-value(p) 결과를 각각 변수에 저장