### Correlation Coefficient: 상관계수

Example: 부모, 자식 간 키의 관계  
부모키 -> 커짐 -> 자식키 -> 커짐 => 양의 상관계수  
부모키 -> 작음 -> 자식키 -> 작음 => 양의 상관계수  

부모키 -> 커짐 -> 자식키 -> 작음 => 음의 상관계수  
부모키 -> 작음 -> 자식키 -> 커짐 => 음의 상관계수  
  
피어슨 상관계수   
Cov(부모키, 자식키) / (표준편차(부모키) * 표준편차(자식키))

$$\rho_{X,Y} = \frac{\operatorname{cov}(X,Y)}{\sigma_X \, \sigma_Y}$$

# 추천시스템
- Collaborative Filtering(협업필터링) -> 추천시스템
- 협업필터링 알고리즘을 기반으로 추천시스템 제작
- 협업필터링의 세 종류
    - User-Based 
    - Item-Based
    - Hybrid-Based

### User-Based

In [210]:
from math import sqrt
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Toy ratings table: critic name -> {movie title: score}.
# '손흥민' has no score for '암수살인'; every other critic rated all three.
critics = {
    'BTS': {'암수살인': 5, '바울': 4, '할로윈': 1.5},
    '손흥민': {'바울': 5, '할로윈': 2},
    '레드벨벳': {'암수살인': 2.5, '바울': 2, '할로윈': 1},
    '트와이스': {'암수살인': 3.5, '바울': 4, '할로윈': 5},
}

In [211]:
def get_dist(x1, y1, x2=critics['손흥민']['바울'], y2=critics['손흥민']['할로윈']):
    """Return the Euclidean distance between (x1, y1) and (x2, y2).

    The defaults for ``x2`` and ``y2`` are read from ``critics['손흥민']``
    once, when this ``def`` statement executes; rebinding or editing
    ``critics`` afterwards does not change them.
    """
    dx = x1 - x2
    dy = y1 - y2
    return sqrt(dx ** 2 + dy ** 2)

# Score every other critic against '손흥민' on the two titles
# ('바울', '할로윈') that get_dist's default arguments are taken from.
distance, relevancy = {}, {}
best = float('inf')  # not read anywhere in this cell

# Titles rated by the reference critic; built once instead of per iteration.
target_movies = set(critics['손흥민'])

for person, ratings in critics.items():
    if person == '손흥민':
        continue

    # Only compare critics who rated every title the reference critic rated.
    if target_movies <= set(ratings):
        # Compute the distance once and derive the similarity from it.
        dist = get_dist(ratings['바울'], ratings['할로윈'])
        distance[person] = dist
        # Maps a distance in [0, inf) to a similarity in (0, 1]; the +1
        # keeps the division defined when the distance is 0.
        relevancy[person] = 1 / (dist + 1)

print(distance)
print(relevancy)

{'BTS': 1.118033988749895, '레드벨벳': 3.1622776601683795, '트와이스': 3.1622776601683795}
{'BTS': 0.4721359549995794, '레드벨벳': 0.2402530733520421, '트와이스': 0.2402530733520421}


### 정규화 (Normalization)
$$\frac{\text{각 열 데이터} - \text{각 열 최소값}}{\text{각 열 최대값} - \text{각 열 최소값}}$$

In [217]:
# Movie ratings keyed by critic name, then by title.
# Not every critic has rated every title, so the inner dicts differ in
# length and in key order.
critics = {
    '레드벨벳': {
        '택시운전사': 2.5, '겨울왕국': 3.5, '리빙라스베가스': 3.0,
        '넘버3': 3.5, '사랑과전쟁': 2.5, '세계대전': 3.0,
    },
    'BTS': {
        '택시운전사': 1.0, '겨울왕국': 4.5, '리빙라스베가스': 0.5,
        '넘버3': 1.5, '사랑과전쟁': 4.5, '세계대전': 5.0,
    },
    '블랙핑크': {
        '택시운전사': 3.0, '겨울왕국': 3.5, '리빙라스베가스': 1.5,
        '넘버3': 5.0, '세계대전': 3.0, '사랑과전쟁': 3.5,
    },
    '소녀시대': {
        '택시운전사': 2.5, '겨울왕국': 3.0,
        '넘버3': 3.5, '세계대전': 4.0,
    },
    '마마무': {
        '겨울왕국': 3.5, '리빙라스베가스': 3.0, '세계대전': 4.5,
        '넘버3': 4.0, '사랑과전쟁': 2.5,
    },
    '오마이걸': {
        '택시운전사': 3.0, '겨울왕국': 4.0, '리빙라스베가스': 2.0,
        '넘버3': 3.0, '세계대전': 3.5, '사랑과전쟁': 2.0,
    },
    '모모랜드': {
        '택시운전사': 3.0, '겨울왕국': 4.0, '세계대전': 3.0,
        '넘버3': 5.0, '사랑과전쟁': 3.5,
    },
    '우주소녀': {
        '겨울왕국': 4.5, '사랑과전쟁': 1.0, '넘버3': 4.0,
    },
}

### Sort
* 인덱스 기준: sort_index
* 데이터 기준: sort_values    

In [52]:
# Series whose index labels are deliberately out of alphabetical order.
obj = pd.Series(range(4), index=['j', 'i', 'k', 'p'])
# Sort by index label; the sorted result is only displayed, `obj` is not reassigned.
obj.sort_index()
# In-place alternative, left disabled:
# obj.sort_index(inplace=True)

i    1
j    0
k    2
p    3
dtype: int64

In [69]:
# 2x4 frame whose row labels and column labels are both unsorted.
df = pd.DataFrame(np.arange(8).reshape((2, 4)),
                   index=['three', 'one'],
                   columns=['k', 'j', 'i', 'p'])
df.sort_index(axis=0) # axis=0: reorder the rows by their index labels (result discarded)
df.sort_index(axis=1) # axis=1: reorder the columns by their column labels (result discarded)
df.sort_index(ascending=False) # row labels in descending order; only this last result is displayed

Unnamed: 0,k,j,i,p
three,0,1,2,3
one,4,5,6,7


In [72]:
# Includes one NaN to show where missing values land after sorting.
obj = pd.Series([2, 5, -3, np.nan, 0])
# Ascending by value; the NaN entry is placed at the end.
obj.sort_values()

2   -3.0
4    0.0
0    2.0
1    5.0
3    NaN
dtype: float64

In [73]:
# Descending by value; the NaN entry still comes last.
obj.sort_values(ascending=False)

1    5.0
0    2.0
4    0.0
2   -3.0
3    NaN
dtype: float64

In [82]:
# Two-column frame used by the sort_values examples that follow.
df = pd.DataFrame({'z': [3, 2, -1, 5], 'a': [0, 1, 0, 1]})
df

Unnamed: 0,z,a
0,3,0
1,2,1
2,-1,0
3,5,1


In [83]:
# Reorder the rows by the values in column 'z', ascending.
df.sort_values(by='z')

Unnamed: 0,z,a
2,-1,0
1,2,1
0,3,0
3,5,1


In [84]:
# Sort by 'z' first, then by 'a' to break ties. 'z' has no duplicate values
# here, so the row order is the same as sorting by 'z' alone.
df.sort_values(by=['z', 'a'])

Unnamed: 0,z,a
2,-1,0
1,2,1
0,3,0
3,5,1


### Unique Index
* 행의 인덱스는 유일해야 하지만 판다스에서 강제사항은 아님

In [85]:
# Series built with repeated index labels ('a' and 'c' each appear twice).
obj = pd.Series(range(5), index=['a', 'a', 'b', 'c', 'c'])
obj

a    0
a    1
b    2
c    3
c    4
dtype: int64

In [86]:
# The index keeps the repeated labels as given.
obj.index

Index(['a', 'a', 'b', 'c', 'c'], dtype='object')

In [89]:
# False here because the labels 'a' and 'c' are repeated.
obj.index.is_unique

False

In [90]:
# Selecting a repeated label returns every matching entry.
obj['a']

a    0
a    1
dtype: int64

In [95]:
# A repeated label selects several entries, so the result is a Series.
type(obj['a'])

pandas.core.series.Series

In [96]:
# A label that occurs once selects a single value, so the result is a scalar.
type(obj['b'])

numpy.int64

In [103]:
# 4x3 frame of random normal values with repeated row labels.
# No seed is set, so the numbers change on every run.
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

Unnamed: 0,0,1,2
a,-0.667788,0.337318,-0.557298
a,0.505233,0.017367,-1.520091
b,0.976283,0.72219,-2.074744
b,1.408046,-1.440694,0.881634


In [109]:
# Label-based selection returns both rows labelled 'b'.
df.loc['b']

Unnamed: 0,0,1,2
b,0.976283,0.72219,-2.074744
b,1.408046,-1.440694,0.881634


In [112]:
# Frame with missing values: row 'a' lacks 'two', row 'c' is entirely NaN.
df = pd.DataFrame([[1.3, np.nan],
                  [7.5, -5.5],
                  [np.nan, np.nan],[0.5, -1.5]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.3,
b,7.5,-5.5
c,,
d,0.5,-1.5


In [120]:
# Mean across the columns of each row. NaN values are skipped, so only the
# all-NaN row 'c' yields NaN.
df.mean(axis=1)

a    1.3
b    1.0
c    NaN
d   -0.5
dtype: float64

In [122]:
# With skipna=False, any NaN in a row makes that row's mean NaN (rows 'a' and 'c').
df.mean(axis=1, skipna=False)

a    NaN
b    1.0
c    NaN
d   -0.5
dtype: float64

In [128]:
# For each column, the row label at which the maximum value occurs.
df.idxmax()

one    b
two    d
dtype: object

In [129]:
# Running total down each column. NaN cells stay NaN in the output but do
# not reset the total carried to later rows.
df.cumsum()

Unnamed: 0,one,two
a,1.3,
b,8.8,-5.5
c,,
d,9.3,-7.0


In [132]:
# 15 string values drawn from three distinct letters.
obj = pd.Series(['a', 'a', 'b', 'b', 'c'] * 3)
# For non-numeric data, describe() reports count, unique, top and freq.
obj.describe()

count     15
unique     3
top        a
freq       6
dtype: object