# 0. Cosine Similarity

$similarity=cos(Θ)=\frac{A⋅B}{||A||\ ||B||}=\frac{\sum_{i=1}^{n}{A_{i}×B_{i}}}{\sqrt{\sum_{i=1}^{n}(A_{i})^2}×\sqrt{\sum_{i=1}^{n}(B_{i})^2}}$

In [44]:
answers = [["남자", "여자"],["17살", "18살"],["INFP", "ENFP", 'ESFJ', "ISFJ", "ISFP", "ESFP", "INTP","INFJ", "ENFJ", "ENTP", "ESTJ", "ISTJ", "INTJ", "ISTP", "ESTP", "ENTJ"],
                ["A", "B", "AB", "O"],
                ["한식", "양식", "중식", "일식"],
                 ["빨강", "주황", "노랑", "초록", "파랑", "보라", "흰색", "검은색"]]

In [45]:
import pandas as pd

data = pd.read_csv("./sample_data.csv")

In [46]:
data.head()

Unnamed: 0,schoolNumber,gender,age,mbti,bloodtype,favoriteFood,favoriteColor
0,10214,남자,17살,ENFJ,B,한식,빨강
1,10215,남자,17살,INFJ,A,중식,파랑
2,10216,여자,18살,ENFP,B,일식,빨강
3,10217,남자,18살,ENTJ,AB,한식,초록
4,10218,여자,17살,ENFJ,O,양식,파랑


In [47]:
# Can bias differ the cosine similarity?
def get_index(df):
    indexes = []
    for i, answer in enumerate(answers):
        no_blank_ele = df[i].strip()
        temp_i = answer.index(no_blank_ele)    
        biased_i = temp_i * (1000-200*i)    
        indexes.append(biased_i)

    return indexes

In [48]:
labels = data.iloc[:, 0]
index_dataframe = data.iloc[:, 1:].apply(get_index, axis=1)

In [49]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

def cos_sim(A, B):
  return dot(A, B)/(norm(A)*norm(B))

In [50]:
cosine_similarity_matrix = np.array([[cos_sim(np.array(index_dataframe[a]), np.array(index_dataframe[b])) for a in range(len(index_dataframe))] for b in range(len(index_dataframe))])

In [51]:
cosine_similarity_matrix

array([[1.        , 0.99205679, 0.39758439, 0.9960881 , 0.9666138 ],
       [0.99205679, 1.        , 0.41209639, 0.98772184, 0.94964256],
       [0.39758439, 0.41209639, 1.        , 0.441682  , 0.55864692],
       [0.9960881 , 0.98772184, 0.441682  , 1.        , 0.9636994 ],
       [0.9666138 , 0.94964256, 0.55864692, 0.9636994 , 1.        ]])

In [52]:
def visualize_cosine_similarity(matrix):
    h, w = matrix.shape

    for h_ in range(h):
        for w_ in range(w):
            print(matrix[h_][w_], end=" ")
        print("\n")

In [53]:
visualize_cosine_similarity(cosine_similarity_matrix)

0.9999999999999999 0.9920567905654849 0.39758438636375143 0.9960881023042134 0.9666137973731176 

0.9920567905654849 1.0 0.412096390413668 0.9877218409080568 0.9496425641327126 

0.39758438636375143 0.412096390413668 1.0 0.44168200311359274 0.5586469198446351 

0.9960881023042134 0.9877218409080568 0.44168200311359274 1.0000000000000002 0.9636993990976406 

0.9666137973731176 0.9496425641327126 0.5586469198446351 0.9636993990976406 0.9999999999999999 



In [54]:

labels.rename(None, inplace=True)

0    10214
1    10215
2    10216
3    10217
4    10218
dtype: int64

In [55]:
cosine_dataframe = pd.DataFrame(cosine_similarity_matrix, columns=labels, index=labels)

In [60]:
cosine_dataframe.head(10)

Unnamed: 0,10214,10215,10216,10217,10218
10214,0.0,0.992057,0.397584,0.996088,0.966614
10215,0.992057,0.0,0.412096,0.987722,0.949643
10216,0.397584,0.412096,0.0,0.441682,0.558647
10217,0.996088,0.987722,0.441682,0.0,0.963699
10218,0.966614,0.949643,0.558647,0.963699,0.0


In [57]:
for i in cosine_dataframe.index:
    cosine_dataframe.loc[i][i] = 0

cosine_dataframe.head()

Unnamed: 0,10214,10215,10216,10217,10218
10214,0.0,0.992057,0.397584,0.996088,0.966614
10215,0.992057,0.0,0.412096,0.987722,0.949643
10216,0.397584,0.412096,0.0,0.441682,0.558647
10217,0.996088,0.987722,0.441682,0.0,0.963699
10218,0.966614,0.949643,0.558647,0.963699,0.0


In [58]:
cosine_dataframe.loc[10214, :]

10214    0.000000
10215    0.992057
10216    0.397584
10217    0.996088
10218    0.966614
Name: 10214, dtype: float64

In [61]:
test = cosine_dataframe.copy()


# prev_index = []
# for schoolNumber in labels:
#     similairty = test.loc[schoolNumber, :]
#     print(similairty)
#     while True:
#         max_similairty = similairty.idxmax()

#         if max_similairty not in prev_index:
#             test.loc[schoolNumber, :] = 0
#             test.loc[:, schoolNumber] = 0
#             prev_index.append(max_similairty)
#             break

#         test.loc[schoolNumber, max_similairty] = 0
#         # print(cosine_dataframe)
#         print(max_similairty)

# print(prev_index)

matching_status = []
for schoolNumber in labels:
    similairty = test.loc[schoolNumber, :]
    
    if similairty.max() == 0:
        print("Matching isn't possible..")
        continue
    max_similairty = similairty.idxmax()

    test.loc[schoolNumber, :] = 0
    test.loc[:, schoolNumber] = 0
    test.loc[max_similairty, :] = 0
    test.loc[:, max_similairty] = 0

    matching_status.append([schoolNumber, max_similairty])    

print(matching_status)

Matching isn't possible..
Matching isn't possible..
Matching isn't possible..
[[10214, 10217], [10215, 10218]]
