### 임베딩(Embedding)

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

### 단어 사전 생성용 텍스트 데이터 생성 + 단어 사전 생성
# 텍스트 데이터 생성
#text = ['나는 배가 고프다', '내일 점심 뭐먹지', '내일 공부 해야겠다', '점심 먹고 공부해야지']

# 단어 사전 생성 --> model.fit() 함수 이용 --> 자동으로 토큰화(띄어쓰기 단위로 토큰화) + 단어 사전 생성
#count.fit(text)
# 생성된 단어 사전 확인하기
#vocab = count.vocabulary_
#print(f'생성된 단어 사전 확인하기 : \n{vocab}')

In [2]:
# 모델 생성
count = CountVectorizer()

In [3]:
text = ['나는 배가 고프다', '내일 점심 뭐먹지', '내일 공부 해야겠다', '점심 먹고 공부해야지']

count.fit(text)
print(count.vocabulary_)

{'나는': 3, '배가': 7, '고프다': 0, '내일': 4, '점심': 8, '뭐먹지': 6, '공부': 1, '해야겠다': 9, '먹고': 5, '공부해야지': 2}


In [7]:
### 보충 : 파이썬 dict --> 크기 순으로 정렬(기분 : key / value)

data = {'apple' : 3, 'cherry' : 1, 'banana' : 2}

# 첫번째 절차 : dict 자료형.items() --> [(key, value), ...., (key n, value n)]
items = data.items()
print(items)
print('-'*80)

# 두번째 절차 :  key를 기준으로 졍렬 --> 오름차순 정렬(기본값)
sorted_key = sorted(items)
print(sorted_key)


dict_items([('apple', 3), ('cherry', 1), ('banana', 2)])
--------------------------------------------------------------------------------
[('apple', 3), ('banana', 2), ('cherry', 1)]


In [5]:
result = count.vocabulary_
print(result)

res = sorted(result.items())
print(res)

{'나는': 3, '배가': 7, '고프다': 0, '내일': 4, '점심': 8, '뭐먹지': 6, '공부': 1, '해야겠다': 9, '먹고': 5, '공부해야지': 2}
[('고프다', 0), ('공부', 1), ('공부해야지', 2), ('나는', 3), ('내일', 4), ('먹고', 5), ('뭐먹지', 6), ('배가', 7), ('점심', 8), ('해야겠다', 9)]


In [20]:
### 한글 문장 --> 임베딩 --> transform(sentence).toarray()

sentence = ['나는 배가 고프다']
sentence1 = [text[0]]
sentence2 = text[:]
#sentence3 = ['오늘의 날씨는 흐리다가 밤부터 비가 올 수 있습니다.']

embedding3 = count.transform(sentence2).toarray()

print(count.transform(sentence).toarray())
print('-'*80)
print(count.transform(sentence1).toarray())
print('-'*80)
print(embedding3)
print('-'*80)
#print(count.transform(sentence3).toarray())

words = sorted(result.keys())
df_embedding = pd.DataFrame(data = embedding3, columns = words)
df_embedding

[[1 0 0 1 0 0 0 1 0 0]]
--------------------------------------------------------------------------------
[[1 0 0 1 0 0 0 1 0 0]]
--------------------------------------------------------------------------------
[[1 0 0 1 0 0 0 1 0 0]
 [0 0 0 0 1 0 1 0 1 0]
 [0 1 0 0 1 0 0 0 0 1]
 [0 0 1 0 0 1 0 0 1 0]]
--------------------------------------------------------------------------------


Unnamed: 0,고프다,공부,공부해야지,나는,내일,먹고,뭐먹지,배가,점심,해야겠다
0,1,0,0,1,0,0,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0
2,0,1,0,0,1,0,0,0,0,1
3,0,0,1,0,0,1,0,0,1,0


In [19]:
words = sorted(result.keys())
print(words)

['고프다', '공부', '공부해야지', '나는', '내일', '먹고', '뭐먹지', '배가', '점심', '해야겠다']


### TfidfVectorizer 모델

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
tfidf = TfidfVectorizer()

text = ['나는 배가 고프다', '내일 점심 뭐먹지', '내일 공부 해야겠다', '점심 먹고 공부해야지']

tfidf.fit(text)
print(tfidf.vocabulary_)

{'나는': 3, '배가': 7, '고프다': 0, '내일': 4, '점심': 8, '뭐먹지': 6, '공부': 1, '해야겠다': 9, '먹고': 5, '공부해야지': 2}


In [27]:
### 한글 문장 --> 임베딩 --> transform(sentence).toarray()

sentence = ['나는 배가 고프다']
sentence1 = [text[0]]
sentence2 = text[:]
#sentence3 = ['오늘의 날씨는 흐리다가 밤부터 비가 올 수 있습니다.']

embedding3 = tfidf.transform(sentence2).toarray()

print(tfidf.transform(sentence).toarray())
print('-'*80)
print(tfidf.transform(sentence1).toarray())
print('-'*80)
print(embedding3)
print('-'*80)
#print(count.transform(sentence3).toarray())


[[0.57735027 0.         0.         0.57735027 0.         0.
  0.         0.57735027 0.         0.        ]]
--------------------------------------------------------------------------------
[[0.57735027 0.         0.         0.57735027 0.         0.
  0.         0.57735027 0.         0.        ]]
--------------------------------------------------------------------------------
[[0.57735027 0.         0.         0.57735027 0.         0.
  0.         0.57735027 0.         0.        ]
 [0.         0.         0.         0.         0.52640543 0.
  0.66767854 0.         0.52640543 0.        ]
 [0.         0.61761437 0.         0.         0.48693426 0.
  0.         0.         0.         0.61761437]
 [0.         0.         0.61761437 0.         0.         0.61761437
  0.         0.         0.48693426 0.        ]]
--------------------------------------------------------------------------------


In [36]:
result1 = tfidf.vocabulary_
print(result1)
print('-'*80)

res = sorted(result1.items())
print(res)
print('-'*80)

words = sorted(result1.keys())
print(words)

df_embedding = pd.DataFrame(data = embedding3, columns = words)
df_embedding

{'나는': 3, '배가': 7, '고프다': 0, '내일': 4, '점심': 8, '뭐먹지': 6, '공부': 1, '해야겠다': 9, '먹고': 5, '공부해야지': 2}
--------------------------------------------------------------------------------
[('고프다', 0), ('공부', 1), ('공부해야지', 2), ('나는', 3), ('내일', 4), ('먹고', 5), ('뭐먹지', 6), ('배가', 7), ('점심', 8), ('해야겠다', 9)]
--------------------------------------------------------------------------------
['고프다', '공부', '공부해야지', '나는', '내일', '먹고', '뭐먹지', '배가', '점심', '해야겠다']


Unnamed: 0,고프다,공부,공부해야지,나는,내일,먹고,뭐먹지,배가,점심,해야겠다
0,0.57735,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0
1,0.0,0.0,0.0,0.0,0.526405,0.0,0.667679,0.0,0.526405,0.0
2,0.0,0.617614,0.0,0.0,0.486934,0.0,0.0,0.0,0.0,0.617614
3,0.0,0.0,0.617614,0.0,0.0,0.617614,0.0,0.0,0.486934,0.0


### 코사인 유사도

In [37]:
### 두 문장 간의 코사인 유사도 측정
'''
1. 두 개의 문장을 CountVectorizer를 이용해서 행렬로 나타낸다.
2. 코사인 유사도 공식을 사용하여 두 문장 간의 유사도를 측정
'''
# 필요한 함수 임폴트
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
ko_text = ["직원이 무단 퇴사를 했는데 손해 배상 청구할 수 있나요?", "무단 퇴사한 직원에 대한 손해 배상 청구가 가능한가요?"]
eng_text = ["What should I do to be a great scientist?", "How can I be a good scientist?"]

In [53]:
ko_count = CountVectorizer()
eng_count = CountVectorizer()

ko_count.fit(ko_text)
eng_count.fit(eng_text)

ko_vocab = ko_count.vocabulary_
eng_vocab = eng_count.vocabulary_

print(ko_vocab)
print('-'*80)
print(eng_vocab)

{'직원이': 7, '무단': 2, '퇴사를': 10, '했는데': 12, '손해': 4, '배상': 3, '청구할': 9, '있나요': 5, '퇴사한': 11, '직원에': 6, '대한': 1, '청구가': 8, '가능한가요': 0}
--------------------------------------------------------------------------------
{'what': 9, 'should': 7, 'do': 2, 'to': 8, 'be': 0, 'great': 4, 'scientist': 6, 'how': 5, 'can': 1, 'good': 3}


In [56]:
ko_words = sorted(ko_vocab.keys())
eng_words = sorted(eng_vocab.keys())

print(ko_words)
print('-'*80)
print(eng_words)

['가능한가요', '대한', '무단', '배상', '손해', '있나요', '직원에', '직원이', '청구가', '청구할', '퇴사를', '퇴사한', '했는데']
--------------------------------------------------------------------------------
['be', 'can', 'do', 'good', 'great', 'how', 'scientist', 'should', 'to', 'what']


In [58]:
'''
ko_text = ["직원이 무단 퇴사를 했는데 손해 배상 청구할 수 있나요?", "무단 퇴사한 직원에 대한 손해 배상 청구가 가능한가요?"]
eng_text = ["What should I do to be a great scientist?", "How can I be a good scientist?"]
'''

ko_sentence1 = [ko_text[0]]
ko_sentence2 = [ko_text[1]]
eng_sentence1 = [eng_text[0]]
eng_sentence2 = [eng_text[1]]

ko_emb1 = ko_count.transform(ko_sentence1).toarray()
ko_emb2 = ko_count.transform(ko_sentence2).toarray()
eng_emb1 = eng_count.transform(eng_sentence1).toarray()
eng_emb2 = eng_count.transform(eng_sentence2).toarray()

print(ko_emb1)
print('-'*80)
print(ko_emb2)
print('-'*80)

print(eng_emb1)
print('-'*80)
print(eng_emb1)
print('-'*80)

[[0 0 1 1 1 1 0 1 0 1 1 0 1]]
--------------------------------------------------------------------------------
[[1 1 1 1 1 0 1 0 1 0 0 1 0]]
--------------------------------------------------------------------------------
[[1 0 1 0 1 0 1 1 1 1]]
--------------------------------------------------------------------------------
[[1 0 1 0 1 0 1 1 1 1]]
--------------------------------------------------------------------------------


In [59]:
### 코사인 유사도 측정
sim_ko = cosine_similarity(ko_emb1, ko_emb2)
sim_eng = cosine_similarity(eng_emb1, eng_emb2)

print(sim_ko)
print(sim_eng)

[[0.375]]
[[0.3380617]]


### CountVectorizer 모델과 코사인 유사도를 이용한 영화 추천 함수 구현

In [60]:
'''
### 실습 개요
- dataset : 2017년 7월 또는 그 이전에 개봉된 영화 45,000편에 대한 각종 정보가
포함되어 있음
- 영화에 대한 줄거리(줄거리, 텍스트 데이터)를 CountVectorizer를 이용하여 행렬로 변환한다.
- 좋아하는 영화의 제목을 입력한다.
- 입력한 영화의 줄거리와 유사한 줄거리를 가지는 영화 제목을 찾아서 추천해준다
### 우리에게 필요한 컬럼 : 제목(title), 줄거리(overview)
'''

'\n### 실습 개요\n- dataset : 2017년 7월 또는 그 이전에 개봉된 영화 45,000편에 대한 각종 정보가\n포함되어 있음\n- 영화에 대한 줄거리(줄거리, 텍스트 데이터)를 CountVectorizer를 이용하여 행렬로 변환한다.\n- 좋아하는 영화의 제목을 입력한다.\n- 입력한 영화의 줄거리와 유사한 줄거리를 가지는 영화 제목을 찾아서 추천해준다\n### 우리에게 필요한 컬럼 : 제목(title), 줄거리(overview)\n'

In [127]:
file_path = '/content/drive/MyDrive/KDT/딥러닝/자연어처리/movies_metadata.csv'

df_movie = pd.read_csv(file_path)
df_movie.info()

  df_movie = pd.read_csv(file_path)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [128]:
### 줄거리, 제목 칼럼 추출
df = df_movie.loc[:, ['overview', 'title']]
df

Unnamed: 0,overview,title
0,"Led by Woody, Andy's toys live happily in his ...",Toy Story
1,When siblings Judy and Peter discover an encha...,Jumanji
2,A family wedding reignites the ancient feud be...,Grumpier Old Men
3,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale
4,Just when George Banks has recovered from his ...,Father of the Bride Part II
...,...,...
45461,Rising and falling between a man and woman.,Subdue
45462,An artist struggles to finish his work while a...,Century of Birthing
45463,"When one of her hits goes wrong, a professiona...",Betrayal
45464,"In a small town live two brothers, one a minis...",Satan Triumphant


In [129]:
df.isnull().sum()

overview    954
title         6
dtype: int64

In [130]:
# 누락확인
df.loc[df['overview'].isnull(), :]

Unnamed: 0,overview,title
32,,Wings of Courage
300,,Roommates
634,,Peanuts – Die Bank zahlt alles
635,,Happy Weekend
641,,The Superwife
...,...,...
45342,,Over/Under
45377,,Simbad e il califfo di Bagdad
45398,,Thick Lashes of Lauri Mäntyvaara
45399,,All at Once


In [131]:
### 누락되는 데이터 존재하는 행 제거 -> df.dropna()
cleaned_df = df.dropna()
print(cleaned_df.isnull().sum())
cleaned_df

overview    0
title       0
dtype: int64


Unnamed: 0,overview,title
0,"Led by Woody, Andy's toys live happily in his ...",Toy Story
1,When siblings Judy and Peter discover an encha...,Jumanji
2,A family wedding reignites the ancient feud be...,Grumpier Old Men
3,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale
4,Just when George Banks has recovered from his ...,Father of the Bride Part II
...,...,...
45461,Rising and falling between a man and woman.,Subdue
45462,An artist struggles to finish his work while a...,Century of Birthing
45463,"When one of her hits goes wrong, a professiona...",Betrayal
45464,"In a small town live two brothers, one a minis...",Satan Triumphant


In [132]:
#cleaned_df = cleaned_df.reset_index(drop = True)
cleaned_df.reset_index(drop = True, inplace = True)
cleaned_df

Unnamed: 0,overview,title
0,"Led by Woody, Andy's toys live happily in his ...",Toy Story
1,When siblings Judy and Peter discover an encha...,Jumanji
2,A family wedding reignites the ancient feud be...,Grumpier Old Men
3,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale
4,Just when George Banks has recovered from his ...,Father of the Bride Part II
...,...,...
44501,Rising and falling between a man and woman.,Subdue
44502,An artist struggles to finish his work while a...,Century of Birthing
44503,"When one of her hits goes wrong, a professiona...",Betrayal
44504,"In a small town live two brothers, one a minis...",Satan Triumphant


In [133]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words = 'english')

In [134]:
### fit() 함수 --> overview 칼럼 --> 단어 사전 생성

# 데이터의 수 --> 10000개로 축소 --> 슬라이싱
data = cleaned_df.iloc[0:10000, :]
#data

# 단어 사전을 생성할 텍스트 데이터 추출
#text = data.loc[:, 'overview']
text = data.loc[:, 'overview'].values
print(text)

print(data.shape)

["Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."
 "When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures."
 "A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is

In [135]:
# 단어 사전 생성
count.fit(text)

vocab = count.vocabulary_
print(vocab)
print('-'*80)

print(len(vocab))

--------------------------------------------------------------------------------
32382


In [136]:
vocab_sort = sorted(vocab.items())
print(vocab_sort)
print('-'*80)

words = sorted(vocab.keys())
print(words)

--------------------------------------------------------------------------------


In [137]:
embeddings = count.transform(text).toarray()
print(embeddings)
print('-'*80)

print(embeddings.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
--------------------------------------------------------------------------------
(10000, 32382)


In [138]:
### 코사인 유사도 계산 --> cosine_similarity(x, y)

# 10000개의 영화 줄거리 각가에 대해서 코사인 유사도 계산
sim = cosine_similarity(embeddings, embeddings)
print(sim)

[[1.         0.02153652 0.         ... 0.         0.         0.        ]
 [0.02153652 1.         0.05170877 ... 0.         0.         0.        ]
 [0.         0.05170877 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [139]:
sim.shape

(10000, 10000)

In [148]:
columns = data.loc[:, 'title'].values
index = data.loc[:, 'title'].values
print(columns)
print(index)


df_sim = pd.DataFrame(data = sim, index = index, columns = columns)
df_sim

['Toy Story' 'Jumanji' 'Grumpier Old Men' ... 'La Chienne' 'Eréndira'
 'The Private Lives of Elizabeth and Essex']
['Toy Story' 'Jumanji' 'Grumpier Old Men' ... 'La Chienne' 'Eréndira'
 'The Private Lives of Elizabeth and Essex']


Unnamed: 0,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,...,Between Your Legs,Zatôichi Meets the One-Armed Swordsman,The Angry Silence,Isadora,San Francisco,Earth,Él,La Chienne,Eréndira,The Private Lives of Elizabeth and Essex
Toy Story,1.000000,0.021537,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.020838,0.0,0.000000,0.039621,0.000000,0.000000,0.000000,0.000000,0.000000
Jumanji,0.021537,1.000000,0.051709,0.000000,0.000000,0.081230,0.000000,0.000000,0.164536,0.0,...,0.017645,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Grumpier Old Men,0.000000,0.051709,1.000000,0.000000,0.031846,0.000000,0.000000,0.024015,0.000000,0.0,...,0.000000,0.000000,0.0,0.030802,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Waiting to Exhale,0.000000,0.000000,0.000000,1.000000,0.000000,0.035921,0.000000,0.028006,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.063246
Father of the Bride Part II,0.000000,0.000000,0.031846,0.000000,1.000000,0.000000,0.047946,0.000000,0.067557,0.0,...,0.021734,0.000000,0.0,0.000000,0.000000,0.000000,0.030949,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Earth,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.055556,0.000000,0.047619,0.000000
Él,0.000000,0.000000,0.000000,0.000000,0.030949,0.000000,0.000000,0.000000,0.020211,0.0,...,0.058521,0.000000,0.0,0.000000,0.000000,0.055556,1.000000,0.025126,0.023810,0.000000
La Chienne,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.017645,0.000000,0.0,0.054153,0.000000,0.000000,0.025126,1.000000,0.000000,0.000000
Eréndira,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.016720,0.000000,0.0,0.000000,0.039621,0.047619,0.023810,0.000000,1.000000,0.000000


In [149]:
### 코사인 유사도 계산 결과 저장하기
file_path = '/content/drive/MyDrive/KDT/딥러닝/자연어처리/df_sim.csv'

df_sim.to_csv(file_path)

In [147]:
df = pd.read_csv(file_path, index_col = 0)
df

Unnamed: 0,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,...,Between Your Legs,Zatôichi Meets the One-Armed Swordsman,The Angry Silence,Isadora,San Francisco,Earth.2,Él,La Chienne,Eréndira,The Private Lives of Elizabeth and Essex
0,1.000000,0.021537,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.020838,0.0,0.000000,0.039621,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.021537,1.000000,0.051709,0.000000,0.000000,0.081230,0.000000,0.000000,0.164536,0.0,...,0.017645,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.051709,1.000000,0.000000,0.031846,0.000000,0.000000,0.024015,0.000000,0.0,...,0.000000,0.000000,0.0,0.030802,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,1.000000,0.000000,0.035921,0.000000,0.028006,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.063246
4,0.000000,0.000000,0.031846,0.000000,1.000000,0.000000,0.047946,0.000000,0.067557,0.0,...,0.021734,0.000000,0.0,0.000000,0.000000,0.000000,0.030949,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.055556,0.000000,0.047619,0.000000
9996,0.000000,0.000000,0.000000,0.000000,0.030949,0.000000,0.000000,0.000000,0.020211,0.0,...,0.058521,0.000000,0.0,0.000000,0.000000,0.055556,1.000000,0.025126,0.023810,0.000000
9997,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.017645,0.000000,0.0,0.054153,0.000000,0.000000,0.025126,1.000000,0.000000,0.000000
9998,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.016720,0.000000,0.0,0.000000,0.039621,0.047619,0.023810,0.000000,1.000000,0.000000
