## 3.3　テキストデータを扱う

In [1]:
import logging
import numpy as np
import pandas as pd
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

### 3.3.2　テーブルデータと共通する・異なる部分

#### Bag of Words

In [2]:
# Toy corpus for this section: three short English sentences held in a
# single 'text' column, one sentence per row.
df = pd.DataFrame(
    data=[
        'I like kaggle very much',
        'I do not like kaggle',
        'I do really love machine learning',
    ],
    columns=['text'],
)

df

Unnamed: 0,text
0,I like kaggle very much
1,I do not like kaggle
2,I do really love machine learning


In [3]:
# Bag of Words: count how often each vocabulary word occurs in each sentence.
# The custom token pattern accepts tokens of one or more word characters, so
# single-letter words such as "I" are kept in the vocabulary.
vectorizer = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
vectorizer.fit(df['text'])               # learn the vocabulary
bag = vectorizer.transform(df['text'])   # sparse document-term count matrix
bag.toarray()

array([[0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1],
       [1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0],
       [1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0]])

In [4]:
# Learned vocabulary: maps each (lower-cased) token to its column index
# in the bag-of-words matrix produced above.
vectorizer.vocabulary_

{'i': 1,
 'like': 4,
 'kaggle': 2,
 'very': 10,
 'much': 7,
 'do': 0,
 'not': 8,
 'really': 9,
 'love': 5,
 'machine': 6,
 'learning': 3}

#### TF-IDF

In [5]:
# Step 1: raw term counts, using the same tokenization as the Bag of Words cell.
vectorizer = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
tf = vectorizer.fit_transform(df['text'])

# Step 2: re-weight the counts into TF-IDF scores.
transformer = TfidfTransformer()
tfidf = transformer.fit(tf).transform(tf)
print(tfidf.toarray())

[[0.         0.31544415 0.40619178 0.         0.40619178 0.
  0.         0.53409337 0.         0.         0.53409337]
 [0.43306685 0.33631504 0.43306685 0.         0.43306685 0.
  0.         0.         0.56943086 0.         0.        ]
 [0.34261996 0.26607496 0.         0.45050407 0.         0.45050407
  0.45050407 0.         0.         0.45050407 0.        ]]


#### Word2vec

In [6]:
# Whitespace-tokenize every sentence; Word2Vec expects an iterable of token lists.
# Note that str.split() keeps the original casing, so the vocabulary contains 'I'.
sentences = [d.split() for d in df['text']]

# Train a tiny 10-dimensional model on the toy corpus.
# min_count=1 keeps every word (each one appears only a few times) and
# window=2 limits the context to two words on either side.
# workers=1 is required for seed=7 to actually give repeatable vectors:
# gensim documents that with multiple worker threads the OS scheduling order
# makes training non-deterministic even when a seed is set. (Per the gensim
# docs, reproducibility across interpreter launches additionally needs a
# fixed PYTHONHASHSEED.)
model = word2vec.Word2Vec(
    sentences,
    vector_size=10,
    min_count=1,
    window=2,
    seed=7,
    workers=1,
)

In [7]:
# Look up the learned embedding for the token 'like'
# (a vector of length 10, matching vector_size used at training time).
model.wv['like']

array([ 0.01650858,  0.01069946,  0.00188946,  0.09910005,  0.06153275,
        0.05853238,  0.04005488,  0.02443584, -0.03179482,  0.09779203],
      dtype=float32)

In [8]:
# Rank the other vocabulary words by the similarity of their vectors to 'like';
# returns (word, similarity) pairs, most similar first.
model.wv.most_similar('like')

[('I', 0.4254004955291748),
 ('machine', 0.36355969309806824),
 ('not', 0.311229407787323),
 ('kaggle', -0.004140505567193031),
 ('much', -0.11530754715204239),
 ('do', -0.1529017835855484),
 ('love', -0.25542783737182617),
 ('really', -0.4161785840988159),
 ('learning', -0.44330498576164246),
 ('very', -0.4433840215206146)]

In [9]:
# Whitespace-split tokens of the first sentence (row label 0).
df.loc[0, 'text'].split()

['I', 'like', 'kaggle', 'very', 'much']

In [10]:
# Stack the embedding of every token in the first sentence into a
# (number of tokens x vector size) array, one row per word.
# NOTE(review): this assignment rebinds the name `word2vec`, which hides the
# gensim `word2vec` module imported at the top of the notebook; any later cell
# that needs the module has to import it again.
word2vec = np.array(
    [model.wv[token] for token in df['text'][0].split()]
)
word2vec

array([[ 0.08898099,  0.02501909,  0.03683598,  0.07944275,  0.01565849,
         0.05513714,  0.0667302 , -0.05495857, -0.08889369, -0.03996675],
       [ 0.01650858,  0.01069946,  0.00188946,  0.09910005,  0.06153275,
         0.05853238,  0.04005488,  0.02443584, -0.03179482,  0.09779203],
       [ 0.06329302, -0.03939352, -0.03167932, -0.04431488,  0.04389417,
        -0.04902608,  0.09809195, -0.01098474, -0.00437022,  0.00090965],
       [ 0.03720424, -0.02774719,  0.02864924,  0.01963681, -0.07835456,
        -0.08814968,  0.03203132, -0.02247364,  0.01966591, -0.03539274],
       [-0.09157717,  0.04835419, -0.00529734, -0.08170088, -0.05110302,
         0.00822875,  0.04535742,  0.00155444,  0.02258943,  0.07426786]],
      dtype=float32)

In [11]:
# Average pooling: element-wise mean over the word vectors (rows) gives one
# fixed-length vector for the whole sentence.
word2vec.mean(axis=0)

array([ 0.02288193,  0.00338641,  0.0060796 ,  0.01443277, -0.00167443,
       -0.0030555 ,  0.05645315, -0.01248533, -0.01656068,  0.01952201],
      dtype=float32)

In [12]:
# Max pooling: element-wise maximum over the word vectors (rows) gives one
# fixed-length vector for the whole sentence.
word2vec.max(axis=0)

array([0.08898099, 0.04835419, 0.03683598, 0.09910005, 0.06153275,
       0.05853238, 0.09809195, 0.02443584, 0.02258943, 0.09779203],
      dtype=float32)

In [13]:
# Re-import the gensim module: an earlier cell rebound the name `word2vec`
# to a NumPy array, so the module is no longer reachable under that name.
from gensim.models import word2vec

# Emit gensim's INFO-level progress messages while the model trains.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Read the corpus from the local file 'ja.text8' (it must exist in the
# working directory) and train 200-dimensional word vectors on it.
sentences = word2vec.Text8Corpus('ja.text8')
model = word2vec.Word2Vec(sentences, vector_size=200)

# Words whose vectors are closest to '経済'.
model.wv.most_similar(positive=['経済'])

2024-05-29 16:24:52,704 : INFO : collecting all words and their counts
2024-05-29 16:24:52,706 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-29 16:24:55,886 : INFO : collected 290811 word types from a corpus of 16900026 raw words and 1691 sentences
2024-05-29 16:24:55,886 : INFO : Creating a fresh vocabulary
2024-05-29 16:24:56,043 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 75187 unique words (25.85% of original 290811, drops 215624)', 'datetime': '2024-05-29T16:24:56.043322', 'gensim': '4.3.0', 'python': '3.11.5 (main, Sep 11 2023, 08:31:25) [Clang 14.0.6 ]', 'platform': 'macOS-14.5-arm64-arm-64bit', 'event': 'prepare_vocab'}
2024-05-29 16:24:56,043 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 16577418 word corpus (98.09% of original 16900026, drops 322608)', 'datetime': '2024-05-29T16:24:56.043748', 'gensim': '4.3.0', 'python': '3.11.5 (main, Sep 11 2023, 08:31:25) [Clang 14.0.6 ]', 'platfor

2024-05-29 16:25:46,484 : INFO : EPOCH 3: training on 16900026 raw words (11430752 effective words) took 12.4s, 919984 effective words/s
2024-05-29 16:25:47,497 : INFO : EPOCH 4 - PROGRESS: at 8.28% examples, 936710 words/s, in_qsize 5, out_qsize 0
2024-05-29 16:25:48,508 : INFO : EPOCH 4 - PROGRESS: at 16.62% examples, 939644 words/s, in_qsize 5, out_qsize 0
2024-05-29 16:25:49,522 : INFO : EPOCH 4 - PROGRESS: at 24.96% examples, 939994 words/s, in_qsize 5, out_qsize 0
2024-05-29 16:25:50,524 : INFO : EPOCH 4 - PROGRESS: at 33.23% examples, 940900 words/s, in_qsize 5, out_qsize 0
2024-05-29 16:25:51,530 : INFO : EPOCH 4 - PROGRESS: at 41.45% examples, 939608 words/s, in_qsize 5, out_qsize 0
2024-05-29 16:25:52,542 : INFO : EPOCH 4 - PROGRESS: at 49.79% examples, 939694 words/s, in_qsize 5, out_qsize 0
2024-05-29 16:25:53,553 : INFO : EPOCH 4 - PROGRESS: at 58.13% examples, 940320 words/s, in_qsize 5, out_qsize 0
2024-05-29 16:25:54,568 : INFO : EPOCH 4 - PROGRESS: at 66.47% examples, 

[('財政', 0.6993507146835327),
 ('政策', 0.675098180770874),
 ('社会', 0.6717209815979004),
 ('対外', 0.6704453229904175),
 ('産業', 0.638647735118866),
 ('金融', 0.6264115571975708),
 ('政治', 0.6170356273651123),
 ('格差', 0.6139470934867859),
 ('資本', 0.612656831741333),
 ('農業', 0.5940402150154114)]