# fetch_20newsgroupsのデータセットを使って文章分類

In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np

In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import StandardScaler 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
NewsGroup = fetch_20newsgroups()

In [5]:
data = NewsGroup.data
target = NewsGroup.target
target_names = NewsGroup.target_names

In [6]:
NewsGroup.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [7]:
print(len(data),len(target))

11314 11314


In [8]:
len(target_names)

20

In [12]:
NewsGroup.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

### CountVectorizerを使った文章分類

In [21]:
vectorizer = CountVectorizer(min_df=1)

In [26]:
count_text = ["How to format my head disk", " Hard disk format problems "]
x = vectorizer.fit_transform(count_text)
vectorizer.get_feature_names()

['disk', 'format', 'hard', 'head', 'how', 'my', 'problems', 'to']

In [27]:
print(x.toarray().transpose())

[[1 1]
 [1 1]
 [0 1]
 [1 0]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]


### fetch_20newsgroupsを使用

In [18]:
x_d, x_t, y_d, y_t = train_test_split(data, target, train_size=0.8, random_state=100) 

In [22]:
X_train = vectorizer.fit_transform(x_d)
num_sample,num_feature = X_train.shape

In [39]:
print("sample:%d, feature:%d" % (num_sample,num_feature))

sample:9051, feature:116000


In [None]:
X_train.getrow(3).toarray()

### one-hot表現

In [33]:
def convert_one_hot(corpus, vocab_size):
    '''one-hot表現への変換
    :param corpus: 単語IDのリスト（1次元もしくは2次元のNumPy配列）
    :param vocab_size: 語彙数
    :return: one-hot表現（2次元もしくは3次元のNumPy配列）
    '''
    N = corpus.shape[0]

    if corpus.ndim == 1:
        one_hot = np.zeros((N, vocab_size), dtype=np.int32)
        for idx, word_id in enumerate(corpus):
            one_hot[idx, word_id] = 1

    elif corpus.ndim == 2:
        C = corpus.shape[1]
        one_hot = np.zeros((N, C, vocab_size), dtype=np.int32)
        for idx_0, word_ids in enumerate(corpus):
            for idx_1, word_id in enumerate(word_ids):
                one_hot[idx_0, idx_1, word_id] = 1

    return one_hot

In [None]:
one_hot_vec = convert_one_hot(courpas, len(courpas))

### svdによる次元削減

In [111]:
import matplotlib.pyplot as plt
import numpy as np

### 正の相互情報量

In [51]:
def ppmi(C, verbose=False, eps=1e-8):
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    cnt = 0
    
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j]*S[i]) + eps)
            M[i,j] = max(0,pmi)
            
            if verbose:
                cnt += 1
                if cnt % (total//100) == 0:
                    print('%.1f%% done' % (100*cnt/total))
    return M

### コーパスの生成(優先度1)

In [9]:
def PreProcess(text):
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')
    
    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
    courpas = np.array([word_to_id[w] for w in words])
    return courpas, word_to_id, id_to_word

In [10]:
x_d, x_t, y_d, y_t = train_test_split(data, target, train_size=0.8, random_state=100) 

In [11]:
data_text = ""
for i in range(len(x_d)):
    data_text += x_d[i]

In [12]:
courpas, word_to_id, id_to_word = PreProcess(data_text)

### 共起行列

In [3]:
def CreateCoMatrix(courpus, VocabSize, WindowSize=1):
    CourpusSize = len(courpus)
    CoMatrix = np.zeros((VocabSize, VocabSize), dtype=np.int32)
    
    for idx, WordId in enumerate(courpus):
        for i in range(1, WindowSize+1):
            leftIdx = idx -1
            rightIdx = idx + 1
            
            if leftIdx > 0:
                left_word_id = courpus[leftIdx]
                CoMatrix[WordId, left_word_id] += 1
                
            if rightIdx < CourpusSize:
                right_word_id = courpus[rightIdx]
                CoMatrix[WordId, right_word_id] += 1
                
    return CoMatrix

### コサイン類似度

In [5]:
def cosSimilarity(x,y,eps=1e-8):
    nx = x / np.sqrt(np.sum(x**2) + eps)
    ny = y / np.sqrt(np.sum(y**2) + eps)
    return np.dot(nx,ny)

### 類似度の高い単語

In [40]:
def MostSimilarity(query, word_to_id, id_to_word, word_matrix, top=5):
    
    if query not in word_to_id:
        print("%s is not found." % query)
        return
    print("\n[query]" + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]
    
    vacab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cosSimilarity(word_matrix[i], query_vec)
    
    count = 0
    for i in (-1 * similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))
        
        count += 1
        if count >= top:
            return

In [18]:
print(word_to_id)

{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}


## 実際の処理

In [54]:
text = "You say goodbye and I say hello."

In [42]:
text1 = data[1]

In [55]:
courpas, word_to_id, id_to_word = PreProcess(text)
vocab_size = len(word_to_id)
C = CreateCoMatrix(courpas, vocab_size)

In [56]:
C

array([[0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 1, 0],
       [0, 1, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0]], dtype=int32)

In [46]:
MostSimilarity('you', word_to_id, id_to_word, C, top=5)


[query]you
 goodbye: 0.7071067758832467
 i: 0.7071067758832467
 hello: 0.7071067758832467
 say: 0.0
 and: 0.0


In [58]:
w = ppmi(C)

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


In [60]:
U, S, V = np.linalg.svd(w)

In [61]:
print(C[0])

[0 1 0 0 0 0 0]


In [62]:
print(w[0])

[ 0. inf  0.  0.  0.  0.  0.]


In [63]:
print(U[0])

[nan  0.  0.  0.  0.  0. nan]


In [64]:
print(U[0,:2])

[nan  0.]


### Word-to-Vecを使ってみた

In [65]:
import numpy as np

In [67]:
c = np.array([1,0,0,0,0,0,0])
w = np.random.randn(7,3)
h = np.dot(c,w)
print(c)
print(w)
print(h)

[1 0 0 0 0 0 0]
[[ 0.46411331 -0.56022573 -1.31550203]
 [-0.86822163  0.46399826 -0.20555579]
 [ 1.09060313 -0.39581706  0.14605464]
 [-1.60314494  1.41141566 -1.29440288]
 [ 0.81284643 -0.74680453  0.61240945]
 [ 0.25651636 -1.42194509  0.31433607]
 [-0.34634716 -0.07689601  1.67753207]]
[ 0.46411331 -0.56022573 -1.31550203]


In [78]:
# サンプルのコンテキストデータ
c0 = np.array([[1, 0, 0, 0, 0, 0, 0]])
c1 = np.array([[0, 0, 1, 0, 0, 0, 0]])

# 重みの初期化
w_in = np.random.randn(7,3)
w_out = np.random.randn(3,7)

# レイヤの生成
in_layer0 = MatMul(w_in)
in_layer1 = MatMul(w_in) 
out_layer = MatMul(w_out)

In [79]:
# 順伝播
h0 = in_layer0.forward(c0) 
h1 = in_layer1.forward(c1) 
h = 0.5 * (h0 + h1)
s = out_layer.forward(h)

In [80]:
print(s)

[[ 0.01229471  1.66198017  0.50338968  0.52212529 -0.34905706  0.5688527
  -0.72771326]]


In [84]:
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = PreProcess(text) 

In [85]:
print(corpus)

[0 1 2 3 4 1 5 6]


In [86]:
print(word_to_id)

{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}


In [87]:
print(id_to_word)

{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


### データセットの準備

In [105]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(categories="auto", sparse=False)


In [101]:
def create_conotext_target(corpus, window_size=1):
    target = corpus[window_size:-window_size]
    context = []
    
    for idx in range(window_size, len(corpus)-window_size):
        cs = []
        for t in range(-window_size, window_size + 1):
            if t == 0:
                continue
            cs.append(corpus[idx + t])
        context.append(cs)
        
    return np.array(context), np.array(target)

In [102]:
contexts, target = create_conotext_target(corpus, window_size=1)

In [103]:
print(contexts)

[[0 2]
 [1 3]
 [2 4]
 [3 1]
 [4 5]
 [1 6]]


In [104]:
print(target)

[1 2 3 4 1 5]


In [108]:
vocab_size = len(word_to_id)
vocab_size

7

In [109]:
word_to_id

{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}