# fetch_20newsgroupsのデータセットを使って文章分類

In [19]:
pip install nltk

Collecting nltk
  Downloading nltk-3.4.5.zip (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 682 kB/s eta 0:00:01    |▊                               | 30 kB 376 kB/s eta 0:00:04     |██████▊                         | 307 kB 682 kB/s eta 0:00:02
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Created wheel for nltk: filename=nltk-3.4.5-py3-none-any.whl size=1449904 sha256=9ce6ff625db4cb87c650bc8070993f9a036d305980349d68194ad49228c706e4
  Stored in directory: /home/jovyan/.cache/pip/wheels/48/8b/7f/473521e0c731c6566d631b281f323842bbda9bd819eb9a3ead
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.4.5
Note: you may need to restart the kernel to use updated packages.


In [9]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import StandardScaler 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
NewsGroup = fetch_20newsgroups()

In [12]:
data = NewsGroup.data
target = NewsGroup.target
target_names = NewsGroup.target_names

In [11]:
NewsGroup.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [13]:
len(data)

11314

In [14]:
len(target)

11314

In [15]:
target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [23]:
NewsGroup.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

### CountVectorizerを使った文章分類

In [21]:
vectorizer = CountVectorizer(min_df=1)

In [26]:
count_text = ["How to format my head disk", " Hard disk format problems "]
x = vectorizer.fit_transform(count_text)
vectorizer.get_feature_names()

['disk', 'format', 'hard', 'head', 'how', 'my', 'problems', 'to']

In [27]:
print(x.toarray().transpose())

[[1 1]
 [1 1]
 [0 1]
 [1 0]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]


### fetch_20newsgroupsを使用

In [19]:
x_d, x_t, y_d, y_t = train_test_split(data, target, train_size=0.8, random_state=100) 

In [22]:
X_train = vectorizer.fit_transform(x_d)
num_sample,num_feature = X_train.shape

In [39]:
print("sample:%d, feature:%d" % (num_sample,num_feature))

sample:9051, feature:116000


In [None]:
X_train.getrow(3).toarray()

In [10]:
import numpy as np

### コーパスの生成

In [8]:
def PreProcess(text):
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')
    
    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
    courpas = np.array([word_to_id[w] for w in words])
    return courpas, word_to_id, id_to_word

### 共起行列

In [3]:
def CreateCoMatrix(courpus, VocabSize, WindowSize=1):
    CourpusSize = len(courpus)
    CoMatrix = np.zeros((VocabSize, VocabSize), dtype=np.int32)
    
    for idx, WordId in enumerate(courpus):
        for i in range(1, WindowSize+1):
            leftIdx = idx -1
            rightIdx = idx + 1
            
            if leftIdx > 0:
                left_word_id = courpus[leftIdx]
                CoMatrix[WordId, left_word_id] += 1
                
            if rightIdx < CourpusSize:
                right_word_id = courpus[rightIdx]
                CoMatrix[WordId, right_word_id] += 1
                
    return CoMatrix

### コサイン類似度

In [5]:
def cosSimilarity(x,y,eps=1e-8):
    nx = x / np.sqrt(np.sum(x**2) + eps)
    ny = y / np.sqrt(np.sum(y**2) + eps)
    return np.dot(nx,ny)

### 類似度の高い単語

In [None]:
def MostSimilarity(query, word_to_id, id_to_word, word_matrix, top=5):
    
    if query not in word_to_id:
        print("%s is not found." % query)
        return
    print("\n[query]" + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

In [18]:
print(word_to_id)

{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}


## 実際の処理

In [6]:
text = "You say goodbye and I say hello."

In [11]:
courpas, word_to_id, id_to_word = PreProcess(text)
vocab_size = len(word_to_id)
C = CreateCoMatrix(courpas, vocab_size)

In [12]:
c_0 = C[word_to_id['you']]
c_1 = C[word_to_id['i']]

In [14]:
print(c_0,c_1)
print(cosSimilarity(c_0,c_1))

[0 1 0 0 0 0 0] [0 1 0 1 0 0 0]
0.7071067758832467


In [15]:
C

array([[0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 1, 0],
       [0, 1, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0]], dtype=int32)

In [16]:
c_2 = C[word_to_id['say']]
print(c_2)
print(cosSimilarity(c_0,c_2))

[0 0 1 0 1 1 0]
0.0
