In [6]:
# BoW
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short English sentences.
docs = [
    'The sun is shining',
    'In is raining head',
    'Do not go out in the rain',
]

# Learn the vocabulary and build the document-term count matrix in one step.
count_vec = CountVectorizer()
bag = count_vec.fit_transform(docs)

# NOTE(review): get_feature_names() is the pre-1.0 scikit-learn spelling; newer
# releases provide get_feature_names_out() instead - confirm the installed version.
vocabulary = count_vec.get_feature_names()
print(vocabulary)

# Dense view: one row per sentence, one column per vocabulary entry.
counts = bag.toarray()
print(counts)


['do', 'go', 'head', 'in', 'is', 'not', 'out', 'rain', 'raining', 'shining', 'sun', 'the']
[[0 0 0 0 1 0 0 0 0 1 1 1]
 [0 0 1 1 1 0 0 0 1 0 0 0]
 [1 1 0 1 0 1 1 1 0 0 0 1]]


In [8]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Same three sentences as the bag-of-words example, weighted with TF-IDF this time.
docs = [
    'The sun is shining',
    'In is raining head',
    'Do not go out in the rain',
]

tfidf_vec = TfidfVectorizer()
bag = tfidf_vec.fit_transform(docs)

# Vocabulary first, then the dense matrix of per-document weights.
feature_names = tfidf_vec.get_feature_names()
weights = bag.toarray()
print(feature_names)
print(weights)

['do', 'go', 'head', 'in', 'is', 'not', 'out', 'rain', 'raining', 'shining', 'sun', 'the']
[[ 0.          0.          0.          0.          0.42804604  0.          0.
   0.          0.          0.5628291   0.5628291   0.42804604]
 [ 0.          0.          0.5628291   0.42804604  0.42804604  0.          0.
   0.          0.5628291   0.          0.          0.        ]
 [ 0.40301621  0.40301621  0.          0.30650422  0.          0.40301621
   0.40301621  0.40301621  0.          0.          0.          0.30650422]]


In [15]:
import requests
import zipfile
from sklearn.feature_extraction.text import TfidfVectorizer

url = 'http://www.lighthouse-w5.com/python/data/testdata.zip'

tfidf_vec = TfidfVectorizer()
docs = []

# Stream the archive so it is written to disk chunk by chunk.
res = requests.get(url, stream=True)
# Fail loudly on an HTTP error instead of silently skipping the rest of the
# cell, which would leave `docs` empty and a stale `bag` from an earlier cell.
res.raise_for_status()
with open('testdata.zip', 'wb') as file:
    for chunk in res.iter_content(chunk_size = 2014):
        file.write(chunk)

# Each archive member is read in full and decoded as Shift_JIS: one document per member.
with zipfile.ZipFile('testdata.zip', 'r') as myzip:
    for info in myzip.infolist():
        with myzip.open(info.filename) as myfile:
            docs.append(myfile.read().decode('shift_jis'))

bag = tfidf_vec.fit_transform(docs)

print(tfidf_vec.get_feature_names())

print(bag.toarray())

['access', 'and', 'any', 'applications', 'basic', 'book', 'courses', 'crawling', 'data', 'day', 'every', 'for', 'format', 'from', 'graduate', 'in', 'intended', 'introductory', 'is', 'learn', 'learning', 'level', 'machine', 'mechanics', 'methods', 'of', 'range', 'scrape', 'scraping', 'several', 'source', 'storing', 'support', 'teaches', 'techniques', 'the', 'to', 'today', 'undergraduate', 'underlies', 'unlimited', 'upper', 'use', 'we', 'web', 'you']
[[ 0.23898318  0.19281012  0.47796636  0.          0.          0.          0.
   0.23898318  0.19281012  0.          0.          0.          0.23898318
   0.23898318  0.          0.19281012  0.          0.          0.
   0.19281012  0.          0.          0.          0.          0.          0.
   0.          0.          0.19281012  0.          0.23898318  0.          0.
   0.          0.23898318  0.          0.19281012  0.          0.          0.
   0.23898318  0.          0.          0.          0.38562025  0.        ]
 [ 0.          0.   

In [18]:
# BoW
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words again, this time with scikit-learn's built-in English stop-word list.
docs = [
    'The sun is shining',
    'In is raining head',
    'Do not go out in the rain',
]

count_vec = CountVectorizer(stop_words = 'english')
bag = count_vec.fit_transform(docs)

# Remaining vocabulary after stop-word removal, followed by the dense counts.
remaining_terms = count_vec.get_feature_names()
print(remaining_terms)
dense_counts = bag.toarray()
print(dense_counts)

['head', 'rain', 'raining', 'shining', 'sun']
[[0 0 0 1 1]
 [1 0 1 0 0]
 [0 1 0 0 0]]


In [29]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk.stem

# Shared English Snowball stemmer used by the analyzer built below.
stemmer = nltk.stem.SnowballStemmer('english')


class StemmedCountVectornizer(CountVectorizer):
    """CountVectorizer variant whose analyzer stems every token it emits."""

    def build_analyzer(self):
        """Wrap the inherited analyzer so that each token is passed through `stemmer`.

        Returns a callable that takes one document and returns a generator of
        stemmed tokens.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return (stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_analyzer
    
# Three sample sentences; 'raining' and 'rain' share one stem.
docs = [
    'The sun is shining',
    'In is raining head',
    'Do not go out in the rain',
]

# Count stemmed tokens, with English stop words removed.
count_vec = StemmedCountVectornizer(stop_words = 'english')
bag = count_vec.fit_transform(docs)

stems = count_vec.get_feature_names()
print(stems)

stem_counts = bag.toarray()
print(stem_counts)


['head', 'rain', 'shine', 'sun']
[[0 0 1 1]
 [1 1 0 0]
 [0 1 0 0]]


In [32]:
import requests
import zipfile
from sklearn.feature_extraction.text import TfidfVectorizer

# English Snowball stemmer shared by the analyzer below.
# NOTE: `nltk.stem` is not imported in this cell; it relies on the
# `import nltk.stem` executed in an earlier cell.
stemmer = nltk.stem.SnowballStemmer('english')


class StemmedTfidfVectornizer(TfidfVectorizer):
    """TfidfVectorizer variant whose analyzer stems every token it emits."""

    def build_analyzer(self):
        """Return the inherited analyzer wrapped so that each token is stemmed.

        The returned callable accepts one document and yields its tokens after
        `stemmer.stem` has been applied to each of them.
        """
        inherited = super().build_analyzer()

        def analyze_and_stem(doc):
            return (stemmer.stem(word) for word in inherited(doc))

        return analyze_and_stem

url = 'http://www.lighthouse-w5.com/python/data/testdata.zip'

tfidf_vec = StemmedTfidfVectornizer(stop_words = 'english')
docs = []

# Stream the archive so it is written to disk chunk by chunk.
res = requests.get(url, stream=True)
# Raise on an HTTP error rather than silently skipping everything below, which
# would leave `docs` empty and keep showing a `bag` computed by an earlier cell.
res.raise_for_status()
with open('testdata.zip', 'wb') as file:
    for chunk in res.iter_content(chunk_size = 2014):
        file.write(chunk)

# One document per archive member, decoded from Shift_JIS.
with zipfile.ZipFile('testdata.zip', 'r') as myzip:
    for info in myzip.infolist():
        with myzip.open(info.filename) as myfile:
            docs.append(myfile.read().decode('shift_jis'))

bag = tfidf_vec.fit_transform(docs)

print(tfidf_vec.get_feature_names())

print(bag.toarray())


['access', 'applic', 'basic', 'book', 'cours', 'crawl', 'data', 'day', 'format', 'graduat', 'intend', 'introductori', 'learn', 'level', 'machin', 'mechan', 'method', 'rang', 'scrape', 'sourc', 'store', 'support', 'teach', 'techniqu', 'today', 'under', 'undergradu', 'unlimit', 'upper', 'use', 'web']
[[ 0.31590422  0.          0.          0.          0.          0.31590422
   0.25486954  0.          0.31590422  0.          0.          0.
   0.17797493  0.          0.          0.          0.          0.
   0.21156474  0.31590422  0.          0.          0.          0.31590422
   0.          0.          0.          0.31590422  0.          0.
   0.50973908]
 [ 0.          0.          0.49389914  0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.49389914  0.          0.          0.33077001  0.          0.
   0.          0.49389914  0.          0.          0.          0.          0.
   0.          0.          

In [47]:
from janome.tokenizer import Tokenizer

# Sample news paragraph used to show janome's morphological analysis.
article = '第９９回全国高校野球選手権大会の第４日（１１日）に初戦を迎える横浜（神奈川）の増田珠（しゅう）中堅手（３年）は９日の午前１１時２分、兵庫県伊丹市の練習場で、いつもの年と同じように黙とうをささげる。自身は長崎市出身で、祖母の久美子さん（７４）は広島で被爆。出身地と関東での「原爆の日」への意識の差に戸惑いながら、「長崎で起きたことを伝えていかなくては」と甲子園で思いを新たにしている。(毎日新聞)'

t = Tokenizer()
token = t.tokenize(article)

# Print each token on its own line.
for morpheme in token:
    print(morpheme)

第	接頭詞,数接続,*,*,*,*,第,ダイ,ダイ
９	名詞,数,*,*,*,*,９,キュウ,キュー
９	名詞,数,*,*,*,*,９,キュウ,キュー
回	名詞,接尾,助数詞,*,*,*,回,カイ,カイ
全国	名詞,一般,*,*,*,*,全国,ゼンコク,ゼンコク
高校	名詞,一般,*,*,*,*,高校,コウコウ,コーコー
野球	名詞,一般,*,*,*,*,野球,ヤキュウ,ヤキュー
選手権	名詞,一般,*,*,*,*,選手権,センシュケン,センシュケン
大会	名詞,一般,*,*,*,*,大会,タイカイ,タイカイ
の	助詞,連体化,*,*,*,*,の,ノ,ノ
第	接頭詞,数接続,*,*,*,*,第,ダイ,ダイ
４	名詞,数,*,*,*,*,４,ヨン,ヨン
日	名詞,接尾,助数詞,*,*,*,日,ニチ,ニチ
（	記号,括弧開,*,*,*,*,（,（,（
１	名詞,数,*,*,*,*,１,イチ,イチ
１	名詞,数,*,*,*,*,１,イチ,イチ
日	名詞,接尾,助数詞,*,*,*,日,ニチ,ニチ
）	記号,括弧閉,*,*,*,*,）,）,）
に	助詞,格助詞,一般,*,*,*,に,ニ,ニ
初戦	名詞,一般,*,*,*,*,初戦,ショセン,ショセン
を	助詞,格助詞,一般,*,*,*,を,ヲ,ヲ
迎える	動詞,自立,*,*,一段,基本形,迎える,ムカエル,ムカエル
横浜	名詞,固有名詞,地域,一般,*,*,横浜,ヨコハマ,ヨコハマ
（	記号,括弧開,*,*,*,*,（,（,（
神奈川	名詞,固有名詞,地域,一般,*,*,神奈川,カナガワ,カナガワ
）	記号,括弧閉,*,*,*,*,）,）,）
の	助詞,連体化,*,*,*,*,の,ノ,ノ
増田	名詞,固有名詞,人名,姓,*,*,増田,マスダ,マスダ
珠	名詞,一般,*,*,*,*,珠,タマ,タマ
（	記号,括弧開,*,*,*,*,（,（,（
しゅう	名詞,一般,*,*,*,*,しゅう,シュウ,シュー
）	記号,括弧閉,*,*,*,*,）,）,）
中堅	名詞,一般,*,*,*,*,中堅,チュウケン,チューケン
手	名詞,接尾,一般,*,*,*,手,シュ,シュ
（	記号,括弧開,*,*,*,*,（,（,（
３	名詞,数,*,*,*,*,３,サン,サン
年	名詞,接尾,助数詞,*,*,*,年,ネン,ネン
）	記号,括弧閉,*,*,*

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
from janome.tokenizer import Tokenizer

def extract_words(text):
    """Tokenize `text` with janome and return the tokens' surface forms as a list.

    The Tokenizer is created on the first call and reused afterwards. This
    function is called once per document when used as a CountVectorizer
    analyzer, so building a new Tokenizer on every call would repeat its
    set-up work for each document.
    """
    tokenizer = getattr(extract_words, '_tokenizer', None)
    if tokenizer is None:
        tokenizer = extract_words._tokenizer = Tokenizer()
    return [token.surface for token in tokenizer.tokenize(text)]

# Three short Japanese sentences; tokenisation is delegated to extract_words.
docs = [
    '今日はとても良い天気です。',
    '彼女は美しく魅力的です。',
    '展望台からの景色は美しかった。',
]

# A callable analyzer replaces scikit-learn's own preprocessing and tokenisation.
jvectorizer = CountVectorizer(analyzer=extract_words)
bag = jvectorizer.fit_transform(docs)

surface_forms = jvectorizer.get_feature_names()
print(surface_forms)

surface_counts = bag.toarray()
print(surface_counts)

['。', 'から', 'た', 'です', 'とても', 'の', 'は', '今日', '台', '天気', '展望', '彼女', '景色', '的', '美しかっ', '美しく', '良い', '魅力']
[[1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 0 1 0]
 [1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 1 0 1]
 [1 1 1 0 0 1 1 0 1 0 1 0 1 0 1 0 0 0]]


In [60]:
from sklearn.feature_extraction.text import CountVectorizer
from janome.tokenizer import Tokenizer

def extract_words2(text):
    """Return the surface forms of the noun and verb tokens found in `text`.

    A token is kept when its part-of-speech tag contains '名詞' anywhere, or
    starts with '動詞'. The leading-position test for '動詞' keeps tags such as
    '助動詞' out.

    The janome Tokenizer is created on the first call and reused afterwards,
    because this function runs once per document when used as a
    CountVectorizer analyzer.
    """
    tokenizer = getattr(extract_words2, '_tokenizer', None)
    if tokenizer is None:
        tokenizer = extract_words2._tokenizer = Tokenizer()

    words = []
    for word in tokenizer.tokenize(text):
        pos = word.part_of_speech
        # NOTE(review): the substring test also accepts tags that merely contain
        # '名詞' (e.g. a prefix tag like '接頭詞,名詞接続') - confirm this is intended.
        if '名詞' in pos or pos.startswith('動詞'):
            words.append(word.surface)
    return words

# Sample sentences for the noun/verb-only analyzer (surface forms).
docs = [
    '雪が降り寒さが厳しい夜だ',
    '資料には検討事項などの内容が記載されている・',
    'その時、雨が降って来た',
]

# Count only what extract_words2 returns for each sentence.
vectorizer = CountVectorizer(analyzer=extract_words2)
bag = vectorizer.fit_transform(docs)

kept_words = vectorizer.get_feature_names()
kept_counts = bag.toarray()
print(kept_words)
print(kept_counts)

['いる', 'さ', 'れ', '事項', '内容', '夜', '時', '来', '検討', '記載', '資料', '降っ', '降り', '雨', '雪']
[[0 1 0 0 0 1 0 0 0 0 0 0 1 0 1]
 [1 1 1 1 1 0 0 0 1 1 1 0 0 0 0]
 [0 0 0 0 0 0 1 1 0 0 0 1 0 1 0]]


In [62]:
from sklearn.feature_extraction.text import CountVectorizer
from janome.tokenizer import Tokenizer

def extract_words3(text):
    """Return nouns as surface forms and verbs as base (dictionary) forms.

    A token whose part-of-speech tag contains '名詞' contributes its surface
    form. Otherwise, a token whose tag starts with '動詞' contributes its
    base form, so conjugated verbs collapse to a single entry. Other tokens
    are dropped.

    The janome Tokenizer is created on the first call and reused afterwards,
    because this function runs once per document when used as a
    CountVectorizer analyzer.
    """
    tokenizer = getattr(extract_words3, '_tokenizer', None)
    if tokenizer is None:
        tokenizer = extract_words3._tokenizer = Tokenizer()

    words = []
    for word in tokenizer.tokenize(text):
        pos = word.part_of_speech
        if '名詞' in pos:
            words.append(word.surface)
        elif pos.startswith('動詞'):
            words.append(word.base_form)
    return words

docs = [
    '雪が降り寒さが厳しい夜だ',
    '資料には検討事項などの内容が記載されている・',
    'その時、雨が降って来た'
]

# Use the analyzer defined in this cell. Passing extract_words2 here would
# count surface forms instead of the verb base forms this cell demonstrates.
vectorizer = CountVectorizer(analyzer=extract_words3)
bag = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names())
print(bag.toarray())

['いる', 'さ', 'する', 'れる', '事項', '内容', '夜', '時', '来る', '検討', '記載', '資料', '降る', '雨', '雪']
[[0 1 0 0 0 0 1 0 0 0 0 0 1 0 1]
 [1 0 1 1 1 1 0 0 0 1 1 1 0 0 0]
 [0 0 0 0 0 0 0 1 1 0 0 0 1 1 0]]


In [110]:
import requests
import os.path
from zipfile import ZipFile
import pandas as pd

# Archive to analyse. It is fetched, and cached on disk, by download_file(url)
# further down, so no eager requests.get() is issued here.
url = 'http://www.aozora.gr.jp/cards/000148/files/789_ruby_5639.zip'

def download_file(url):
    """Download `url` into the working directory, reusing an existing local copy.

    The local file name is the last path segment of the URL with any query
    string removed. Whether that file already exists is printed; if it does,
    no request is made.

    The body is streamed to a temporary '<name>.part' file and renamed only
    once complete, so an interrupted download is never mistaken for a cached
    file on the next run.

    Returns:
        The local file name.

    Raises:
        ValueError: the URL ends with '/' so no file name can be derived.
        requests.HTTPError: the server did not answer with HTTP 200.
    """
    filename = url.rsplit('/', 1)[1].split('?')[0]
    if not filename:
        raise ValueError('no file name in URL: {!r}'.format(url))

    already_there = os.path.exists(filename)
    print(already_there)
    if already_there:
        return filename

    partial = filename + '.part'
    res = requests.get(url, stream=True)
    try:
        if res.status_code != 200:
            # Returning a name for a file that was never written would only
            # fail later, and less clearly, when the caller opens it.
            raise requests.HTTPError(
                'download failed with HTTP status {}: {}'.format(res.status_code, url),
                response=res)
        with open(partial, 'wb') as file:
            for chunk in res.iter_content(chunk_size = 1024):
                file.write(chunk)
    finally:
        res.close()
    os.replace(partial, filename)
    return filename

def extract_words3(text):
    """Return nouns as surface forms and verbs as base (dictionary) forms.

    Tokens whose part-of-speech tag contains '名詞' are kept as written.
    Tokens whose tag starts with '動詞' are replaced by their base form.
    Everything else is discarded.

    The janome Tokenizer is created on the first call and reused afterwards,
    so that analysing several documents does not rebuild it each time.
    """
    tokenizer = getattr(extract_words3, '_tokenizer', None)
    if tokenizer is None:
        tokenizer = extract_words3._tokenizer = Tokenizer()

    words = []
    for word in tokenizer.tokenize(text):
        tag = word.part_of_speech
        is_noun = '名詞' in tag
        # The noun test takes precedence, as in an if/elif chain.
        if is_noun or tag.startswith('動詞'):
            words.append(word.surface if is_noun else word.base_form)
    return words

# One decoded string per archive member ends up in `texts`.
texts = []

# Fetch the archive (or reuse the local copy) and show which file is used.
filename = download_file(url)
print(filename)

with ZipFile(filename, 'r') as dlzip:
    for info in dlzip.infolist():
        with dlzip.open(info.filename) as member:
            print(info)
            # Members are decoded as Shift_JIS text.
            texts.append(member.read().decode('shift_jis'))
        
vectorizer = CountVectorizer(analyzer=extract_words3)
bag = vectorizer.fit_transform(texts)

# Total occurrences of every vocabulary entry, summed over all documents.
# With a single text file this is exactly that file's counts. With several
# members, one transposed column per document would no longer fit a fixed
# two-column ['count', 'word'] layout.
words = vectorizer.get_feature_names()
data = pd.DataFrame({
    'count': bag.toarray().sum(axis=0),
    'word': words,
})

# Words that occur at least 10 times, most frequent first.
data.loc[data['count'] >= 10, ['word','count']].sort_values(by='count', ascending = False)

True
789_ruby_5639.zip
<ZipInfo filename='wagahaiwa_nekodearu.txt' compress_type=deflate external_attr=0x20 file_size=748959 compress_size=350260>


Unnamed: 0,word,count
1295,する,4127
320,いる,2020
1797,の,1645
3509,云う,1399
3486,事,1205
1743,なる,1157
209,ある,1099
2396,もの,1004
3416,主人,934
4858,君,905
