### TF-IDFを計算する
文書中に含まれる単語の重要度を評価する手法
$$
\begin{eqnarray}
\text{tf-idf}(w, d) &=& \mathrm{tf}(w, d) \times \mathrm{idf}(w)\\
                               &=& (\text{単語 } w \text{ の文書 } d \text{ 中での出現回数}) \times \log \frac{\text{全文書数}}{\text{単語 } w \text{ が出現する文書数}}
                               \end{eqnarray}
$$

In [2]:
# NOTE: the PyPI package is named "scikit-learn"; "sklearn" is a deprecated alias.
# %pip install scikit-learn

Collecting sklearn
  Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Collecting scikit-learn (from sklearn)
  Downloading https://files.pythonhosted.org/packages/a0/c5/d2238762d780dde84a20b8c761f563fe882b88c5a5fb03c056547c442a19/scikit_learn-0.21.3-cp36-cp36m-manylinux1_x86_64.whl (6.7MB)
[K    100% |████████████████████████████████| 6.7MB 149kB/s ta 0:00:01  0% |                                | 20kB 98kB/s eta 0:01:08
[?25hCollecting scipy>=0.17.0 (from scikit-learn->sklearn)
  Downloading https://files.pythonhosted.org/packages/29/50/a552a5aff252ae915f522e44642bb49a7b7b31677f9580cfd11bcc869976/scipy-1.3.1-cp36-cp36m-manylinux1_x86_64.whl (25.2MB)
[K    100% |████████████████████████████████| 25.2MB 48kB/s eta 0:00:01   23% |███████▋                        | 6.0MB 315kB/s eta 0:01:01    41% |█████████████▏                  | 10.3MB 367kB/s eta 0:00:41
[?25hCollecting joblib>=0.11 (from sciki

### SQLite用関数定義

In [3]:
import json

from sklearn.feature_extraction.text import TfidfVectorizer

import sqlite3

# Shared SQLite connection used by the functions below; stays None until
# connect() assigns it.
conn = None


def connect(db_path='./sample.db'):
    """Open a SQLite database and keep the connection in the global ``conn``.

    Args:
        db_path: Database location handed to ``sqlite3.connect``. Defaults to
            ``'./sample.db'``, so a bare ``connect()`` call opens the same
            file as before. Any value ``sqlite3.connect`` accepts works,
            e.g. ``':memory:'`` for a throwaway in-memory database.
    """
    global conn
    conn = sqlite3.connect(db_path)


def close():
    """Close the connection held in the global ``conn``.

    Does nothing when ``conn`` is ``None`` (nothing has been stored in it
    yet) instead of failing with ``AttributeError``.
    """
    if conn is not None:
        conn.close()


def create_table():
    """Recreate the ``docs`` table from scratch on the global connection.

    An existing ``docs`` table is dropped first, so any rows stored in it
    are discarded.
    """
    drop_sql = 'DROP TABLE IF EXISTS docs'
    schema_sql = '''CREATE TABLE docs (
            id          INTEGER PRIMARY KEY AUTOINCREMENT,
            content     TEXT,
            meta_info   BLOB,
            sentence    BLOB,
            chunk       BLOB,
            token       BLOB
        )'''
    # Drop before create: the order matters.
    for statement in (drop_sql, schema_sql):
        conn.execute(statement)


def load(values):
    """Bulk-insert rows into ``docs`` and commit.

    Args:
        values: Iterable of ``(content, meta_info)`` pairs, one per row;
            passed unchanged to ``executemany``.
    """
    insert_sql = 'INSERT INTO docs (content, meta_info) VALUES (?,?)'
    conn.executemany(insert_sql, values)
    conn.commit()


def get(doc_id, fl):
    """Fetch selected columns of one document as a dict.

    Args:
        doc_id: Value matched against ``docs.id``.
        fl: Sequence of column names to select. The names are joined into
            the SQL text as-is (they are not bound as ``?`` parameters), so
            they must come from trusted code, never from user input.

    Returns:
        A dict mapping each name in ``fl`` to the value read from the row,
        or an empty dict when no row has the given ``doc_id``.
    """
    row = conn.execute(
        'SELECT {} FROM docs WHERE id = ?'.format(','.join(fl)),
        (doc_id,)).fetchone()
    if row is None:
        # fetchone() returns None for an unknown id; zipping over it would
        # raise TypeError.
        return {}
    return dict(zip(fl, row))


def get_all_ids(limit, offset=0):
    """Return one page of document ids in ascending id order.

    Args:
        limit: Maximum number of ids to return.
        offset: Number of leading ids to skip. Defaults to 0.

    Returns:
        A list holding the ``id`` value of each selected row.
    """
    # Without ORDER BY the row order is unspecified, so consecutive
    # LIMIT/OFFSET pages could overlap or skip ids.
    cursor = conn.execute(
        'SELECT id FROM docs ORDER BY id LIMIT ? OFFSET ?',
        (limit, offset))
    return [record[0] for record in cursor]


def set_annotation(doc_id, name, value):
    """Store ``value`` as JSON text in one column of a document and commit.

    Args:
        doc_id: Value matched against ``docs.id``.
        name: Column to overwrite; placed into the SQL text as-is.
        value: Object serialized with ``json.dumps`` before it is stored.
    """
    update_sql = 'UPDATE docs SET {0} = ? where id = ?'.format(name)
    conn.execute(update_sql, (json.dumps(value), doc_id))
    conn.commit()


def get_annotation(doc_id, name):
    """Load the JSON annotation stored in one column of a document.

    Args:
        doc_id: Value matched against ``docs.id``.
        name: Column to read. It is placed into the SQL text as-is, so it
            must be a trusted column name.

    Returns:
        The value decoded with ``json.loads``, or an empty list when the
        column is NULL or no row has the given ``doc_id``.
    """
    row = conn.execute(
        'SELECT {0} FROM docs WHERE id = ?'.format(name),
        (doc_id,)).fetchone()
    # fetchone() returns None for an unknown id; indexing it would raise
    # TypeError, so treat that case like a missing annotation.
    if row is None or row[0] is None:
        return []
    return json.loads(row[0])