# Imports

In [1]:
import csv
import json
import sys


import numpy


from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2


import shelfy

##### Create SQL DB connection

In [2]:
# Connection settings for the PostgreSQL database that stores the book metadata.
# Set your postgres username/password, and connection specifics.
# NOTE(review): the password is hard-coded here and echoed by print(engine.url)
# below; consider reading it from the environment before sharing the notebook.
username = 'postgres'
password = 'password'     # change this
host     = 'localhost'
port     = '5432'            # default port that postgres listens on
db_name  = 'book_info'


## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine( 'postgresql://{}:{}@{}:{}/{}'.format(username, password, host, port, db_name) )
print(engine.url)


## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))


# Create connection and cursor object to insert info into db.
# The port is passed explicitly so this connection targets the same server as
# `engine`; previously `port` was ignored here and the driver default was used.
con = psycopg2.connect(database = db_name, user = username, password = password, host = host, port = port)
cursor = con.cursor()

postgresql://postgres:password@localhost:5432/book_info
True


##### Create tables (works, editions, authors, publishers, words)

In [None]:
# One DDL statement per table; IF NOT EXISTS makes each statement a no-op
# when the table is already present.
table_definitions = (
    '''CREATE TABLE IF NOT EXISTS works (
                index BIGSERIAL PRIMARY KEY,
                titles TEXT);''',
    '''CREATE TABLE IF NOT EXISTS editions (
                index BIGSERIAL PRIMARY KEY,
                titles TEXT);''',
    '''CREATE TABLE IF NOT EXISTS authors (
                index BIGSERIAL PRIMARY KEY,
                authors TEXT);''',
    '''CREATE TABLE IF NOT EXISTS publishers (
                index BIGSERIAL PRIMARY KEY,
                publishers TEXT);''',
    '''CREATE TABLE IF NOT EXISTS words (
                index BIGSERIAL PRIMARY KEY,
                word TEXT,
                idf real);''',
)

for ddl in table_definitions:
    cursor.execute(ddl)

# The table creation has to be committed explicitly
con.commit()

##### Fill works

# Works: insert the title of every record in the works dump into the `works` table.
command = '''INSERT INTO works (titles) VALUES (%s);'''


titles_path = shelfy.SHELFY_BASE_PATH + '/raw_data/dumps/' + 'ol_dump_works_2017-12-31.txt'


num_fails = 0
# Each line carries a JSON record, so decode the file as UTF-8 instead of
# relying on the platform's default encoding.
with open(titles_path, 'r', encoding='utf-8') as file_handle:

    for row in file_handle:
        try:
            # The JSON record is the last tab-separated field of the line
            title = json.loads(row.split('\t')[-1])['title']
            cursor.execute(command, (title,))
            con.commit()
        except (ValueError, KeyError, TypeError, IndexError):
            # Malformed JSON, a record without a 'title', or a value the
            # driver refuses client-side: count it and move on.
            num_fails += 1
            print('failed', num_fails)
        except psycopg2.Error:
            # A failed statement leaves the transaction aborted; roll back so
            # the following rows are not rejected as well.
            con.rollback()
            num_fails += 1
            print('failed', num_fails)

##### Fill editions

# Editions: insert the title of every record in the editions dump into the
# `editions` table, committing in batches.

command = '''INSERT INTO editions (titles) VALUES (%s);'''


titles_path = shelfy.SHELFY_BASE_PATH + '/raw_data/dumps/' + 'ol_dump_editions_2017-12-31.txt'


# Rows processed between commits. The loop used to test against 10000 while
# reporting 100000; the value now matches the message and the sibling cells.
commit_every = 100000

num_fails = 0
# Each line carries a JSON record, so decode the file as UTF-8 instead of
# relying on the platform's default encoding.
with open(titles_path, 'r', encoding='utf-8') as file_handle:

    for i, row in enumerate(file_handle, start=1):

        try:
            # The JSON record is the fifth tab-separated field of the line
            title = json.loads(row.split('\t')[4])['title']
            cursor.execute(command, (title,))

        except (ValueError, KeyError, TypeError, IndexError):
            # Short line, malformed JSON, or a record without a 'title'
            num_fails += 1
            print('failed', num_fails)

        except psycopg2.Error:
            # A failed statement aborts the open transaction. Rolling back
            # drops the uncommitted part of this batch, but without it every
            # later INSERT would be rejected as well.
            con.rollback()
            num_fails += 1
            print('failed', num_fails)

        if i % commit_every == 0:
            print('dumping', commit_every)
            con.commit()

# Commit the rows left over after the last full batch
con.commit()


##### Fill authors

# Authors: insert the name of every record in the authors dump into the
# `authors` table, committing in batches.

command = '''INSERT INTO authors (authors) VALUES (%s);'''


titles_path = shelfy.SHELFY_BASE_PATH + '/raw_data/dumps/' + 'ol_dump_authors_2017-12-31.txt'


commit_every = 100000   # rows processed between commits

num_fails = 0
# Each line carries a JSON record, so decode the file as UTF-8 instead of
# relying on the platform's default encoding.
with open(titles_path, 'r', encoding='utf-8') as file_handle:

    for i, row in enumerate(file_handle, start=1):

        try:
            # The JSON record is the fifth tab-separated field of the line
            author = json.loads(row.split('\t')[4])['name']
            cursor.execute(command, (author,))

        except (ValueError, KeyError, TypeError, IndexError):
            # Short line, malformed JSON, or a record without a 'name'
            num_fails += 1
            print('failed', num_fails)

        except psycopg2.Error:
            # A failed statement aborts the open transaction. Rolling back
            # drops the uncommitted part of this batch, but without it every
            # later INSERT would be rejected as well.
            con.rollback()
            num_fails += 1
            print('failed', num_fails)

        if i % commit_every == 0:
            print('dumping', commit_every)
            con.commit()

# Commit the rows left over after the last full batch
con.commit()

##### Fill publishers

In [None]:
# Publishers: insert the first listed publisher of every record in the
# editions dump into the `publishers` table, committing in batches.

command = '''INSERT INTO publishers (publishers) VALUES (%s);'''


titles_path = shelfy.SHELFY_BASE_PATH + '/raw_data/dumps/' + 'ol_dump_editions_2017-12-31.txt'


commit_every = 100000   # rows processed between commits

num_fails = 0
# Each line carries a JSON record, so decode the file as UTF-8 instead of
# relying on the platform's default encoding.
with open(titles_path, 'r', encoding='utf-8') as file_handle:

    for i, row in enumerate(file_handle, start=1):

        try:
            # The JSON record is the fifth tab-separated field of the line;
            # only the first entry of its 'publishers' list is stored.
            publisher = json.loads(row.split('\t')[4])['publishers'][0]

            cursor.execute(command, (publisher,))

        except (ValueError, KeyError, TypeError, IndexError):
            # Short line, malformed JSON, or no usable 'publishers' entry.
            # Failures are only counted here, not printed.
            num_fails += 1

        except psycopg2.Error:
            # A failed statement aborts the open transaction. Rolling back
            # drops the uncommitted part of this batch, but without it every
            # later INSERT would be rejected as well.
            con.rollback()
            num_fails += 1

        if i % commit_every == 0:

            print('dumping', commit_every)
            con.commit()

# Commit the rows left over after the last full batch
con.commit()

## TF-IDF

In [1]:
import sklearn.feature_extraction.text

import csv
import json
import sys


import numpy


from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2


import shelfy

In [2]:
# Connection settings for the PostgreSQL database that stores the book metadata.
# Set your postgres username/password, and connection specifics.
# NOTE(review): the password is hard-coded here and echoed by print(engine.url)
# below; consider reading it from the environment before sharing the notebook.
username = 'postgres'
password = 'password'     # change this
host     = 'localhost'
port     = '5432'            # default port that postgres listens on
db_name  = 'book_info'


## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine( 'postgresql://{}:{}@{}:{}/{}'.format(username, password, host, port, db_name) )
print(engine.url)


## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))


# Create connection and cursor object to read info from the db.
# The port is passed explicitly so this connection targets the same server as
# `engine`; previously `port` was ignored here and the driver default was used.
con = psycopg2.connect(database = db_name, user = username, password = password, host = host, port = port)
cursor = con.cursor()




postgresql://postgres:password@localhost:5432/book_info
True


In [3]:
# Read the whole `titles` column of the works table into a flat list
command = 'SELECT titles FROM works;'
cursor.execute(command)
# Every fetched row is a one-element tuple; unpack it to the bare value
works = [title for (title,) in cursor.fetchall()]

In [7]:
# Fit a token-count model on the first three titles only.
# NOTE(review): later cells read counts[10] and show a vocabulary of 2791
# tokens, which does not match a three-title fit; confirm the intended slice.
sample_titles = works[:3]
vectorizer = sklearn.feature_extraction.text.CountVectorizer(strip_accents='unicode')
counts = vectorizer.fit_transform(sample_titles)

In [9]:
# Show the learned token -> column-index mapping.
# The cell previously looked up the key 'reali', which is not a token in the
# recorded vocabulary and therefore raises KeyError; the recorded output is
# the full mapping, so that is what is printed here.
print(vectorizer.vocabulary_)

{'le': 5, 'contrat': 2, 'realisable': 6, 'combating': 1, 'surgical': 7, 'infection': 4, 'chirurgie': 0, 'implantaire': 3}


In [8]:
# Print the sparse count matrix; each entry is shown as (row, column) followed by its count
print(counts)

  (0, 6)	1
  (0, 2)	1
  (0, 5)	1
  (1, 4)	1
  (1, 7)	1
  (1, 1)	1
  (2, 3)	1
  (2, 0)	1


In [23]:
# Vocabulary tokens as a plain Python list.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() when present and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names[10])
# Row 10 of the count matrix, i.e. the token counts of the eleventh fitted title
print(counts[10])


15
  (0, 2028)	1
  (0, 1795)	1
  (0, 1668)	1
  (0, 99)	1


In [36]:
# Number of entries in the fitted vocabulary mapping
print(len(vectorizer.vocabulary_))

2791


In [34]:
# vocabulary_ maps a token to its column index in the count matrix
print(vectorizer.vocabulary_['le'])


# Select the 'coffee' column across all rows. The index was previously used
# as a row number, which picks an unrelated title or runs out of bounds.
print(type(counts[:, vectorizer.vocabulary_['coffee']]))

1393
<class 'scipy.sparse.csr.csr_matrix'>


In [25]:
# Dump the complete token -> column-index mapping
print(vectorizer.vocabulary_)

{'le': 1393, 'contrat': 521, 'realisable': 2026, 'combating': 497, 'surgical': 2367, 'infection': 1202, 'chirurgie': 445, 'implantaire': 1194, 'one': 1765, 'coffee': 489, 'with': 2669, 'la': 1374, '7e': 60, 'porte': 1919, 'les': 1418, 'gens': 971, 'de': 583, 'vallee': 2558, 'cerveau': 412, 'bleu': 322, 'recherche': 2030, 'ame': 144, 'mon': 1617, 'pere': 1851, 'hematologie': 1098, 'et': 796, 'soins': 2270, 'infirmiers': 1203, 'apprentissage': 181, 'du': 681, 'vocabulaire': 2593, 'medical': 1537, 'agartha': 99, 'mythe': 1668, 'ou': 1795, 'realite': 2028, 'football': 897, 'livre': 1458, 'belote': 280, 'die': 630, 'moxa': 1638, 'therapie': 2438, 'warmepunktur': 2620, 'eine': 716, 'klassische': 1327, 'chinesische': 443, 'heilmethode': 1094, 'dominique': 653, 'delise': 595, 'the': 2432, 'man': 1491, 'from': 920, 'thrush': 2446, 'offrandes': 1755, 'mer': 1562, 'grande': 1020, 'encyclopedie': 747, 'derisoire': 606, 'tome': 2463, 'swiecie': 2373, 'polszczyzny': 1913, 'guide': 1046, 'des': 607, 

In [31]:
# First title exactly as fetched from the database
print(works[0])

Le Contrat réalisable


In [24]:
# Print the whole list of vocabulary tokens
print(feature_names)

['000', '01', '012', '02', '10', '100', '111', '1393', '14', '144', '15', '15th', '16', '17', '1770', '1774', '1775', '1776', '1802', '1831', '1839', '1840', '1886', '1891', '1900', '1919', '1941', '1944', '1947', '1955', '1960', '1968', '1969', '1973', '1979', '1992', '1995', '1999', '19e', '2000', '2002', '2003', '2004', '20th', '21st', '22', '3b', '40', '43', '45', '4v', '50', '500', '59', '64', '66', '71a', '72', '745', '77', '7e', '86', '8b0', '95', '956', '98', '99', '9b0', 'ab', 'abenteuer', 'abordaje', 'about', 'abridgement', 'abwassertechnik', 'ac', 'accedit', 'accesorios', 'acercarse', 'acid', 'across', 'adecuado', 'adentro', 'aditivos', 'admirer', 'adolescente', 'adoro', 'adr', 'advanced', 'advances', 'advents', 'africa', 'afrikanischen', 'afrique', 'afro', 'after', 'agadati', 'again', 'against', 'agape', 'agartha', 'ages', 'agitado', 'agnese', 'agravo', 'agreement', 'agricultural', 'agypten', 'aharonim', 'ahlte', 'ahuvah', 'ahuvati', 'ai', 'aimants', 'aires', 'aisthesis', '

In [15]:
# Encode a one-word document with the already-fitted vectorizer
print(vectorizer.transform(['the']))


  (0, 14560)	1


In [14]:
# Encode another one-word document with the already-fitted vectorizer.
# NOTE(review): the recorded run printed no entries, presumably because the
# word is not in the fitted vocabulary; confirm.
print(vectorizer.transform(['elantris']))


