# Imports

In [2]:
import csv
import json
import sys


import numpy


from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2


import shelfy

##### Create SQL DB connection

In [3]:
# Define a database name (we're using a dataset on births, so we'll call it birth_db)
# Set your postgres username/password, and connection specifics
username = 'postgres'
password = 'password'     # change this
host     = 'localhost'
port     = '5432'            # default port that postgres listens on
db_name  = 'book_info'




## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine( 'postgresql://{}:{}@{}:{}/{}'.format(username, password, host, port, db_name) )
print(engine.url)


## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))



# Create connection and cursor object to insert info into db
con = psycopg2.connect(database = db_name, user = username, password = password, host = host)
cursor = con.cursor()

postgresql://postgres:password@localhost:5432/book_info
True


##### Create titles table

In [4]:
# Create the tables (if don't exist)
cursor.execute('''CREATE TABLE IF NOT EXISTS works (
                index BIGSERIAL PRIMARY KEY,
                titles TEXT);''')

cursor.execute('''CREATE TABLE IF NOT EXISTS editions (
                index BIGSERIAL PRIMARY KEY,
                titles TEXT);''')

cursor.execute('''CREATE TABLE IF NOT EXISTS authors (
                index BIGSERIAL PRIMARY KEY,
                authors TEXT);''')

cursor.execute('''CREATE TABLE IF NOT EXISTS publishers (
                index BIGSERIAL PRIMARY KEY,
                publishers TEXT);''')

cursor.execute('''CREATE TABLE IF NOT EXISTS words (
                index BIGSERIAL PRIMARY KEY,
                word TEXT,
                idf real);''')


# Have to commit the table creation
con.commit()

##### Fill titles

# Works
command = '''INSERT INTO works (titles) VALUES (%s);'''


titles_path = shelfy.SHELFY_BASE_PATH + '/raw_data/dumps/' + 'ol_dump_works_2017-12-31.txt'


num_fails = 0
with open(titles_path, 'r') as file_handle:
    
    for row in file_handle:
        try:
            title = json.loads(row.split('\t')[-1])['title']
            cursor.execute(command, (title,))
            con.commit()
        except:
            num_fails += 1
            print('failed', num_fails)
            pass

##### Fill editions

# Editions

command = '''INSERT INTO editions (titles) VALUES (%s);'''


titles_path = shelfy.SHELFY_BASE_PATH + '/raw_data/dumps/' + 'ol_dump_editions_2017-12-31.txt'





num_fails = 0
with open(titles_path, 'r') as file_handle:
    
    
    i = 0
    # Open reader object to parse file
    for row in file_handle:
        
        i += 1
        
        try:
            title = json.loads(row.split('\t')[4])['title']
            cursor.execute(command, (title,))
            
        except:
            num_fails += 1
            print('failed', num_fails)
            pass
        
        if i % 10000 == 0:
            print('dumping 100000')
            con.commit()
            i = 0


##### Fill authors

# Editions

command = '''INSERT INTO authors (authors) VALUES (%s);'''


titles_path = shelfy.SHELFY_BASE_PATH + '/raw_data/dumps/' + 'ol_dump_authors_2017-12-31.txt'


num_fails = 0
with open(titles_path, 'r') as file_handle:
    
    
    i = 0
    # Open reader object to parse file
    for row in file_handle:
        
        i += 1
        
        try:
            author = json.loads(row.split('\t')[4])['name']
            cursor.execute(command, (author,))
            
        except:
            num_fails += 1
            print('failed', num_fails)
        
        if i % 100000 == 0:
            print('dumping 100000')
            con.commit()
            i = 0

##### Fill publishers

In [None]:
# Publishers

command = '''INSERT INTO publishers (publishers) VALUES (%s);'''


titles_path = shelfy.SHELFY_BASE_PATH + '/raw_data/dumps/' + 'ol_dump_editions_2017-12-31.txt'


num_fails = 0
with open(titles_path, 'r') as file_handle:
    
    
    i = 0
    # Open reader object to parse file
    for row in file_handle:
        
        i += 1
        
        try:
            publisher = json.loads(row.split('\t')[4])['publishers'][0]
            
            cursor.execute(command, (publisher,))
            
        except:
            num_fails += 1
            #print('failed', num_fails)
        
        if i % 100000 == 0:
            
            print('dumping 100000')
            con.commit()
            i = 0

## TF-IDF

In [6]:
import sklearn.feature_extraction.text

import csv
import json
import sys


import numpy


from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2


import shelfy

In [7]:
# Define a database name (we're using a dataset on births, so we'll call it birth_db)
# Set your postgres username/password, and connection specifics
username = 'postgres'
password = 'password'     # change this
host     = 'localhost'
port     = '5432'            # default port that postgres listens on
db_name  = 'book_info'




## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine( 'postgresql://{}:{}@{}:{}/{}'.format(username, password, host, port, db_name) )
print(engine.url)


## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))



# Create connection and cursor object to insert info into db
con = psycopg2.connect(database = db_name, user = username, password = password, host = host)
cursor = con.cursor()




postgresql://postgres:password@localhost:5432/book_info
True


In [13]:
# Get works
command = 'SELECT titles FROM works;'
cursor.execute(command)
works = cursor.fetchall()
works = [work[0] for work in works]

Le Contrat réalisable
Combating Surgical Infection
Chirurgie implantaire
One coffee with
La 7e porte
Les gens de la vallee
Le Cerveau bleu
A la recherche de l'âme de mon père
Hématologie et soins infirmiers
L' apprentissage du vocabulaire médical
L'Agartha? mythe ou réalité?
Football
Livre de la belote (le)
Die Moxa - Therapie. Wärmepunktur - Eine klassische chinesische Heilmethode
Dominique delise
The man from Thrush
Offrandes De La Mer
La grande encyclopédie du dérisoire, tome 1
W swiecie polszczyzny
Guide des sources d'information
Invention du hottentot histoire du regard occidental sur les khoisan XV-XIX
De philippe auguste a la mort de charles V
Choisir et poser portes et fenêtres
Manufactura Justo a Tiempo - Enfoque Practico
De natura
Der Korb
Le travail du sucre, tome 2
Les défis de la travailleuse familiale
Jean Vigo
Thaïlande
Locations meublées et locations saisonnières
Donna Parker on her Own
Analyse statistique des donnees experimentales
Le nouveau menoza
The Study of Africa

"\n# Get editions\ncommand = 'SELECT titles FROM editions;'\ncursor.execute(command)\nworks = cursor.fetchall()\n\n# Get authors\ncommand = 'SELECT titles FROM editions;'\ncursor.execute(command)\nauthors = cursor.fetchall()\n\n# Get publishers\ncommand = 'SELECT titles FROM editions;'\ncursor.execute(command)\npublishers = cursor.fetchall()\n"

In [None]:
for i in range(100):
    print(works[i])

In [9]:
tfidf = sklearn.feature_extraction.text.TfidfVectorizer(works, strip_accents = 'ascii')

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8',
        input=[('Le Contrat réalisable',), ('Combating Surgical Infection',), ('Chirurgie implantaire',), ('One coffee with',), ('La 7e porte',), ('Les gens de la vallee',), ('Le Cerveau bleu',), ("A la recherche de l'âme de mon père",), ('Hématologie et soins infirmiers',), ("L' apprentissage du vocabulair...hie',), ('Babel Ouest',), ('La promise',), ('Le Peintre et le Pirate',), ('Collage de serviettes',)],
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
print(works[0])

('Le Contrat réalisable',)
