Imports

In [1]:
import os

import pandas as pd

from models.database import Database
from models.embeddings_model import EmbeddingsModel
from models.job_query import JOBQuery
from utils.downloader import get_glove_vectors
from utils.parser import generate_output_text, generate_input_text
from utils.vectorization import text_vectorization

Generate training-test data

In [4]:
# Build the raw/parsed input texts and the target texts from the query files.
#
# Every entry of the `data/queries` directory (relative to the current working
# directory) is read as one query. For each query this cell collects:
#   * raw_input_texts - the stripped text of the file,
#   * input_texts     - generate_input_text() over the JOBQuery predicates,
#   * target_texts    - generate_output_text() over db.explain_query() rows,
#                       wrapped in '\t' ... '\n'.
dataset = "data/queries"
cwd = os.getcwd()
dataset_dir = os.path.join(cwd, *dataset.split("/"))
# os.listdir() returns entries in arbitrary order; sort so that the order of
# the generated samples is reproducible across machines and kernel restarts.
files = sorted(os.listdir(dataset_dir))

db = Database(collect_db_info=True)
# One "<table>_<column>" label per column known to the database object.
column_array_index = [
    table + "_" + column
    for table, columns in db.tables_attributes.items()
    for column in columns
]

# initialize all variables
raw_input_texts = []
input_texts = []
target_texts = []

for file in files:
    # Use the same directory path that was listed above, and close the handle
    # deterministically instead of leaking one open file per query.
    with open(os.path.join(dataset_dir, file), "r") as f:
        query = f.read().strip()
    raw_input_texts.append(query)
    job_query = JOBQuery(query)
    rows = db.explain_query(query)

    input_text = generate_input_text(job_query.predicates, job_query.rel_lookup)
    input_texts.append(input_text)
    # add '\t' at start and '\n' at end of text.
    # [:-1] drops the final character of the generated output text
    # (presumably a trailing separator - TODO confirm in utils.parser).
    target_text = '\t' + generate_output_text(rows, job_query.rel_lookup)[:-1] + '\n'
    target_texts.append(target_text)

Failed parse where statement:  cn.country_code ='[us]'
  AND ct.kind IS NOT NULL
  AND (ct.kind ='production companies'
       OR ct.kind = 'distributors')
  AND it1.info ='budget'
  AND it2.info ='bottom 10 rank'
  AND t.production_year >2000
  AND (t.title LIKE 'Birdemic%'
       OR t.title LIKE '%Movie%')
  AND t.id = mi.movie_id
  AND t.id = mi_idx.movie_id
  AND mi.info_type_id = it1.id
  AND mi_idx.info_type_id = it2.id
  AND t.id = mc.movie_id
  AND ct.id = mc.company_type_id
  AND cn.id = mc.company_id
  AND mc.movie_id = mi.movie_id
  AND mc.movie_id = mi_idx.movie_id
  AND mi.movie_id = mi_idx.movie_id
Failed parse where statement:  ct.kind = 'production companies'
  AND it.info = 'top 250 rank'
  AND mc.note NOT LIKE '%(as Metro-Goldwyn-Mayer Pictures)%'
  AND (mc.note LIKE '%(co-production)%')
  AND t.production_year >2010
  AND ct.id = mc.company_type_id
  AND t.id = mc.movie_id
  AND t.id = mi_idx.movie_id
  AND mc.movie_id = mi_idx.movie_id
  AND it.id = mi_idx.info_type

Vectorize inputs & outputs

In [7]:
def _vectorize_texts(texts, column, ngram_range):
    """Wrap `texts` in a one-column DataFrame and pass it to text_vectorization.

    Returns whatever text_vectorization returns for that frame, the column
    list `[column]` and the given `ngram_range` tuple.
    """
    frame = pd.DataFrame(texts, columns=[column])
    return text_vectorization(frame, [column], ngram_range)


# Raw query text, with an (1, 3) range.
raw_input_vectorizer, raw_input_corpus = _vectorize_texts(
    raw_input_texts, 'input_queries', (1, 3)
)

# Parsed input text, with an (1, 1) range.
input_vectorizer, input_corpus = _vectorize_texts(
    input_texts, 'input_queries', (1, 1)
)

# Target text, with an (1, 3) range.
output_vectorizer, output_corpus = _vectorize_texts(
    target_texts, 'output_queries', (1, 3)
)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  ary = asanyarray(ary)
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package a

In [8]:
# Report how many entries each fitted vectorizer holds in its vocabulary.
encoder_vocab_size = len(input_vectorizer.vocabulary_)
print("number of encoder words : ", encoder_vocab_size)
decoder_vocab_size = len(output_vectorizer.vocabulary_)
print("number of decoder words : ", decoder_vocab_size)

number of encoder words :  30
number of decoder words :  464


Train embedding models

In [9]:
def _build_embeddings_model(corpus, vectors):
    """Create an EmbeddingsModel, call build(corpus, vectors) on it, return it."""
    model = EmbeddingsModel()
    model.build(corpus, vectors)
    return model


# The same vectors object is shared by both models.
glove_vectors = get_glove_vectors()
# Input-side model first, then output-side model (same order as before).
input_encoder = _build_embeddings_model(input_corpus, glove_vectors)
output_encoder = _build_embeddings_model(output_corpus, glove_vectors)