In [1]:
import pandas as pd
import sqlite3



In [3]:
con = sqlite3.connect("/home/thusitha/work/bigdata/NIPSdataset/archive/database.sqlite")

In [7]:
cur = con.cursor()
res = cur.execute("""SELECT name FROM sqlite_master  WHERE type='table';""")
res.fetchone()



('Papers',)

In [14]:
res = cur.execute("""select TITLE, PDFNAME as PDF, PAPERTEXT from papers limit 1""")
res.fetchone()



('Texture Synthesis Using Convolutional Neural Networks',
 '5633-texture-synthesis-using-convolutional-neural-networks.pdf',
 'Texture Synthesis Using Convolutional Neural\nNetworks\nLeon A. Gatys\nCentre for Integrative Neuroscience, University of Tübingen, Germany\nBernstein Center for Computational Neuroscience, Tübingen, Germany\nGraduate School of Neural Information Processing, University of Tübingen, Germany\nleon.gatys@bethgelab.org\nAlexander S. Ecker\nCentre for Integrative Neuroscience, University of Tübingen, Germany\nBernstein Center for Computational Neuroscience, Tübingen, Germany\nMax Planck Institute for Biological Cybernetics, Tübingen, Germany\nBaylor College of Medicine, Houston, TX, USA\nMatthias Bethge\nCentre for Integrative Neuroscience, University of Tübingen, Germany\nBernstein Center for Computational Neuroscience, Tübingen, Germany\nMax Planck Institute for Biological Cybernetics, Tübingen, Germany\n\nAbstract\nHere we introduce a new model of natura

In [17]:
df = pd.read_sql_query("""select TITLE, PDFNAME as PDF, PAPERTEXT from papers limit 10""", con)
df.head()
df.to_feather("../data/paper_extracts_10.feather")


In [25]:
from sentence_transformers import SentenceTransformer, util

sentence_embedding_model = "all-mpnet-base-v2"
model = SentenceTransformer(sentence_embedding_model)



In [20]:
query = "How many people live in London?"
docs = ["Around 9 Million people live in London", "London is known for its financial district", "London is the best city to live in!"]


query_emb = model.encode(query)
doc_emb = model.encode(docs)

#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

#Combine docs & scores
doc_score_pairs = list(zip(docs, scores))

#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

#Output passages & scores
for doc, score in doc_score_pairs:
    print(score, doc)

0.7980597615242004 Around 9 Million people live in London
0.5350013375282288 London is the best city to live in!
0.4947284162044525 London is known for its financial district


In [2]:
df = pd.read_feather("../data/paper_extracts_10.feather")
docs_headings = df["Title"].to_list()
docs_text = df["PaperText"].to_list()
#docs_headings_emb = model.encode(docs_headings)


In [3]:
token_num = 512

def remove_non_ascii(string):
    return string.encode('ascii', errors='ignore').decode()
#full_text = remove_non_ascii(docs_text[0])


#docs_text[0]

In [4]:

def split_full_text_into_chunks(full_text, token_num):
    full_text_split = full_text.split("\n\n")
    embedding_segment = []
    chunks = []
    for txt_segment in full_text_split:
        text_words = txt_segment.split(" ")
        if len(embedding_segment) + len(text_words) <= token_num:
            embedding_segment = embedding_segment + text_words
        else:
            chunks.append(" ".join(embedding_segment).replace("\n",""))
            embedding_segment = []
    return chunks


In [18]:
text_chunks_list = []
for title, text in zip(docs_headings, docs_text):
    print(f"Splitting: {title}")
    text_chunks = split_full_text_into_chunks(remove_non_ascii(text), token_num)
    for chunk in text_chunks:
        heading_text_map = {
            "text_chunk" : chunk,
            "title": title
        }
        text_chunks_list.append(heading_text_map)


Splitting: Texture Synthesis Using Convolutional Neural Networks
Splitting: Convolutional Neural Networks with Intra-Layer Recurrent Connections for Scene Labeling
Splitting: Grammar as a Foreign Language
Splitting: Recursive Training of 2D-3D Convolutional Networks for Neuronal Boundary Prediction
Splitting: Generative Image Modeling Using Spatial LSTMs
Splitting: Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks
Splitting: Weakly-supervised Disentangling with Recurrent Transformations for 3D View Synthesis
Splitting: Exploring Models and Data for Image Question Answering
Splitting: Are You Talking to a Machine? Dataset and Methods for Multilingual Image Question
Splitting: Parallel Multi-Dimensional LSTM, With Application to Fast Biomedical Volumetric Image Segmentation


In [23]:
df_chunks = pd.DataFrame(text_chunks_list)
df_chunks.to_feather("../data/paper_extracts_chunked.feature")

In [24]:
df_chunks = pd.read_feather("../data/paper_extracts_chunked.feature")
df_chunks.head()

Unnamed: 0,text_chunk,title
0,Texture Synthesis Using Convolutional NeuralNe...,Texture Synthesis Using Convolutional Neural N...
1,2 Convolutional neural network We use the VGG...,Texture Synthesis Using Convolutional Neural N...
2,3 different features. These feature correlati...,Texture Synthesis Using Convolutional Neural N...
3,conv1_1pool1pool2pool3pool4originalPortilla &...,Texture Synthesis Using Convolutional Neural N...
4,6 Classification performance 1.00.80.60.4 top...,Texture Synthesis Using Convolutional Neural N...


In [36]:
chunks_emb = model.encode(df_chunks["text_chunk"].tolist())


In [41]:

chunks_emb_list = [chunks_emb[i, :] for i in range(chunks_emb.shape[0])]
df_chunks["embeddings"] = chunks_emb_list

In [43]:
df_chunks.to_feather("../data/paper_extracts_embed.feature")