<a href="https://colab.research.google.com/github/wheath/thelazyscholar/blob/main/thelazyscholar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install weaviate-client

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting weaviate-client
  Downloading weaviate_client-3.18.0-py3-none-any.whl (95 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.6/95.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests<2.29.0,>=2.28.0 (from weaviate-client)
  Downloading requests-2.28.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting validators<=0.21.0,>=0.18.2 (from weaviate-client)
  Downloading validators-0.20.0.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting authlib>=1.1.0 (from weaviate-client)
  Downloading Authlib-1.2.0-py2.py3-none-any.whl (214 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.8/214.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: validators
  Bui

In [5]:
import getpass
import os

import weaviate

# Never paste the OpenAI key into the notebook source: it would be saved and
# shared with the file. Take it from the OPENAI_API_KEY environment variable,
# or prompt for it (input is not echoed) when the variable is unset or empty.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

# Connect to the hosted Weaviate instance; the key is forwarded in a request
# header so the OpenAI modules configured on the schema can use it.
client = weaviate.Client(
    url="https://thelazyscholar-uuhw7wj2.weaviate.network",
    additional_headers={
        "X-OpenAI-Api-Key": openai_api_key,
    }
)

# Last expression of the cell, so the readiness flag is displayed as output.
client.is_ready()


True

In [33]:
# Create the `Article` class (only if it does not exist yet).
# To start from scratch, uncomment the next line first -- it deletes the
# whole schema so that it can be recreated.
#client.schema.delete_all()

# Define the Schema object to use `text-embedding-ada-002` on `title` and `execsummary`, but skip it for `url`
article_schema = {
    "class": "Article",
    "description": "A collection of articles",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {
          "model": "ada",
          "modelVersion": "002",
          "type": "text"
        },
        "generative-openai": {
            "model": "gpt-3.5-turbo"
        }
    },
    "properties": [{
        "name": "title",
        "description": "Title of the article",
        "dataType": ["string"]
    },
    {
        "name": "execsummary",
        "description": "executive summary of the article",
        "dataType": ["text"]
    },
    {
        "name": "url",
        "description": "URL to the article",
        "dataType": ["string"],
        # keep the URL out of the text that gets vectorized
        "moduleConfig": { "text2vec-openai": { "skip": True } }
    }]
}

# add the Article schema -- but only when the class is not there already, so
# that re-running this cell does not fail on an already-existing class
existing_classes = {
    cls["class"] for cls in (client.schema.get().get("classes") or [])
}
if article_schema["class"] not in existing_classes:
    client.schema.create_class(article_schema)

# get the schema to make sure it worked
client.schema.get()

{'classes': [{'class': 'Article',
   'description': 'A collection of articles',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'generative-openai': {'model': 'gpt-3.5-turbo'},
    'text2vec-openai': {'model': 'ada',
     'modelVersion': '002',
     'type': 'text',
     'vectorizeClassName': True}},
   'properties': [{'dataType': ['text'],
     'description': 'Title of the article',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-openai': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'title',
     'tokenization': 'whitespace'},
    {'dataType': ['text'],
     'description': 'executive summary of the article',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-openai': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'execs

In [34]:
### Step 2 - configure Weaviate Batch
# Settings used for the import below:
# - timeout_retries=3 : retry up to 3 times if something goes wrong
# - dynamic=True      : dynamically increase/decrease the batch size based on performance
# - batch_size=10     : starting batch size

client.batch.configure(timeout_retries=3, dynamic=True, batch_size=10)

<weaviate.batch.crud_batch.Batch at 0x7f10d4570a30>

In [35]:
### Step 3 - import data

print("Importing Articles")

# The one article record to load; its keys are the property names declared
# on the `Article` class.
properties = {
    "title": "The Cathedral and the Bazaar",
    "execsummary": "The Cathedral and the Bazaar: Musings on Linux and Open Source by an Accidental Revolutionary (abbreviated CatB) is an essay, and later a book, by Eric S. Raymond on software engineering methods, based on his observations of the Linux kernel development process and his experiences managing an open source project, fetchmail. It examines the struggle between top-down and bottom-up design. The essay was first presented by the author at the Linux Kongress on May 27, 1997 in Würzburg (Germany) and was published as the second chapter of the same‑titled book in 1999.",
    "url": "https://en.wikipedia.org/wiki/The_Cathedral_and_the_Bazaar",
}

# Add the record to the batch as an object of class "Article".
with client.batch as batch:
    batch.add_data_object(properties, "Article")

print("Importing Articles complete")

Importing Articles
Importing Articles complete


In [9]:
# Sanity check after the import: ask Weaviate for the number of `Article` objects.
result = client.query.aggregate("Article").with_fields("meta { count }").do()

# The aggregate for the class sits under data -> Aggregate -> Article.
article_aggregate = result["data"]["Aggregate"]["Article"]
print("Object count: ", article_aggregate, "\n")

Object count:  [{'meta': {'count': 1}}] 



In [12]:
def query_weaviate(query, distance=0.7, limit=10):
    """Run a nearText (semantic) search over the `Article` class.

    Args:
        query: Free-text concept to search for.
        distance: Value passed as the nearText ``distance`` setting.
            Defaults to 0.7, the value previously hard-coded here.
        limit: Maximum number of results requested. Defaults to 10, the
            value previously hard-coded here.

    Returns:
        The list found under ``data -> Get -> Article`` in the response:
        one dict per article with ``title``, ``execsummary``, ``url`` and
        ``_additional`` (``certainty`` and ``distance``).

    Raises:
        Exception: If the response contains an ``errors`` entry; the
            exception message is the first reported error message.
    """
    near_text = {
        "concepts": [query],
        "distance": distance,
    }

    fields = [
        "title", "execsummary", "url",
        "_additional {certainty distance}"
    ]

    result = (
        client.query
        .get("Article", fields)
        .with_near_text(near_text)
        .with_limit(limit)
        .do()
    )

    # Check for errors
    if "errors" in result:
        # Print the hint in red, then reset the colour (\033[0m) so the red
        # does not bleed into whatever is printed afterwards.
        print("\033[91mYou probably have run out of OpenAI API calls for the current minute – the limit is set at 60 per minute.\033[0m")
        raise Exception(result["errors"][0]['message'])

    return result["data"]["Get"]["Article"]

In [13]:
# Semantic search demo: the query "bazar" does not appear verbatim in the
# stored article, yet the printed result is the "The Cathedral and the Bazaar" entry.
results = query_weaviate("bazar")
print(results)

[{'_additional': {'certainty': 0.8965392410755157, 'distance': 0.20692152}, 'execsummary': 'The Cathedral and the Bazaar: Musings on Linux and Open Source by an Accidental Revolutionary (abbreviated CatB) is an essay, and later a book, by Eric S. Raymond on software engineering methods, based on his observations of the Linux kernel development process and his experiences managing an open source project, fetchmail. It examines the struggle between top-down and bottom-up design. The essay was first presented by the author at the Linux Kongress on May 27, 1997 in Würzburg (Germany) and was published as the second chapter of the same‑titled book in 1999.', 'title': 'The Cathedral and the Bazaar', 'url': 'https://en.wikipedia.org/wiki/The_Cathedral_and_the_Bazaar'}]


In [16]:
def generativeai_weaviate(query, prompt="Summarize as a tweet {execsummary}", distance=0.7, limit=10):
    """Run a nearText search over `Article` and attach generated text to each hit.

    Args:
        query: Free-text concept to search for.
        prompt: Single-result prompt passed to ``with_generate``. The default
            is the tweet-summary prompt previously hard-coded here; it
            references the ``execsummary`` property in curly braces.
        distance: Value passed as the nearText ``distance`` setting.
            Defaults to 0.7, the value previously hard-coded here.
        limit: Maximum number of results requested. Defaults to 10, the
            value previously hard-coded here.

    Returns:
        The list found under ``data -> Get -> Article`` in the response:
        one dict per article with ``title``, ``execsummary``, ``url`` and
        ``_additional``; the generated text is reported under
        ``_additional -> generate -> singleResult``.

    Raises:
        Exception: If the response contains an ``errors`` entry; the
            exception message is the first reported error message.
    """
    near_text = {
        "concepts": [query],
        "distance": distance,
    }

    fields = [
        "title", "execsummary", "url",
        "_additional {certainty distance}"
    ]

    result = (
        client.query
        .get("Article", fields)
        .with_near_text(near_text)
        .with_generate(single_prompt=prompt)
        .with_limit(limit)
        .do()
    )

    # Check for errors
    if "errors" in result:
        # Print the hint in red, then reset the colour (\033[0m) so the red
        # does not bleed into whatever is printed afterwards.
        print("\033[91mYou probably have run out of OpenAI API calls for the current minute – the limit is set at 60 per minute.\033[0m")
        raise Exception(result["errors"][0]['message'])

    return result["data"]["Get"]["Article"]

In [36]:
# Same "bazar" search, this time through the generative variant: each hit in
# the printed result carries the generated text under
# _additional -> generate -> singleResult.
results = generativeai_weaviate("bazar")
print(results)

[{'_additional': {'certainty': 0.8965162038803101, 'distance': 0.20696759, 'generate': {'error': None, 'singleResult': 'CatB by Eric S. Raymond is an essay turned book on software engineering methods based on his observations of Linux kernel development and managing an open source project. It explores the conflict between top-down and bottom-up design. Presented in 1997 and published in 1999. #opensource #linux'}}, 'execsummary': 'The Cathedral and the Bazaar: Musings on Linux and Open Source by an Accidental Revolutionary (abbreviated CatB) is an essay, and later a book, by Eric S. Raymond on software engineering methods, based on his observations of the Linux kernel development process and his experiences managing an open source project, fetchmail. It examines the struggle between top-down and bottom-up design. The essay was first presented by the author at the Linux Kongress on May 27, 1997 in Würzburg (Germany) and was published as the second chapter of the same‑titled book in 1999

In [22]:
!pip install scholarly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scholarly
  Downloading scholarly-1.7.11-py3-none-any.whl (39 kB)
Collecting arrow (from scholarly)
  Downloading arrow-1.2.3-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting bibtexparser (from scholarly)
  Downloading bibtexparser-1.4.0.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.9/51.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting deprecated (from scholarly)
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting fake-useragent (from scholarly)
  Downloading fake_useragent-1.1.3-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting free-proxy

In [29]:
from scholarly import scholarly, ProxyGenerator

# Build a proxy generator and hand it to scholarly
# (presumably so that its requests go through free proxies -- confirm against the scholarly docs).
pg = ProxyGenerator()
# NOTE(review): `success` is never inspected before the generator is installed below --
# confirm whether a failed FreeProxies() call should stop the notebook here.
success = pg.FreeProxies()
scholarly.use_proxy(pg)


In [37]:
from scholarly import scholarly
import sys

# Look up an author, fill in their profile, and print their first publication.

# Alternative entry point: search for a publication by title instead
#search_query = scholarly.search_pubs('Perception of physical stability and center of mass of 3D objects')
#scholarly.pprint(next(search_query))

# Get an iterator for the author results
search_query = scholarly.search_author('Steven A Cholewiak')
# Retrieve the first result from the iterator; the `None` default turns an
# empty search into a clear error below instead of a bare StopIteration
first_author_result = next(search_query, None)
if first_author_result is None:
    raise LookupError("Author search returned no results for 'Steven A Cholewiak'")

# Retrieve all the details for the author
author = scholarly.fill(first_author_result)

# Take a closer look at the first publication (guard against an empty list,
# which would otherwise surface as an unexplained IndexError)
if not author['publications']:
    raise LookupError("Author 'Steven A Cholewiak' has no publications listed")
first_publication = author['publications'][0]
first_publication_filled = scholarly.fill(first_publication)
scholarly.pprint(first_publication_filled)

# Optional follow-ups:
# Print the titles of the author's publications
#publication_titles = [pub['bib']['title'] for pub in author['publications']]
#print(publication_titles)

# Which papers cited that publication?
#citations = [citation['bib']['title'] for citation in scholarly.citedby(first_publication_filled)]
#print(citations)

b"{'author_pub_id': '4bahYMkAAAAJ:u5HHmVD_uO8C',\n 'bib': {'abstract': 'The detectability and discriminability of virtual haptic '\n                     'gratings were analyzed in the frequency domain. '\n                     'Detection (Exp. 1) and discrimination (Exp. 2) '\n                     'thresholds for virtual haptic gratings were estimated '\n                     'using a force-feedback device that simulated sinusoidal '\n                     'and square-wave gratings with spatial periods from 0.2 '\n                     'to 38.4 mm. The detection threshold results indicated '\n                     'that for spatial periods up to 6.4 mm (i.e., spatial '\n                     'frequencies >0.156 cycle/mm), the detectability of '\n                     'square-wave gratings could be predicted quantitatively '\n                     'from the detection thresholds of their corresponding '\n                     'fundamental components. The discrimination experiment '\n             