# Setup

In [1]:
# pip install pinecone tqdm
import os
import json
from dotenv import load_dotenv

from openai import OpenAI
from tqdm import tqdm
import torch
import torch.nn as nn

import uuid

from pinecone import Pinecone

import urllib.parse
from urllib.request import urlopen

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from bs4 import BeautifulSoup
from bs4.element import Comment

  from tqdm.autonotebook import tqdm


## Pinecone, OAI, MongoDB

In [2]:
load_dotenv(dotenv_path='secrets.env')

True

In [3]:
# initialize Pinecone
api_key = os.environ['PINECONE_API_KEY']
environment = os.environ['PINECONE_ENVIRONMENT']
pinecone = Pinecone(api_key=api_key, environment=environment)

index_name = "cosine-3072"
pinecone_index = pinecone.Index(index_name)

In [4]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [5]:
def get_text_embedding_3072(text):
    response = client.embeddings.create(
        model="text-embedding-3-large",
        input=[text]
    ).data[0].embedding
    return response

In [6]:
mongoUsername = urllib.parse.quote_plus(os.environ['MONGO_USR'])
mongoPassword = urllib.parse.quote_plus(os.environ['MONGO_PWD'])
uri = f"mongodb+srv://{mongoUsername}:{mongoPassword}@cluster0.afizqne.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
# Create a new client and connect to the server
mongoClient = MongoClient(uri, server_api=ServerApi('1'))
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

'OpenAI' object has no attribute 'admin'


  if response.this_update > now:
  if response.next_update and response.next_update < now:
  if value.next_update is None:
  value.this_update
  < value.next_update
  assert value.this_update is not None
  assert value.next_update is not None
  value.this_update
  < value.next_update
  cached_value.next_update is not None
  and cached_value.next_update < value.next_update


In [7]:
mongo_db = mongoClient['Search']
namespace_collection = mongo_db["namespaces"]

In [8]:
def save_pinecone_namespaces_to_mongo(namespaces, index):
    to_insert = []
    for namespace in namespaces:
        to_insert.append({
            "_id": uuid.uuid4(),
            "name": namespace,
        })
    namespace_collection.insert_many(namespaces)

# Scraping Website

In [9]:
def non_visible_filter(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

In [10]:
# Returns a tuple (text : string, successful? : boolean)
def url_to_text(url):
    try:
        page = urlopen(url)
    except:
        return "", False
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    text = soup.findAll(text=True)
    visible = filter(non_visible_filter, text)
    return " ".join(item.strip() for item in visible if item.strip()), True

In [69]:
url_to_text("https://realpython.com/python-web-scraping-practical-introduction/#scrape-and-parse-text-from-websites")

('', False)

# Scraping Reddit Comments

Get the url with .json appended to return it as a json file.

In [11]:
page = urlopen("https://www.reddit.com/r/learnpython/comments/16xvuu5/python_reddit_data_scraper_for_beginners/.json")
data = json.loads(page.read())

In [53]:
def get_post_from_json(data):
    return data[0]['data']['children'][0]['data']['selftext']

def get_comments_from_json(data):
    allComments = []
    comments_layer_1 = data[1]['data']['children']

    bfs_queue = []
    for i in range(len(comments_layer_1)):
        bfs_queue.append([i])

    while len(bfs_queue) > 0:
        cur = bfs_queue.pop(0)
        data = comments_layer_1

        # iterate down to the target layer in the json tree/dict
        for i in range(len(cur)):
            if i == len(cur) - 1:
                data = data[cur[i]]['data']
            else:
                data = data[cur[i]]['data']['replies']['data']['children']

        allComments.append(data['body'])

        # check for children/replies and add them to BFS queue
        if data['replies'] != "":
            for i in range(len(data['replies']['data']['children'])):
                bfs_queue.append(cur + [i])
    
    return allComments

In [54]:
get_comments_from_json(data)

['Have you checked out PRAW? That\'s the standard way to do this:\n\nhttps://praw.readthedocs.io/en/stable/\n\nAlternatively, you could look into PushshiftIO, which is a massive third-party scraper of Reddit data.\n\nhttps://pushshift.io/\n\nPRAW has everything but may cap what you can scrape. PushshiftIO doesn\'t have everything, but it does have a lot, and IIRC there is no cap.\n\nLastly, the lowest tech but probably most labor intensive route is to just scrape directly off the site. This can be done by slapping ".json" into the end of any URL to convert its entire contents into a JSON object, which you can then traverse and extract data from more easily than the HTML source. Like literally add ".json" to the end of the URL at the top of your screen now and you\'ll see what I mean.',
 "As we say in France, we're in the same boat, mate!",
 'Hey! I am so excited to see your post here. I am also a linguistic student and now looking for a useful way to collect posts in Reddit. Have you f

In [28]:
def get_hyperlink_comments_from_json(data):
    allComments = []
    links = []
    comments_layer_1 = data[1]['data']['children']

    bfs_queue = []
    for i in range(len(comments_layer_1)):
        bfs_queue.append([i])

    while len(bfs_queue) > 0:
        cur = bfs_queue.pop(0)
        data = comments_layer_1

        # iterate down to the target layer in the json tree/dict
        for i in range(len(cur)):
            if i == len(cur) - 1:
                data = data[cur[i]]['data']
            else:
                data = data[cur[i]]['data']['replies']['data']['children']

        comment = data['body']
        html = data['body_html']
        # look for first occurrence of href=\"
        href_index = html.find('href=\"')
        if href_index != -1:
            href_index += 6 # only 6 because the \ escape key is not included
            href_end_index = html.find('\"', href_index)
            link = html[href_index:href_end_index]
            allComments.append(comment)
            links.append(link)

        # check for children/replies and add them to BFS queue
        if data['replies'] != "":
            for i in range(len(data['replies']['data']['children'])):
                bfs_queue.append(cur + [i])
    

    return links, allComments

In [29]:
get_hyperlink_comments_from_json(data)

(['https://praw.readthedocs.io/en/stable/'],
 ['Have you checked out PRAW? That\'s the standard way to do this:\n\nhttps://praw.readthedocs.io/en/stable/\n\nAlternatively, you could look into PushshiftIO, which is a massive third-party scraper of Reddit data.\n\nhttps://pushshift.io/\n\nPRAW has everything but may cap what you can scrape. PushshiftIO doesn\'t have everything, but it does have a lot, and IIRC there is no cap.\n\nLastly, the lowest tech but probably most labor intensive route is to just scrape directly off the site. This can be done by slapping ".json" into the end of any URL to convert its entire contents into a JSON object, which you can then traverse and extract data from more easily than the HTML source. Like literally add ".json" to the end of the URL at the top of your screen now and you\'ll see what I mean.'])

## Aggregating a List of Reddit threads