In [148]:
from pymongo import MongoClient
import pprint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup

import json


In [149]:
# Open a client using MongoClient's default connection settings.
client = MongoClient()
# Handle to the existing database that holds the scraped pages.
db = client.get_database('capstone2_updated')
# Handle to the existing collection; each document is looked up by its
# 'pull' number and carries the page markup under 'html'.
diamond_info = db.get_collection('diamond_info')

## Parse a single page first to work out the workflow before iterating through all pages

In [147]:
# Parse the first stored pull to work out the extraction logic on one page.
diamonds = diamond_info.find_one({'pull':1})['html']
soup = BeautifulSoup(diamonds,'html.parser')

# Search the document once per column instead of once per field per row:
# find_all walks the whole tree, so calling it inside the loop repeats the
# same full scan for every diamond.
shape_cells = soup.find_all('div','shape-cell-wrapper')
price_cells = soup.find_all('div','row-cell price')
# Remaining attributes, in output order; each one lives in a
# <div class="row-cell NAME"> cell.
text_fields = ['carat', 'cut', 'color', 'clarity', 'polish', 'symmetry',
               'fluorescence', 'depth', 'table', 'lxwratio', 'culet']
text_cells = {field: soup.find_all('div','row-cell ' + field) for field in text_fields}

example = []
# Only a small slice of rows is inspected here; use
# range(len(shape_cells)) to walk the whole page.
for i in range(12,17):
    example_diamonds = dict()
    #getting diamond shape
    example_diamonds['shape'] = shape_cells[i].text
    #getting diamond price - some were discounted
    price_text = price_cells[i].text
    price_parts = price_text.split('Now: ')
    if len(price_parts) == 2:
        # Discounted: the text before 'Now: ' holds the original price after a
        # 6-character prefix (presumably 'Was: $' - confirm against the page),
        # the text after it holds the current price after a 1-character symbol.
        og_price = price_parts[0][6:].replace(',','')
        example_diamonds['original_price'] = int(og_price)
        disc_price = price_parts[1][1:].replace(',','')
        example_diamonds['discounted_price'] = int(disc_price)
    else:
        # Not discounted: strip the leading currency symbol only.
        og_price = price_text[1:].replace(',','')
        example_diamonds['original_price'] = int(og_price)
        example_diamonds['discounted_price'] = np.nan
    #getting the remaining attributes
    for field in text_fields:
        cell_text = text_cells[field][i].text
        if field == 'cut':
            #repeats the cut 2x with no space in between - keep the second half
            cell_text = cell_text[len(cell_text) // 2:]
        example_diamonds[field] = cell_text
    example.append(example_diamonds)
example

[{'shape': 'Oval',
  'original_price': 238,
  'discounted_price': nan,
  'carat': '0.30',
  'cut': 'Very Good',
  'color': 'I',
  'clarity': 'SI1',
  'polish': 'Excellent',
  'symmetry': 'Very Good',
  'fluorescence': 'Faint',
  'depth': '64.6',
  'table': '60.0',
  'lxwratio': '1.32',
  'culet': 'None'},
 {'shape': 'Emerald',
  'original_price': 261,
  'discounted_price': 238,
  'carat': '0.31',
  'cut': 'Good',
  'color': 'K',
  'clarity': 'VVS2',
  'polish': 'Excellent',
  'symmetry': 'Very Good',
  'fluorescence': 'Faint',
  'depth': '70.7',
  'table': '62.0',
  'lxwratio': '1.28',
  'culet': 'None'},
 {'shape': 'Pear',
  'original_price': 251,
  'discounted_price': 238,
  'carat': '0.30',
  'cut': 'Very Good',
  'color': 'J',
  'clarity': 'SI1',
  'polish': 'Very Good',
  'symmetry': 'Very Good',
  'fluorescence': 'None',
  'depth': '62.8',
  'table': '61.0',
  'lxwratio': '1.52',
  'culet': 'None'},
 {'shape': 'Pear',
  'original_price': 239,
  'discounted_price': nan,
  'carat':

## Iterate through all pages and parse them into a DataFrame

In [154]:
# Attributes read from <div class="row-cell NAME"> cells, in output order.
_TEXT_FIELDS = ('carat', 'cut', 'color', 'clarity', 'polish', 'symmetry',
                'fluorescence', 'depth', 'table', 'lxwratio', 'culet')

# Column order of the resulting DataFrame.
_DIAMOND_COLUMNS = ['shape', 'original_price', 'discounted_price', *_TEXT_FIELDS]


def _parse_price(price_text):
    """Return (original_price, discounted_price) parsed from a price cell's text.

    discounted_price is np.nan when the text holds no 'Now: ' part.
    """
    parts = price_text.split('Now: ')
    if len(parts) == 2:
        # Before 'Now: ': original price after a 6-character prefix
        # (presumably 'Was: $' - confirm against the page markup).
        # After 'Now: ': current price after a 1-character currency symbol.
        original = int(parts[0][6:].replace(',', ''))
        discounted = int(parts[1][1:].replace(',', ''))
        return original, discounted
    # Single price: strip the leading currency symbol only.
    return int(price_text[1:].replace(',', '')), np.nan


def _parse_page(html):
    """Return one dict per diamond found in a single page of HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    # Search the tree once per column rather than once per field per row.
    shape_cells = soup.find_all('div', 'shape-cell-wrapper')
    price_cells = soup.find_all('div', 'row-cell price')
    text_cells = {field: soup.find_all('div', 'row-cell ' + field)
                  for field in _TEXT_FIELDS}

    rows = []
    # The shape column defines how many diamonds the page lists; the other
    # columns are indexed by the same position (IndexError if one is shorter).
    for idx in range(len(shape_cells)):
        row = {'shape': shape_cells[idx].text}
        row['original_price'], row['discounted_price'] = _parse_price(price_cells[idx].text)
        for field in _TEXT_FIELDS:
            cell_text = text_cells[field][idx].text
            if field == 'cut':
                # The cut label appears twice back to back; keep the second half.
                cell_text = cell_text[len(cell_text) // 2:]
            row[field] = cell_text
        rows.append(row)
    return rows


def parse_to_df(num_pulls):
    """Parse the stored pages for pulls 1..num_pulls into a single DataFrame.

    Parameters
    ----------
    num_pulls : int
        Number of stored pulls to parse. Each page is fetched from the
        module-level ``diamond_info`` collection by its ``'pull'`` number,
        starting at 1.

    Returns
    -------
    pandas.DataFrame
        One row per diamond with the columns shape, original_price,
        discounted_price (NaN when not discounted), carat, cut, color,
        clarity, polish, symmetry, fluorescence, depth, table, lxwratio and
        culet. Empty, but with those columns, when ``num_pulls`` is 0.

    Raises
    ------
    TypeError
        If a pull number has no stored document (``find_one`` returns None,
        so the ``['html']`` lookup fails).
    """
    diamond_list = []
    # Iterate over the parameter itself; the page counter gets its own name so
    # no per-row index can overwrite it.
    for pull in range(1, num_pulls + 1):
        html = diamond_info.find_one({'pull': pull})['html']
        diamond_list.extend(_parse_page(html))
    # Build the frame once, after all pages are collected.
    return pd.DataFrame(diamond_list, columns=_DIAMOND_COLUMNS)

In [155]:
# Number of stored documents, used as the number of pulls to parse.
# NOTE(review): this assumes one document per pull, numbered 1..N - confirm.
# count_documents({}) replaces Collection.count(), which is deprecated and
# was removed in PyMongo 4.
num_pulls = diamond_info.count_documents({})
num_pulls

  """Entry point for launching an IPython kernel.


58

In [None]:
# Parse every stored pull into one DataFrame, then export it as CSV.
diamond_data = parse_to_df(num_pulls)
# NOTE(review): the destination is an absolute, machine-specific path, so this
# cell only runs as-is on the original author's machine; consider a path
# relative to a configurable data directory. No options are passed to to_csv,
# so pandas' defaults apply.
diamond_data.to_csv(r'/Users/winglau/Desktop/Docker/Capstone2/updated_diamonds.csv')