## Reading Diamond Prices
This notebook will read in prices of diamonds from major websites, including: James Allen.

It will create an output which is a Pandas Dataframe including prices and 4C related information. This will be written to a CSV.

The Dataframe output will be passed on for Linear Regression.

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
import re
from datetime import datetime
import requests
import numpy as np
import time
import sys


In [3]:
#General class and functions to check a robots.txt file. 
 #Check the robots.txt file using this class. 
 #robotlink is the URL of the robots file
 #urlcheck is a string containing the url you want to review. It is searched suing string contains. 
class Robots_Review:
    def __init__(self,robotlink,urlcheck): # constructor method
        self.link=robotlink
        self.tocheck=urlcheck
        robots_file = requests.get(robots)
        code = robots_file.status_code
        if code != 200:
            sys.exit(f"Error, {robotlink} returned a code {code} error!")
        if code == 200:
            req = urllib3.PoolManager()
            response = req.request('GET', robotlink) #Read the robots file.
            soup = BeautifulSoup(response.data, 'html.parser')
            lines = str(soup).splitlines() #Turns the HTML into a set of links. 
            #Now turn the lines into a readable dataframe.
            dataf = pd.DataFrame()
            for line in lines:
                if len(line.split(":"))>1: #Check the line has the colon
                    condition,link = line.split(":")[0],line.split(":")[1]
                    dataf=dataf.append(pd.DataFrame(data=
                        {"Condition":[condition],"Subdirectory":[link]        }
                    ))
            self.robotsdf = dataf
    
    #Check a specific subdirectory. This 
    def find_subdir(self):
        mask = self.robotsdf['Subdirectory'].str.contains(self.tocheck)
        #print(mask)
        output = self.robotsdf[mask]
        return output

### Reading from the James Allen website

In [4]:
#Check robots txt
robots = 'https://www.jamesallen.com/robots.txt'

classtest=Robots_Review(robots,'loose-diamonds')
#print(classtest.robotsdf)
print(classtest.find_subdir())
#We are good to go searching loose-diamonds!

  Condition                                Subdirectory
0     Allow   /loose-diamonds/round-cut/?Cut=TrueHearts
0     Allow       /loose-diamonds/all-diamonds/?CM=True


In [18]:
#Now do the web scraping!
#Create a function, so you can parametrize this more easily. 

# Fetch the html file
caratsearches = np.arange(.2,7,.1) #You will search over all of these carat weights
labgrown=True #Defines if it's lab grown
pagesno = 40 #this is the number of pages you search for every time. 
write = True


def scrape_JamesAllen(caratsearches=np.arange(.2,7,.1),labgrown=True,pages = 40,write = True):
    #Some initiatilizations to start. 
    data=pd.DataFrame() #Define the output.
    pagenolist=np.arange(0,pages) #The number of pages searched on every search query.
    counter = 0 #Counter variables. 
    noter = 0
    #Create the appropriate text for the labgrown file.
    if labgrown == True: lgtext = 'LabGrown'
    if labgrown == False: lgtext = 'Natural'
    if write == True :  #Create the descriptive filename. 
        cmin = round(np.min(caratsearches),1)
        cmax = round(np.max(caratsearches),1)
        filenamer = f'jamesalleninv_{lgtext}_{cmin}-{cmax}_{datetime.now().year}{datetime.now().month}{datetime.now().day}.csv'
        print(f'Initializing. I will write output to {filenamer}')
    for caratwtlo in caratsearches:
        carat_str=f'CaratFrom={str(caratwtlo)}&CaratTo={str(caratwtlo+.1)}' 
        for pageno in pagenolist:
            url = f'https://www.jamesallen.com/loose-diamonds/all-diamonds/page-{pageno}/?Color=Y,J,I,H,G,F,E,D&Shape=all-diamonds&Clarity=I1,SI2,SI1,VS2,VS1,VVS2,VVS1,IF,FL&{carat_str}&IsLabDiamond={str(labgrown)}&resultsView=List'
            #All of the page URLs follow the same format with the exception of one number followed by 'page-'

            req = urllib3.PoolManager()
            res = req.request('GET', url)
            if res.status != 200: #Check it finds a webpage. 
                print(f"Error, {url} returned a code {res.status} error!")
                break #Stops the loop the first time an entry is not found. 
            if res.status == 200:
                soup = BeautifulSoup(res.data, 'html.parser')
                body = soup.find("body")
                contents = soup.find_all(class_= 'product_pod')

                #This defines the general serach text, that will find all values matching the general format 
                #div and galleryItem_1_DIGIT
                #Use regex to do this general serach. 
                stringpat= f"galleryItem_{pageno}"+r"_\d"
                string=re.compile(stringpat)

                #Define the list of terms to be searched.
                search_list=[['Carat Weight','li-carat'],['Shape','li-shape'],['Color','li-color'],['Clarity','li-clarity'],['Cut','li-cut'],['Lab','li-lab']]
                #Search for each entry for each diamond on the webpage.
                for div in body.find_all("div",{"data-qa": string}):
                    #Then search over the names and fields given in search_list
                    inter1=pd.DataFrame()
                    for name,field in search_list:
                        #Create the itemized output.
                        inter = pd.DataFrame(data=
                                            { name:[div.find("div",{"data-qa": field}).text]   })
                        #Then, build a column up from the single div.
                        inter1 = pd.concat([inter1,inter],axis=1)
                    #The serach for a price is slightly different, because it is in a span. So just add in the price.
                    pricesearcher=div.find("span","base-price--xVDZZ").text
                    #Then add it again.
                    inter = pd.DataFrame(data= {
                        
                        "Price (USD $)":[pricesearcher.replace("$", "")] ,
                        "Diamond Type":[f"{lgtext}"],
                        "Timestamp":[datetime.now()],
                        "Webpage":[url.split("/")[2]],"URL":[url]})
                    inter1 = pd.concat([inter1,inter],axis=1)
                    #Finally build the whole ouput
                    data = data.append(inter1)
                
                counter = counter + 1 #Counting the number of iterations.
                # adding 3 seconds time delay to avoid triggering a DNS block
                time.sleep(3)
                if counter > 30 :
                    noter = noter+ counter
                    print(f"I have successfully read in {noter} webpages, with data for {len(data)} stones!")
                    counter = 0
                    time.sleep (27) #Doing a 30 second wait before pulling again, to vaoid DNS blocks
        print(f"Output summary: {data.info()}")
        print(f"output head: {data.head()}")
        if write == True: data.to_csv(f"data\{filenamer}")
        return data

    print(data.info())
    print(data.head())
   


In [17]:
# Fetch the html file
caratsearches = np.arange(.2,7,.1) #You will search over all of these carat weights
labgrown=True #Defines if it's lab grown
pagesno = 40 #this is the number of pages you search for every time. 
write = True


t1=scrape_JamesAllen(np.arange(.2,.4,.1),labgrown=True,pages = 2,write = True)



Initializing. I will write output to jamesalleninv_LabGrown_0.2-0.3_2022429.csv
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 0 to 0
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Carat Weight   9 non-null      object        
 1   Shape          9 non-null      object        
 2   Color          9 non-null      object        
 3   Clarity        9 non-null      object        
 4   Cut            9 non-null      object        
 5   Lab            9 non-null      object        
 6   Price (USD $)  9 non-null      object        
 7   Diamond Type   9 non-null      object        
 8   Timestamp      9 non-null      datetime64[ns]
 9   Webpage        9 non-null      object        
 10  URL            9 non-null      object        
dtypes: datetime64[ns](1), object(10)
memory usage: 864.0+ bytes
Output summary: None
output head:   Carat Weight  Shape   Color Clarity        Cut  Lab Pri