# Web scraping in World Coffee Research
---
- Obtain the list of arabica coffee varieties to understand the optimal growing conditions for each variety
- All data is from [World Coffee Research](https://varieties.worldcoffeeresearch.org/varieties)

In [1]:
# Import relevant libraries
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as soup

## Identify the species name for navigating between pages
---

In [2]:
# Download the catalog page with requests, then parse the HTML with BeautifulSoup.
catalog_url = "https://varieties.worldcoffeeresearch.org/varieties"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
catalog_response = requests.get(catalog_url, timeout=30)
catalog_response.raise_for_status()
catalog_page = catalog_response.text
catalog_soup = soup(catalog_page, "html.parser")

In [48]:
# Each variety on the catalog page is an <a> tile; collect those anchors.
catalog_body = catalog_soup.find_all("a", class_="variety-tile variety-tile--grid-item")

# Pull the link target out of every tile to get one URL per variety page.
species_url_list = [tile["href"] for tile in catalog_body]

# Show how many variety pages were found.
len(species_url_list)

55

## Getting required information from the coffee species page
---

In [199]:
# Create a list to hold all the rows and the list of column headers.
data = []
header = ["species name", "brief intro", "plant stature", "leaf tip colour", "bean size", "optimal altitude low (5N_5S)",
          "optimal altitude mid (5-15N_5-15S)", "optimal altitude high (15N_15S)", "quality potential at high altitude",
          "yield potential", "disease: coffee leaf rust", "disease: coffee berry disease", "disease: nematodes",
          "nutrition requirment", "planting density"]

# Seconds to wait for each page before giving up, so a stalled connection cannot hang the loop.
REQUEST_TIMEOUT_SECONDS = 30


def _nested_text(page, outer_tag, outer_class, inner_tag=None, inner_class=None):
    """Return the stripped text of an element, or None when it is not on the page.

    Looks up ``outer_tag`` with ``outer_class`` in ``page``; when ``inner_tag`` is
    given, looks up ``inner_tag`` with ``inner_class`` inside that element.
    Returning None (instead of raising) lets the caller keep one value per column.
    """
    node = page.find(outer_tag, class_=outer_class)
    if node is not None and inner_tag is not None:
        node = node.find(inner_tag, class_=inner_class)
    return node.text.strip() if node is not None else None


def _tile_value(page, tile_class):
    """Return the text of the "value" div inside the given attribute tile, or None."""
    return _nested_text(page, "div", tile_class, "div", "value")


def _table_value(page, row_class):
    """Return the text of the "cell value" td inside the given table row, or None."""
    return _nested_text(page, "tr", row_class, "td", "cell value")


def _species_name(page):
    """Return the page title with the text of its <small> child (if any) removed.

    The title is split into words and every word equal to the <small> text is
    dropped. The <small> text is looked up per page, so a page without one never
    reuses the value of a previously processed species.
    """
    title = page.find("h1", class_="toolbar__page-title")
    small = title.small
    unwanted_name = small.text.strip() if small is not None else None
    # Build a new list rather than removing from the list being iterated.
    return " ".join(word for word in title.text.split() if word != unwanted_name)


def _altitude_ranges(page):
    """Return the [low, mid, high] latitude altitude texts, None for each one missing.

    The text of the "altitude-groups" div is split on blank lines and the values
    are read from positions 1, 4 and 7 of the resulting list.
    """
    groups = page.find("div", class_="altitude-groups")
    if groups is None:
        return [None, None, None]
    parts = groups.text.strip().split("\n\n")
    return [parts[i] if i < len(parts) else None for i in (1, 4, 7)]


for species_url in species_url_list:
    # Download and parse the page of one species; stop on an HTTP error
    # rather than scraping an error page.
    species_response = requests.get(species_url, timeout=REQUEST_TIMEOUT_SECONDS)
    species_response.raise_for_status()
    species_soup = soup(species_response.text, "html.parser")

    # extract the coffee species name without the unwanted items
    species_name = _species_name(species_soup)
    print(f"Processing {species_name}...")

    low_lat_5N_5S, mid_lat_5S_15N_5S_15S, high_lat_15N_15S = _altitude_ranges(species_soup)

    # One entry per header column, in header order. A value missing from the page
    # becomes None so the remaining values stay under their own column.
    row = [
        species_name,
        _nested_text(species_soup, "div", "variety__intro-text"),    # brief intro
        _tile_value(species_soup, "attribute-tile stature"),         # plant stature
        _tile_value(species_soup, "attribute-tile color"),           # leaf tip colour
        _tile_value(species_soup, "attribute-tile bean-size"),       # bean size
        low_lat_5N_5S,
        mid_lat_5S_15N_5S_15S,
        high_lat_15N_15S,
        _tile_value(species_soup, "attribute-tile high-altitude-quality"),  # quality at high altitude
        _tile_value(species_soup, "attribute-tile yield-potential"),        # yield potential
        _tile_value(species_soup, "attribute-tile rust"),            # coffee leaf rust
        _tile_value(species_soup, "attribute-tile cbd"),             # coffee berry disease
        _tile_value(species_soup, "attribute-tile nematodes"),       # nematodes
        _table_value(species_soup, "row nutrition"),                 # nutrition requirement
        _table_value(species_soup, "row density"),                   # planting density
    ]
    assert len(row) == len(header), "each row must have exactly one value per header column"

    # putting all the species data into the data list
    data.append(row)
    print(f"{species_name} Sucessful")


print("-----------------------------")
print("Data Retrieval Complete")
print("-----------------------------")

### Store into dataframe and export data

In [202]:
# Turn the scraped rows into a DataFrame, labelling the columns with the header list.
df = pd.DataFrame.from_records(data=data, columns=header)

In [205]:
# Export the data. The output folder is created first because to_csv does not
# create missing directories and would fail on a fresh checkout.
output_path = Path("output_data_csv/coffee_plant_species.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)