<a href="https://colab.research.google.com/github/ybressler/Web-Scraping/blob/master/Web_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*View the tutorial here:* http://ior.ad/6L7j



---


# Web Scraper Tool
"Turn on" the code by executing the first cell of this notebook: select it and press "command + enter" (or "ctrl + enter" on Windows/Linux) to run it. Then hide the code by folding the header section.

In [0]:
# EXECUTE THIS CELL

# Step 1 
# ------------------------------
from google.colab import files


# Step 2
# ------------------------------
import re

# Filler words that carry no topical meaning and are dropped from the word
# counts. Entries may contain apostrophes: `clean_your_tags` strips the
# punctuation from them before comparing, exactly as it does for the text.
skip_words = [
  'like', 'you', "you've", 'never', 'before', 'in', 'a', 'an', "it's",
  'it', 'and', 'at', 'the', 'as', 'those', 'off', 'their', 'is', 'to',
  'out', 'of', 'for', 'can', 'us', 'our', 'not', 'without', 'up', 'now',
  'on', 'between', 'happens', 'where', 'find', 'while', 'from', 'are',
  'then', 'what', 'with', 'being', 'hey', 'how', 'i', 'which', 'about',
  'me', 'why', 'be', 'this', 'has', 'within', 'become', 'my', 'work',
  'one', 'come', 'fan', 'would', 'perhaps', 'both', 'its', 'when', 'or',
  'no', 'pm', 'am', 'here', 'go', 'will', 'learn', 'buy', 'more',
  'please', 'sale', 'he', 'word', 'by', 'ftd',
]


def f_only_words(x):
  """
  Get rid of punctuation.

  Removes every character of `x` that is neither a word character
  (letter, digit, underscore) nor whitespace, and returns the result.
  Nothing is inserted in place of a removed character, so "it's"
  becomes "its" and "well-known" becomes "wellknown".
  """
  return re.sub(r'[^\w\s]','', x)


def clean_your_tags(words_list):
  """
  Splits text into lower-case words and drops the ones in `skip_words`.

  Parameters
  ----------
  words_list : str or iterable of str
      A single piece of text, or several pieces of text. `None` is treated
      as "no text".

  Returns
  -------
  list of str
      The remaining words, in their original order (duplicates are kept).
  """

  if words_list is None:
    return []

  # Treat a single string as a one-element collection so both input shapes
  # share the same code path.
  if isinstance(words_list, str):
    words_list = [words_list]

  # The text loses its punctuation before it is compared, so the skip words
  # must lose theirs too -- otherwise "you've" (-> "youve") is never skipped.
  # Built on every call so later edits to `skip_words` are picked up.
  skip = {f_only_words(word).lower() for word in skip_words}

  words_out = []
  for text in words_list:
    words_out += [w for w in f_only_words(text).lower().split() if w not in skip]

  return words_out

# Step 3
# ------------------------------

import requests
from bs4 import BeautifulSoup
import pandas as pd

def _count_whole_word(word, text):
  """Counts how often `word` (a word or phrase) occurs in `text` as a whole word, not as a substring."""
  # Look-arounds instead of str.count so 'art' is not counted in 'party'.
  return len(re.findall(r'(?<!\w)' + re.escape(word) + r'(?!\w)', text))


def digest_a_web_page( url='https://www.trufflesmystery.com', n_min=1, specific_words=None, timeout=10):
  """
  Scrapes a web page and returns a structured dataframe for the words in the page.

  The page's <title>, <h1>, <h2>, <h3> and <p> texts are lower-cased and
  stripped of punctuation. If the page has no <p> tags, <div class="paragraph">
  elements are used as the body instead.

  Parameters
  ----------
  url : str
      Address of the page to download.
  n_min : int
      Cutoff threshold: a word is reported only if it occurs MORE than
      `n_min` times across all sections (strictly greater). Words of the
      title and `specific_words` are always reported.
  specific_words : list of str, or comma-separated str, optional
      Extra words (or phrases) to report regardless of their frequency.
      They are lower-cased and stripped of punctuation like the page text.
  timeout : float
      Seconds to wait for the server before giving up.

  Returns
  -------
  pandas.DataFrame
      One row per word with the columns 'word', 'title', 'h1', 'h2', 'h3',
      'body', 'total (with body)' and 'total (without body)', sorted by the
      totals and the h1 count in descending order.

  Raises
  ------
  requests.RequestException
      If the page cannot be downloaded or the server answers with an
      error status.
  """
  section_labels = ['title', 'h1', 'h2', 'h3', 'body']

  # Accept a single comma-separated string as well as a list of words.
  if specific_words is None:
    specific_words = []
  elif isinstance(specific_words, str):
    specific_words = specific_words.split(',')

  # Normalize the requested words like the page text, otherwise they can
  # never match. Empty entries are dropped: an empty pattern would "match"
  # between every pair of characters.
  requested = [f_only_words(w).lower().strip() for w in specific_words]
  requested = [w for w in requested if w]

  # Download the page; fail loudly instead of analysing an error page.
  r = requests.get(url, timeout=timeout)
  r.raise_for_status()
  soup = BeautifulSoup(r.text, 'html.parser')

  def tag_texts(name, attrs=None):
    return [f_only_words(tag.text.lower()) for tag in soup.find_all(name, attrs or {})]

  # Not every page has a <title>.
  title = f_only_words(soup.title.text.lower()) if soup.title else ''

  sections = {
    'title': [title],
    'h1': tag_texts('h1'),
    'h2': tag_texts('h2'),
    'h3': tag_texts('h3'),
    'body': tag_texts('p'),
  }

  # If that failed...
  if len(sections['body'])==0:
    sections['body'] = tag_texts('div', {'class':'paragraph'})

  # get all the words which are left after removing the skip words
  all_words = []
  for label in section_labels:
    all_words += clean_your_tags(sections[label])

  # get words with highest counts (dtype given so an empty page is handled)
  word_counts = pd.Series(all_words, dtype=object).value_counts()
  frequent_words = list(word_counts[word_counts>n_min].index)

  # Get all the unique words; dict.fromkeys de-duplicates while keeping a
  # reproducible order (a set would shuffle tied rows between runs).
  unique_words = list(dict.fromkeys(frequent_words + title.split() + requested))

  # ------------------------------------------------

  section_text = {label: ' '.join(sections[label]) for label in section_labels}

  records = []
  for word in unique_words:
    rec = {'word':word}
    for label in section_labels:
      rec[label] = _count_whole_word(word, section_text[label])
    records.append(rec)

  # All done. Explicit columns keep the frame well-formed even without rows.
  df = pd.DataFrame(records, columns=['word'] + section_labels)

  # Sum only the count columns: summing the whole frame would include the
  # text column 'word', which recent pandas versions refuse to add to ints.
  df['total (with body)'] = df[section_labels].sum(axis=1)
  df['total (without body)'] = df['total (with body)'] - df['body']

  df = df.sort_values(by=['total (with body)','total (without body)', 'h1'], ascending=False)
  df = df.reset_index(drop=True)
  return df



# Step 4
# -------------------------------------------------
# Show up to 100 characters per cell when a dataframe is displayed.
pd.set_option('display.max_colwidth', 100)

## Use the tool!

In [0]:
# # Enter the url you're interested in here:
# url = 'https://www.trufflesmystery.com/more-about-truffles'

# # what's the cutoff threshold?
# n_min = 2

# # Have any specific words?
# specific_words = ['birthday']

# # Do you want to save your file?
# save_file = False

# # Perform the execution
# df = digest_a_web_page(url, n_min=n_min, specific_words=specific_words)

# if save_file==True:
#   url_name = url.split('.',)[0].split('/')[-1].title()
#   file_name = f'Web Scraping – {url_name}.csv'
#   with open(file_name, 'w') as f:
#     f.write(df.to_csv())
#   files.download(file_name)

# # Put this here to view
# df

# Try it with a form!

Fill out the form fields and press "command + enter" (or "ctrl + enter" on Windows/Linux) to see it in action!

In [5]:
#@title Input for Web Scraper
#@markdown Words about the stuff. _Okay?_

website = "http://yaakovbressler.com" #@param {type:"string"}
minimum_cutoff = 1  #@param {type: "slider", min: 0, max: 5}

#@markdown Enter each word separated by a comma
specific_words = 'birthday' #@param {type: "string"}

save_file_ = "Yes" #@param ["Yes", "No"] {allow-input: false}

from urllib.parse import urlparse

# Turn the comma-separated form field into a clean list: surrounding spaces
# are removed and empty entries (e.g. from an empty field) are dropped.
requested_words = [w.strip() for w in specific_words.split(',') if w.strip()]

# Perform the execution
df = digest_a_web_page(website, n_min=minimum_cutoff, specific_words=requested_words)

# The dropdown yields the strings "Yes"/"No", never a boolean.
if save_file_ == "Yes":
  # Name the file after the site, e.g. 'https://www.example.com/page' -> 'Example'.
  parsed = urlparse(website)
  host = parsed.netloc or parsed.path.split('/')[0]  # tolerate a missing 'http://'
  if host.startswith('www.'):
    host = host[len('www.'):]
  url_name = host.split('.')[0].title()

  file_name = f'Web Scraping – {url_name}.csv'
  df.to_csv(file_name)
  files.download(file_name)

# Put this here to view
df


Unnamed: 0,word,title,h1,h2,h3,body,total (with body),total (without body)
0,data,1,2,2,0,7,12,5
1,theatre,1,2,1,0,4,8,4
2,science,0,1,2,0,4,7,3
3,producer,1,1,1,0,1,4,3
4,yaakov,2,0,1,0,0,3,3
5,bressler,2,0,1,0,0,3,3
6,tech,0,1,1,0,1,3,2
7,arts,0,0,0,0,3,3,0
8,scientist,1,1,0,0,0,2,2
9,read,0,0,1,0,1,2,1
