<a href="https://colab.research.google.com/github/ybressler/Web-Scraping/blob/master/Web_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Scraper Tool
"Turn on" the code by executing the first cell of this notebook. Simply hit "command + enter" to run it. Then, hide the code by foldin the header section.

In [0]:
# EXECUTE THIS CELL

# Step 1 
# ------------------------------
from google.colab import files


# Step 2
# ------------------------------
import re

skip_words = [
  'like', 'you', 'you\'ve', 'never',
  'before', 'in', 'a', 'an', 'it\'s', 
  'it', 'and', 'at', 'the', 'as', 
  'those', 'off', 'their','is', 'to', 
  'before','out', 'of', 'for', 'can',
  'us', 'our','it\'s','not','without',
  'up','now', 'on','between','happens',
  'where','find', 'find',
  'while','from','are','then','what','with','being',
  'it\'s','hey','how', 'i','which','about',
  'me','why','be','this','has','within','become','my',
  'work','one','come','fan','would','perhaps','both',
  'its','when','or','no','pm','am',
   'here','go','will','learn','buy',
    'more','please','sale','he','be','word','by','ftd',
    'the', '','let', 'do',

]

# Get rid of punctuation
f_only_words = lambda x: re.sub(r'[^\w\s]','', x)

def clean_your_tags(words_list):
  """
  cleans it all up!
  """

  if type(words_list)==str:
    words_list = f_only_words(words_list).lower()
    words_out = [x for x in words_list.split() if x not in skip_words]

  else:
    words_out = []
    for sub_words_list in words_list:
      sub_words_list = f_only_words(sub_words_list).lower()
      words_out += [x for x in sub_words_list.split() if x not in skip_words]

  return words_out

# Step 3
# ------------------------------

import requests
from bs4 import BeautifulSoup
import pandas as pd
import inflect

def digest_a_web_page( url='https://www.trufflesmystery.com', n_min=1, specific_words = []):
  """
  scrapes a web page and returns a structured dataframe for all the words in the page
  """


  # url = 'https://www.trufflesmystery.com'
  r = requests.get(url)
  html = r.text

  soup = BeautifulSoup(html, 'html.parser')

  # Get the title
  title = f_only_words(soup.title.text.lower())

  # Get the description
  description = soup.find('meta', attrs={'property':'og:description'})
 
  # If description meta tag was found, then get the content
  description = f_only_words(description.get('content')) if description else '' 

  # Get the h tags
  h1 = [f_only_words(x.text.lower()) for x in soup.find_all('h1')]
  h2 = [f_only_words(x.text.lower()) for x in soup.find_all('h2')]
  h3 = [f_only_words(x.text.lower()) for x in soup.find_all('h3')]
  body = [f_only_words(x.text.lower()) for x in soup.find_all('p')]

  # If that failed...
  if len(body)==0:
    body = [f_only_words(x.text.lower()) for x in soup.find_all('div', {'class':'paragraph'})]

  # get all the words which are left
  all_words = []
  for x in [title, description, h1, h2, h3, body]:
    all_words += clean_your_tags(x)


  # get words with highest counts
  unique_words = pd.Series(all_words).value_counts()
  unique_words = list(unique_words[unique_words>n_min].keys().values)

  # Get all the unique words
  unique_words = list(set(unique_words + title.split() + specific_words))


  # ------------------------------------------------

  records = []
  for word in unique_words:
    rec = {'word':word}
    for x, x_label in zip([title, description, h1, h2, h3, body], ['title', 'description', 'h1', 'h2', 'h3', 'body']):
      count = ' '.join(x).count(word) if type(x)!=str else x.count(word)
      rec[x_label] = count
      
    # append your values
    records.append(rec)


  # All done
  df = pd.DataFrame.from_records(records)
  
  # drop all the skip words
  df = df[~df['word'].isin(skip_words)]

  # also, merge words that are plural
  p = inflect.engine()
  p_f = lambda x: p.singular_noun(x) if p.singular_noun(x) else x
  df['word'] = df['word'].apply(p_f)
  df = df.groupby('word',as_index=False).sum()

  df['total (with body)'] = df.sum(axis=1)
  df['total (without body)'] = df.drop(columns='total (with body)').sum(axis=1) - df['body']
  df.sort_values(by=['total (with body)','total (without body)', 'h1'], ascending=False, inplace=True)
  df.reset_index(drop=True,inplace=True)


  return df



# Step 4
# -------------------------------------------------
pd.options.display.max_colwidth=100

# You want to be able to see
pd.set_option('display.max_rows', 200)

In [0]:
# # -------   To use this tool   ------------
# # -------  (for python users)  ------------

# # Enter the url you're interested in here:
# url = 'https://www.lindonsupply.com/'

# # what's the cutoff threshold?
# n_min = 

# # Have any specific words?
# specific_words = []

# # Do you want to save your file?
# save_file = False

# # Perform the execution
# df = digest_a_web_page(url, n_min=n_min, specific_words=specific_words)

# if save_file==True:
#   url_name = url.split('.',)[0].split('/')[-1].title()
#   file_name = f'Web Scraping – {url_name}.csv'
#   with open(file_name, 'w') as f:
#     f.write(df.to_csv())
#   files.download(file_name)

# # Put this here to view
# df

# Try it with a form!

Fill out the info and hit "command + enter" to see it in action!

In [4]:
#@title Input for Web Scraper
#@markdown Words about the stuff. _Okay?_

#@markdown For multiple websites, separate each with a comma
website = "https://www.unos.com/, https://rolandsnyc.com/, https://www.rockyspizzanyc.com/" #@param {type:"string"}
minimum_cutoff = 0  #@param {type: "slider", min: 0, max: 5}

#@markdown Enter each word separated by a comma
specific_words = '' #@param {type: "string"}

save_file_ = "No" #@param ["Yes", "No"] {allow-input: false}
filename = 'pizza-stores.csv' #@param {type: "string"}


# ----------------------------------------

if specific_words.strip()=='':
  specific_words=[]


# ----------------------------------------

if ',' in website:
  # do this
  url_list = [x.strip() for x in website.split(',')]
  
  df_list = []
  for url in url_list:
    df = digest_a_web_page(url, n_min=minimum_cutoff, specific_words=specific_words)
    df_list.append(df)

  # merge your dataframes
  df = pd.concat(df_list)
  df_agg = df.groupby('word', as_index=False).agg(sum)

  df_agg.drop(columns='total (with body)', inplace=True)
  df_agg.rename(columns = {'total (without body)':'total'},inplace=True)
  df_agg.sort_values('total', ascending=False, inplace=True)

  # drop small words
  df_agg = df_agg[df_agg['word'].apply(len) >=3]

  # drop low frequency
  df_agg = df_agg[((df_agg['title']>0)|(df_agg['h1']>0)|(df_agg['h2']>=minimum_cutoff))&(df_agg['body']>0)]
  df_agg.reset_index(drop=True, inplace=True)

  # overwrite the previous filename
  df = df_agg

# ----------------------------------------


# Perform the execution

else:
  url = website
  df = digest_a_web_page(url, n_min=minimum_cutoff, specific_words=specific_words.split(','))


if save_file_=="Yes":
  with open(filename, 'w') as f:
    f.write(df.to_csv())
  files.download(filename)
else:
  # print(df)
  None

df.head(100)


Unnamed: 0,word,title,description,h1,h2,h3,body,total
0,clothing,9,10,2,1,0,15,22
1,gender,4,11,2,0,2,15,19
2,fashion,1,6,1,0,7,5,15
3,shirt,0,0,0,0,13,68,13
4,dres,0,0,0,0,13,6,13
5,long,0,0,0,0,9,34,9
6,neutral,3,3,3,0,0,10,9
7,accessory,0,3,0,0,6,4,9
8,sleeve,0,0,0,0,9,44,9
9,high,0,1,0,1,7,1,9
