In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

In [2]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment

In [3]:
webpages_path = "webpages/manually_selected"

In [4]:
def urls_path(folder, positive=True):
    '''Return path of a given file containing urls'''
    pos = "positive" if positive else "negative"
    path = "{}/urls/{}-{}.txt".format(webpages_path, folder, pos)
    return path

def urls_list_folder(folder, positive=True):
    '''Return the list of urls in given folder'''
    path = urls_path(folder, positive)

    with open(path, 'r') as f:
        urls = list(l.strip() for l in f if l[0] != "#")
    f.close()

    return urls

def urls_list(folders):
    '''
    Construct positive and negative url list, concatenated
    for all given folders
    '''
    positives = []
    negatives = []

    for folder in folders:
        positives.extend(urls_list_folder(folder, positive=True))
        negatives.extend(urls_list_folder(folder, positive=False))

    return positives, negatives

In [5]:
positives, negatives = urls_list(["coursera"])

In [6]:
positives[:3], negatives[:3]

(['https://www.coursera.org/learn/fashion-design',
  'https://www.coursera.org/learn/science-of-meditation',
  'https://www.coursera.org/learn/journey-of-the-universe'],
 ['https://www.coursera.org/',
  'https://www.coursera.org/enterprise',
  'https://www.coursera.org/about/partners'])

In [7]:
r = requests.get(positives[0])

In [8]:
r.status_code

200

In [9]:
soup = BeautifulSoup(r.text, "lxml")

In [10]:
print(soup.html.body.prettify()[:1000])

<body>
 <div id="fb-root">
 </div>
 <div id="rendered-content">
  <div class="rc-MetatagsWrapper" data-react-checksum="-53926488" data-reactid="1" data-reactroot="">
   <!-- react-empty: 2 -->
   <div class="rc-PhoenixCdpApplication" data-reactid="3">
    <div data-reactid="4">
     <div class="rc-PhoenixCdp2016" data-reactid="5" data-track="true" data-track-action="click" data-track-app="discovery" data-track-component="page" data-track-page="phoenix_cdp" role="presentation">
      <div class="rc-PageHeaderControls" data-reactid="6">
       <div class="rc-MobilePromoOption" data-reactid="7">
        <span data-reactid="8">
        </span>
       </div>
       <div class="smart-scroll-container" data-reactid="9">
        <div class="SmartScrollWrapper_1g73uxh" data-reactid="10" style="z-index:3000;box-shadow:none;">
         <div class="rc-PageHeader" data-reactid="11">
          <nav class="bt3-navbar c-ph-nav full-width" data-reactid="12">
           <div class="c-container bt3-conta

In [11]:
all_text = soup.html.body.findAll(text=True)
all_text[:10]

[' react-empty: 2 ',
 'Toggle navigation',
 'Navigation open',
 'Navigation closed',
 'Catalog',
 'Browse',
 'Search',
 'For Enterprise',
 'Log In',
 'Sign Up']

In [12]:
def tag_visible(element):
    '''Keep only visible elements'''
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

In [13]:
list(filter(tag_visible, all_text))[:15]

['Toggle navigation',
 'Navigation open',
 'Navigation closed',
 'Catalog',
 'Browse',
 'Search',
 'For Enterprise',
 'Log In',
 'Sign Up',
 'Fashion as Design',
 'Enroll',
 'Overview',
 'Syllabus',
 'FAQs',
 'Creators']

In [14]:
import re

def extract_visible(soup):
    text = soup.html.body.findAll(text=True)
    s = ' '.join(filter(tag_visible, text))
    return re.sub("\s\s+" , " ", s) # remove all double spaces and tabs/newlines/etc.

In [15]:
extract_visible(soup)[:1000]

'Toggle navigation Navigation open Navigation closed Catalog Browse Search For Enterprise Log In Sign Up Fashion as Design Enroll Overview Syllabus FAQs Creators Ratings and Reviews Fashion as Design Enroll Starts Jan 22 Financial Aid is available for learners who cannot afford the fee. Learn more and apply. Home Arts and Humanities Music and Art Fashion as Design The Museum of Modern Art About this course: Among all objects of design, our clothes are the most universal and intimate. Like other kinds of design, fashion thrives on productive tensions between form and function, automation and craftsmanship, standardization and customization, universality and self-expression, and pragmatism and utopian vision. It exists in the service of others, and it can have profound consequencesâ\x80\x94social, political, cultural, economic, and environmental. Fashion as Design focuses on a selection of more than 70 garments and accessories from around the world, ranging from kente cloth to jeans to 3

In [16]:
for folder in ("coursera", "edX", "general"):
    pos_urls, neg_urls = urls_list([folder])
    r_pos = requests.get(pos_urls[0])
    soup_pos = BeautifulSoup(r_pos.text, "lxml")
    
    r_neg = requests.get(neg_urls[0])
    soup_neg = BeautifulSoup(r_neg.text, "lxml")

    print("={}, {}=".format(folder, "positive"))
    print()
    print(extract_visible(soup_pos)[:1000])
    print()
    
    print("={}, {}=".format(folder, "negative"))
    print()
    print(extract_visible(soup_neg)[:1000])
    print()
            

=coursera, positive=

Toggle navigation Navigation open Navigation closed Catalog Browse Search For Enterprise Log In Sign Up Fashion as Design Enroll Overview Syllabus FAQs Creators Ratings and Reviews Fashion as Design Enroll Starts Jan 22 Financial Aid is available for learners who cannot afford the fee. Learn more and apply. Home Arts and Humanities Music and Art Fashion as Design The Museum of Modern Art About this course: Among all objects of design, our clothes are the most universal and intimate. Like other kinds of design, fashion thrives on productive tensions between form and function, automation and craftsmanship, standardization and customization, universality and self-expression, and pragmatism and utopian vision. It exists in the service of others, and it can have profound consequencesâsocial, political, cultural, economic, and environmental. Fashion as Design focuses on a selection of more than 70 garments and accessories from around the world, ranging from kente clot

In [19]:
pos_urls, neg_urls = urls_list(["edX"])

for url in pos_urls[:5]:
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")

    print("={}=".format(folder))
    print()
    print(extract_visible(soup)[:1000])

=general=

 Skip to main content 
=general=

 Skip to main content 
=general=

 Skip to main content 
=general=

 Skip to main content 
=general=

 Skip to main content 
