In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

In [2]:
import requests
import re

from bs4 import BeautifulSoup
from bs4.element import Comment

In [3]:
# Project paths

manual_path = "webpages/manually_selected"


def urls_path(folder, positive=True):
    """Return the path of the URL list file for ``folder``.

    The file is ``<manual_path>/urls/<folder>-positive.txt`` when ``positive``
    is truthy and ``<manual_path>/urls/<folder>-negative.txt`` otherwise.
    """
    suffix = "negative" if not positive else "positive"
    return "{}/urls/{}-{}.txt".format(manual_path, folder, suffix)

def urls_list(folder, positive=True):
    """Read the URL list file for ``folder`` and return its URLs.

    Args:
        folder: Name of the URL group; forwarded to ``urls_path``.
        positive: Forwarded to ``urls_path`` to pick the positive or the
            negative list.

    Returns:
        A list with one stripped string per line of the file, skipping
        blank lines and comment lines (those starting with ``#``).
    """
    path = urls_path(folder, positive)

    with open(path, 'r') as f:
        # Strip before filtering so that blank lines and indented "#"
        # comments are dropped instead of producing empty or bogus URLs.
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith("#")]

In [4]:
# Keep visible text

def tag_visible(element):
    '''Tell whether a text node should be kept as visible text.

    A node is rejected when its parent's tag name is one of the listed
    names, or when the node itself is a ``Comment``.
    '''
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    in_hidden_parent = element.parent.name in hidden_parents
    return not (in_hidden_parent or isinstance(element, Comment))

def extract_visible(soup):
    """Return the visible text of a parsed page as a single string.

    Text nodes under ``<html><body>`` are filtered with ``tag_visible``,
    joined with single spaces, and every run of whitespace (spaces, tabs,
    newlines, ...) is collapsed to one space.

    Args:
        soup: Parsed document exposing ``.html.body``.

    Returns:
        The normalized text, or ``""`` when the document has no
        ``<html>``/``<body>`` element.
    """
    html = soup.html
    body = html.body if html is not None else None
    if body is None:
        return ""

    text = body.find_all(string=True)
    s = ' '.join(filter(tag_visible, text))
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    # "\s+" (rather than "\s\s+") also replaces a lone tab or newline, which
    # is what "remove tabs/newlines" was meant to do.
    return re.sub(r"\s+", " ", s)

In [5]:
# Labelled URL lists: the "coursera" and "general" groups concatenated,
# positives from the *-positive.txt files, negatives from *-negative.txt.
positives = urls_list("coursera") + urls_list("general")
negatives = urls_list("coursera", False) + urls_list("general", False)

In [6]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body with the lxml parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        A ``BeautifulSoup`` object built from the response text. The HTTP
        status code is not checked, so error pages are parsed like any
        other page.
    """
    r = requests.get(url, timeout=timeout)
    return BeautifulSoup(r.text, "lxml")

In [7]:
def construct_features(soup, url, label):
    """Build one feature row for a parsed page.

    The row holds ``url``, ``label`` and, for each counted tag name, the
    number of matches returned by ``soup.find_all`` under the key
    ``<tag>_count``.

    Returns:
        A ``(keys, values)`` pair: the dict views of the feature names and
        of the corresponding values.
    """
    counted_tags = ("a", "iframe", "h1", "h2", "h3", "video", "button")

    feats = {"url": url, "label": label}
    for tag in counted_tags:
        feats[tag + "_count"] = len(soup.find_all(tag))

    return feats.keys(), feats.values()

In [8]:
# Featurize a single page (fetched live) only to obtain the feature names,
# which are reused below as the DataFrame column names.
first_url = positives[0]
columns = construct_features(get_soup(first_url), first_url, True)[0]
columns

dict_keys(['h2_count', 'iframe_count', 'button_count', 'url', 'h1_count', 'video_count', 'a_count', 'h3_count', 'label'])

In [9]:
# Accumulator for feature rows; the following cell appends one row per page.
row_list = []

In [10]:
# Featurize the first five URLs of each class, downloading every page live.
# Positives are processed first (label True), then negatives (label False).
for label, batch in ((True, positives[:5]), (False, negatives[:5])):
    for url in batch:
        row_list.append(construct_features(get_soup(url), url, label)[1])

In [11]:
# One row per page, indexed by its URL.
df = pd.DataFrame(row_list, columns=columns).set_index("url")

In [12]:
# Seeded so the preview is the same on every run; capped at len(df) because
# sampling more rows than exist (without replacement) raises.
df.sample(n=min(10, len(df)), random_state=0)

Unnamed: 0_level_0,h2_count,iframe_count,button_count,h1_count,video_count,a_count,h3_count,label
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
https://www.coursera.org/learn/thomas-berry/,2,0,18,1,0,64,0,True
https://www.coursera.org/unige,0,0,1,1,0,254,0,False
https://www.coursera.org/learn/journey-knowledge-action,2,0,18,1,0,64,0,True
https://www.coursera.org/,1,0,4,2,0,103,11,False
https://www.coursera.org/learn/journey-of-the-universe,2,0,19,1,0,64,0,True
https://www.coursera.org/learn/science-of-meditation,2,0,13,1,0,59,0,True
https://www.coursera.org/enterprise,10,2,1,7,0,32,4,False
https://www.coursera.org/ita,0,0,1,1,0,55,0,False
https://www.coursera.org/about/partners,0,0,1,0,0,37,0,False
https://www.coursera.org/learn/fashion-design,2,0,18,1,0,64,0,True


In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Fixed seed so that the forest, and therefore the scores below, are reproducible.
rf_estimator = RandomForestClassifier(random_state=0)

In [15]:
# Features are every remaining column (the URL is the index, not a feature);
# the target is the boolean label.
X = df.drop("label", axis=1)
y = df["label"]

In [16]:
# Cross-validate on the small sample. No `cv` is passed, so the number of
# folds is whatever the installed scikit-learn uses by default.
scores = cross_val_score(rf_estimator, X, y)
scores

array([ 1.  ,  0.75,  1.  ])

In [17]:
import grequests

In [18]:
def url_to_filename(url):
    """Turn a URL into a flat file name by replacing every "/" with ","."""
    return ",".join(url.split("/"))

def filename_to_url(filename):
    """Turn a cache file name back into a URL by replacing every "," with "/".

    This only undoes ``url_to_filename`` for URLs that contained no comma,
    because every comma in ``filename`` becomes a slash.
    """
    return "/".join(filename.split(","))

In [19]:
import os
dumps_path = "{}/cache".format(manual_path)


def dump_path(url):
    """Return the cache file path used for ``url`` inside ``dumps_path``."""
    return "/".join((dumps_path, url_to_filename(url)))

def delete_all_files(folder):
    """Remove every regular file directly inside ``folder``.

    Sub-directories are left untouched. A failure to remove one file is
    printed and does not stop the remaining files from being processed.
    """
    for entry_name in os.listdir(folder):
        target = os.path.join(folder, entry_name)
        if not os.path.isfile(target):
            continue
        try:
            os.remove(target)
        except Exception as err:
            print(err)
            
def is_cached(url):
    """Tell whether a cache file already exists for ``url``."""
    cache_file = dump_path(url)
    return os.path.isfile(cache_file)

def get_cached(url):
    """Return the cached page source for ``url``.

    Args:
        url: URL whose cache file (see ``dump_path``) should be read.

    Returns:
        The full content of the cache file as a string.

    Raises:
        FileNotFoundError: If no cache file exists for ``url``. This is a
            subclass of ``Exception``, so handlers written for the former
            generic exception still catch it.
    """
    if not is_cached(url):
        raise FileNotFoundError("{} not cached".format(url))

    # The context manager closes the file; no explicit close() is needed.
    with open(dump_path(url), 'r') as f:
        return f.read()

def get_original_url(response):
    """Return the URL at the start of the response's redirect chain.

    When ``response.history`` is non-empty the URL of its first entry is
    returned; otherwise ``response.url`` is returned.
    """
    if not response.history:
        return response.url
    first_hop = response.history[0]
    return first_hop.url
        

def async_download_pages(urls, delete_cache=False, threads=15):
    """Download the pages in ``urls`` concurrently into the cache folder.

    Args:
        urls: Iterable of URLs to download. URLs that already have a cache
            file are skipped.
        delete_cache: When true, every file in the cache folder is removed
            first, forcing all pages to be downloaded again.
        threads: Maximum number of concurrent requests (passed as ``size``
            to ``grequests.map``).

    Responses with status 200 are written to the cache; any other status,
    and any request that produced no response at all, is reported on
    stdout and left uncached.
    """
    # The cache folder may not exist yet (e.g. on a fresh checkout).
    os.makedirs(dumps_path, exist_ok=True)

    if delete_cache:
        delete_all_files(dumps_path)

    # Only request pages not already in the dump folder
    pending = [url for url in urls if not is_cached(url)]
    reqs = [grequests.get(url) for url in pending]

    # map() returns the responses in request order (None for a request that
    # failed outright), so every response can be paired with the exact URL
    # string it was requested with. Writing the cache under that string keeps
    # it consistent with is_cached()/get_cached(), whereas the URL recorded on
    # the response may have been normalised by requests.
    for url, r in zip(pending, grequests.map(reqs, size=threads)):
        if r is None:
            print("Error while downloading: {}. No response received".format(url))
        elif r.status_code == 200:
            with open(dump_path(url), 'w') as f:
                f.write(r.text)
        else:
            print("Error while downloading: {}. Status code: {}".format(r.url, r.status_code))

In [20]:
# Quick check of the downloader on two pages before running it on everything.
async_download_pages(["https://www.coursera.org/", "https://www.coursera.org/enterprise"])

In [21]:
# Every URL from every list file: positives first, then negatives, and within
# each the "coursera", "edX" and "general" groups in that order.
all_urls = [url 
            for pos in (True, False) 
            for folder in ("coursera", "edX", "general") 
            for url in urls_list(folder, pos)]


In [22]:
# Download all pages asynchronously.
# sample(all_urls, len(all_urls)) returns all the URLs in a random order.
from random import sample
async_download_pages(sample(all_urls, len(all_urls)))

Error while downloading: https://www.amazon.com/Low-Price-With-Free-Shipping/. Status code: 404
Error while downloading: https://business.udemy.com/?ref=ufb_header. Status code: 403
Error while downloading: http://blog.edx.org/learner-stories/. Status code: 403
Error while downloading: http://blog.edx.org/. Status code: 403
Error while downloading: https://www.amazon.com/. Status code: 503
Error while downloading: https://about.udemy.com/blog/?ref=footer. Status code: 403
Error while downloading: https://about.udemy.com/careers/. Status code: 403
Error while downloading: http://blog.openclassrooms.com/en/. Status code: 403


In [23]:
# Try again with all the pages, this time reading them from the cache.

# Start from an empty accumulator so rows from the earlier 5+5 sample are
# not carried over.
row_list = []

# Same two groups as before ("coursera" and "general"), built per folder.
positives = [url for folder in ["coursera", "general"] for url in urls_list(folder, True)]
negatives = [url for folder in ["coursera", "general"] for url in urls_list(folder, False)]

In [24]:
# Featurize every cached page, positives first (label True) and then
# negatives (label False). Pages missing from the cache are reported and
# skipped; the check is explicit so that a genuine parsing or featurizing
# error surfaces instead of being mislabelled as "not cached".
# Note: rows are appended to row_list, so re-running this cell without
# resetting row_list first duplicates them.
for label, labelled_urls in ((True, positives), (False, negatives)):
    for url in labelled_urls:
        if not is_cached(url):
            print("{} not cached".format(url))
            continue
        soup = BeautifulSoup(get_cached(url), "lxml")
        row_list.append(construct_features(soup, url, label)[1])

https://business.udemy.com/?ref=ufb_header not cached
https://about.udemy.com/blog/?ref=footer not cached
https://about.udemy.com/careers/ not cached
http://blog.openclassrooms.com/en/ not cached
https://www.amazon.com/ not cached
https://www.amazon.com/Low-Price-With-Free-Shipping/ not cached


In [25]:
# The column names are taken from featurizing an empty document, so this cell
# does not depend on whichever `soup`/`url` values earlier loops left behind.
feature_columns = list(construct_features(BeautifulSoup("", "lxml"), "", False)[0])
df = pd.DataFrame(row_list, columns=feature_columns).set_index("url")

In [26]:
# Fixed seed so that the forest, and therefore the mean score, is reproducible.
rf_estimator = RandomForestClassifier(random_state=0)
X = df.drop("label", axis=1)
y = df["label"]
scores = cross_val_score(rf_estimator, X, y, cv=3)
np.mean(scores)

0.77959997427487293

In [27]:
# Refit on every row (no hold-out) to inspect the per-feature importances;
# the values are presumably in the column order of X. Verify before labelling.
rf_estimator.fit(X, y)
rf_estimator.feature_importances_

array([ 0.21419424,  0.07935099,  0.18777779,  0.13393149,  0.03981234,
        0.17544023,  0.16949291])