In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

In [None]:
import requests
import re

from bs4 import BeautifulSoup
from bs4.element import Comment

In [None]:
# Project paths

manual_path = "webpages/manually_selected"


def urls_path(folder, positive=True):
    '''Build the path of the URL list file for `folder`.

    The file sits in `<manual_path>/urls/` and is named
    `<folder>-positive.txt` when `positive` is truthy,
    `<folder>-negative.txt` otherwise.
    '''
    suffix = "negative"
    if positive:
        suffix = "positive"
    return "{}/urls/{}-{}.txt".format(manual_path, folder, suffix)

def urls_list(folder, positive=True):
    '''Read the URL list file of `folder` and return its URLs.

    Parameters
    ----------
    folder : str
        Name of the URL collection, forwarded to `urls_path`.
    positive : bool
        Selects the positive (default) or the negative list.

    Returns
    -------
    list of str
        One entry per non-empty, non-comment line, with surrounding
        whitespace removed.

    Raises
    ------
    OSError
        If the list file cannot be opened.
    '''
    path = urls_path(folder, positive)

    with open(path, 'r') as f:
        # Strip first so that indented "#" comments are recognised, and
        # drop blank lines, which would otherwise end up as "" URLs.
        stripped = (line.strip() for line in f)
        urls = [line for line in stripped if line and not line.startswith("#")]

    return urls

In [None]:
# Keep visible text

def tag_visible(element):
    '''Tell whether a text node counts as visible page content.

    Returns False when the node's parent tag is one of the names listed
    below, or when the node itself is a `Comment`; True otherwise.
    '''
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent test runs first; the comment test only runs if it passes.
    hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not hidden

def extract_visible(soup):
    '''Return the visible text of a parsed page as a single string.

    Every text node under `<body>` that `tag_visible` accepts is kept; the
    nodes are joined with single spaces and each run of whitespace
    (spaces, tabs, newlines, ...) is collapsed into one space.

    Parameters
    ----------
    soup : parsed document exposing `.html.body.findAll(text=True)`.

    Returns
    -------
    str
        The normalised text, or "" when the document has no `<html>` or
        `<body>` element.
    '''
    html = soup.html
    body = html.body if html is not None else None
    if body is None:
        # Nothing to extract (e.g. empty or non-HTML response).
        return ""

    text_nodes = body.findAll(text=True)
    joined = ' '.join(filter(tag_visible, text_nodes))
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    # `\s+` (instead of `\s\s+`) also turns a lone tab/newline into a space.
    return re.sub(r"\s+", " ", joined)

In [None]:
# Labelled URL lists built from the "coursera" and "general" collections.
positives = [url for folder in ("coursera", "general") for url in urls_list(folder)]
negatives = [url for folder in ("coursera", "general") for url in urls_list(folder, False)]

In [None]:
def get_soup(url, timeout=30):
    '''Download `url` and parse the response body with the lxml parser.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed to `requests.get`; without it a stalled server would block
        the call indefinitely.

    Returns
    -------
    BeautifulSoup
        Parsed document built from the response text.

    Raises
    ------
    Whatever `requests.get` raises (connection errors, timeouts, ...).
    The HTTP status code is not checked: error pages are parsed as well.
    '''
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")

In [None]:
def construct_features(soup):
    '''Compute the feature vector of a parsed page.

    Returns a two-element list: the number of `<a>` tags followed by the
    number of `<iframe>` tags found by `soup.find_all`.
    '''
    counted_tags = ("a", "iframe")
    return [len(soup.find_all(tag)) for tag in counted_tags]

In [None]:
# Accumulator for the rows later turned into a DataFrame.
row_list = list()

In [None]:
# First ten positives, then first ten negatives: one row per page made of
# the URL, its label and the features of the freshly downloaded page.
for label, urls in ((True, positives[:10]), (False, negatives[:10])):
    for url in urls:
        features = construct_features(get_soup(url))
        row_list.append([url, label] + features)

In [None]:
# One row per page, indexed by its URL.
df = pd.DataFrame(
    row_list,
    columns=["url", "label", "a_count", "iframe_count"],
).set_index("url")

In [None]:
# Show ten randomly chosen rows.
df.sample(n=10)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fixed seed: a random forest is stochastic, so without `random_state` the
# scores computed from this estimator change on every run of the notebook.
rf_estimator = RandomForestClassifier(random_state=0)

In [None]:
# Target vector is the "label" column; the feature matrix is everything else.
y = df["label"]
X = df.drop(["label"], axis="columns")

In [None]:
# Cross-validated scores of the forest, using the library's default splitting.
scores = cross_val_score(estimator=rf_estimator, X=X, y=y)
scores

In [None]:
import grequests

In [None]:
def url_to_filename(url):
    '''Turn a URL into a flat file name by replacing every "/" with ",".

    Commas already present in the URL are left as they are, so they cannot
    be told apart from converted slashes afterwards.
    '''
    pieces = url.split("/")
    return ",".join(pieces)

def filename_to_url(filename):
    '''Turn a dump file name back into a URL by replacing every "," with "/".'''
    pieces = filename.split(",")
    return "/".join(pieces)

In [None]:
import os
# Folder holding the downloaded page dumps.
dumps_path = "{}/dumps".format(manual_path)

def dump_path(url):
    '''Return the path of the dump file associated with `url`.

    The file name is `url_to_filename(url)`, placed inside `dumps_path`.
    '''
    file_name = url_to_filename(url)
    return "{}/{}".format(dumps_path, file_name)

def delete_all_files(folder):
    '''Remove every regular file located directly inside `folder`.

    Sub-directories are left alone. An error raised while handling one
    entry is printed and the remaining entries are still processed.
    '''
    # os.listdir runs once, before any deletion takes place.
    candidates = (os.path.join(folder, name) for name in os.listdir(folder))
    for candidate in candidates:
        try:
            if not os.path.isfile(candidate):
                continue
            os.remove(candidate)
        except Exception as err:
            print(err)
            
def is_cached(url):
    '''Return True when the dump file of `url` exists as a regular file.'''
    cached_file = dump_path(url)
    return os.path.isfile(cached_file)

def get_cached(url):
    '''Return the content of the dump file stored for `url`.

    Raises `Exception` with the message "<url> not cached" when
    `is_cached(url)` is false.
    '''
    if is_cached(url):
        # The `with` block closes the file; no explicit close is needed.
        with open(dump_path(url), 'r') as cached_file:
            return cached_file.read()

    raise Exception("{} not cached".format(url))

def get_original_url(response):
    '''Return the URL of the first entry in `response.history`, or
    `response.url` itself when the history is empty.'''
    if response.history:
        return response.history[0].url
    return response.url
        

def async_download_pages(urls, delete_cache=False, threads=15, timeout=30):
    '''Download the pages that are not cached yet and store them as dumps.

    Parameters
    ----------
    urls : iterable of str
        Addresses to fetch; those for which `is_cached` is true are skipped.
    delete_cache : bool
        When true, every file of the dump folder is removed first.
    threads : int
        Pool size handed to `grequests.imap`.
    timeout : float or tuple, optional
        Per-request timeout, so a stalled server cannot block the pool.

    Responses with status 200 are written to
    `dump_path(get_original_url(response))`. Other status codes and requests
    that raised are reported on stdout and skipped.
    '''
    # Create the dump folder up front: both the cache wipe and the writes
    # below fail on a checkout where it does not exist yet.
    if not os.path.isdir(dumps_path):
        os.makedirs(dumps_path)

    if delete_cache:
        delete_all_files(dumps_path)

    # Only request pages not already in the dump folder
    reqs = [grequests.get(url, timeout=timeout) for url in urls if not is_cached(url)]

    def report_failure(request, exception):
        # Without a handler, imap silently drops requests that raised.
        # Returning None keeps imap from yielding anything for this request.
        print("Error while downloading: {}. Exception: {}".format(request.url, exception))

    for r in grequests.imap(reqs, size=threads, exception_handler=report_failure):
        if r.status_code == 200:
            # NOTE(review): the cache key is the first URL of the redirect
            # chain; presumably it equals the requested URL string - confirm,
            # otherwise `is_cached` will never hit for that page.
            with open(dump_path(get_original_url(r)), 'w') as f:
                f.write(r.text)
        else:
            print("Error while downloading: {}. Status code: {}".format(r.url, r.status_code))

In [None]:
# Download two Coursera pages into the dump folder.
async_download_pages([
    "https://www.coursera.org/",
    "https://www.coursera.org/enterprise",
])

In [None]:
# Every URL of every collection: all positive lists first, then all
# negative lists, each in the order coursera, edX, general.
all_urls = []
for pos in (True, False):
    for folder in ("coursera", "edX", "general"):
        all_urls.extend(urls_list(folder, pos))


In [None]:
# Download all pages asynchronously
from random import sample
# Sampling with k == len(all_urls) yields every URL once, in random order.
async_download_pages(sample(all_urls, k=len(all_urls)))

In [None]:
# try again with all the pages

# Start again from an empty row list and rebuild both labelled URL lists
# from all three collections.
row_list, positives, negatives = [], [], []

for folder in ("coursera", "edX", "general"):
    positives.extend(urls_list(folder, True))
    negatives.extend(urls_list(folder, False))

In [None]:
# Build one row per cached page: URL, label, then the page features.
# Positives are processed first, then negatives.
for label, urls in ((True, positives), (False, negatives)):
    for url in urls:
        # Report missing dumps explicitly instead of inferring them from
        # whatever exception happens to be raised further down.
        if not is_cached(url):
            print("{} not cached".format(url))
            continue

        try:
            soup = BeautifulSoup(get_cached(url), "lxml")
            row_list.append([url, label] + construct_features(soup))
        except Exception as e:
            # `Exception` rather than a bare `except`, so that interrupting
            # the kernel still works; the real cause is printed and the
            # loop carries on with the next page.
            print("Error while processing {}: {}".format(url, e))

In [None]:
# One row per cached page, indexed by its URL.
df = pd.DataFrame(
    row_list,
    columns=["url", "label", "a_count", "iframe_count"],
).set_index("url")

In [None]:
# Show five randomly chosen rows.
df.sample(n=5)

In [None]:
# Fixed seed: a random forest is stochastic, so without `random_state` the
# cross-validation scores change on every run of the notebook.
rf_estimator = RandomForestClassifier(random_state=0)

# Feature matrix is every column except the label; the label is the target.
X = df.drop("label", axis=1)
y = df["label"]

# 5-fold cross-validation on all cached pages.
scores = cross_val_score(rf_estimator, X, y, cv=5)
scores