In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Select seaborn's 'notebook' plotting context for the figures in this notebook.
sns.set_context(context='notebook')

In [None]:
import requests
import re

from bs4 import BeautifulSoup
from bs4.element import Comment

In [None]:
# Project paths

manual_path = "webpages/manually_selected"

def make_path(folder, positive=True):
    """Return the path of the URL list file for ``folder``.

    Args:
        folder: Name used as the file-name prefix (e.g. "coursera").
        positive: Selects the ``-positive`` file when truthy and the
            ``-negative`` file otherwise.

    Returns:
        The string ``<manual_path>/urls/<folder>-<positive|negative>.txt``.
    """
    pos = "positive" if positive else "negative"
    path = "{}/urls/{}-{}.txt".format(manual_path, folder, pos)
    return path

def urls_list(folder, positive=True):
    """Read the URL list file for ``folder`` and return its URLs.

    Every line is stripped of surrounding whitespace. Blank lines and lines
    whose first non-blank character is ``#`` are skipped, so the result never
    contains empty strings or comment lines.

    Args:
        folder: Name used as the file-name prefix (see ``make_path``).
        positive: Read the ``-positive`` file when truthy, else ``-negative``.

    Returns:
        A list of URL strings in file order.

    Raises:
        OSError: If the list file cannot be opened.
    """
    path = make_path(folder, positive)

    # The ``with`` block closes the file on exit; no explicit close() is needed.
    with open(path, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        # Strip first, then filter: catches blank lines and indented comments.
        urls = [line for line in stripped_lines
                if line and not line.startswith("#")]

    return urls

In [None]:
# Keep visible text

def tag_visible(element):
    '''Tell whether a text node belongs to the visible part of the page.

    Returns False when the node's parent is one of the non-rendered
    containers listed below, or when the node is an HTML comment;
    True in every other case.
    '''
    hidden_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
    if element.parent.name in hidden_parents:
        return False
    # Comments are never displayed, whatever their parent tag is.
    return not isinstance(element, Comment)

def extract_visible(soup):
    """Return the visible text of a parsed page as one normalised string.

    Args:
        soup: A parsed BeautifulSoup document.

    Returns:
        The text nodes accepted by ``tag_visible``, joined with single
        spaces, with every run of whitespace (spaces, tabs, newlines, ...)
        collapsed to one space.
    """
    # Prefer <html><body>; fall back to the whole document when either tag is
    # missing instead of failing with AttributeError on None.
    html = soup.html
    body = html.body if html is not None else None
    root = body if body is not None else soup

    # ``string=True`` is the current spelling of the deprecated ``text=True``.
    text = root.find_all(string=True)
    s = ' '.join(filter(tag_visible, text))

    # Raw string avoids the invalid "\s" escape; ``\s+`` (rather than
    # ``\s\s+``) also replaces a lone tab or newline inside a text node.
    return re.sub(r"\s+", " ", s)

In [None]:
# Labelled URL lists: each one concatenates the "coursera" and "general" files.
positives = [*urls_list("coursera"), *urls_list("general")]
negatives = [*urls_list("coursera", False), *urls_list("general", False)]

In [None]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body with the lxml parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout ``requests`` may wait indefinitely, which would
            stall the notebook on a single unresponsive host.

    Returns:
        A ``BeautifulSoup`` object built from the response text.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    r = requests.get(url, timeout=timeout)

    # NOTE: the HTTP status is not checked, so error pages are parsed like
    # any other response.
    return BeautifulSoup(r.text, "lxml")

In [None]:
def construct_features(soup):
    """Build the feature vector of a parsed page.

    Returns a two-element list: the number of ``<a>`` tags (links) followed
    by the number of ``<iframe>`` tags found in ``soup``.
    """
    counted_tags = ("a", "iframe")
    return [len(soup.find_all(tag_name)) for tag_name in counted_tags]

In [None]:
# Feature rows, one list per page: [url, label, a_count, iframe_count].
row_list = list()

In [None]:
# Start from an empty list so that re-running this cell rebuilds the rows
# instead of appending duplicates to whatever an earlier run left behind.
row_list = []

# Only the first five URLs of each list are downloaded.
for label, labelled_urls in ((True, positives), (False, negatives)):
    for url in labelled_urls[:5]:
        row_list.append([url, label] + construct_features(get_soup(url)))

In [None]:
# One row per page, indexed by its URL.
df = (
    pd.DataFrame(row_list, columns=["url", "label", "a_count", "iframe_count"])
    .set_index("url")
)

In [None]:
# Preview five randomly chosen rows.
df.sample(n=5)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fix the seed so the forest, and therefore the cross-validation scores,
# are reproducible across kernel restarts.
rf_estimator = RandomForestClassifier(random_state=0)

In [None]:
# Feature matrix (every column except the label) and target vector.
X = df.drop(columns="label")
y = df["label"]

In [None]:
# Cross-validate the random forest with scikit-learn's default settings.
scores = cross_val_score(estimator=rf_estimator, X=X, y=y)
scores

In [None]:
import grequests

In [None]:
def async_download_pages(urls, folder_path):
    """Download ``urls`` concurrently and save each page under ``folder_path``.

    Each response body is written to
    ``<folder_path>/<url with every "/" replaced by ",">``.
    URLs whose request failed are skipped.

    Args:
        urls: Iterable of page addresses to download.
        folder_path: Directory to write the pages into; it is not created
            here.
    """
    urls = list(urls)
    reqs = [grequests.get(url) for url in urls]

    # map() returns the responses in request order (None for a failed
    # request), so each response can be paired with the URL it was requested
    # for. imap() yields in completion order, which loses that link.
    responses = grequests.map(reqs, size=10)

    for url, r in zip(urls, responses):
        if r is None:
            continue
        file_path = "{}/{}".format(folder_path, url.replace("/", ","))
        # Explicit encoding: page text may contain characters the platform
        # default codec cannot represent. ``with`` closes the file on exit.
        with open(file_path, 'w', encoding="utf-8") as f:
            f.write(r.text)