In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

%load_ext autoreload
%autoreload 2

In [None]:
import os
import random

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn import metrics

In [None]:
import mpld3
mpld3.enable_notebook()

In [None]:
from crawler import BFS_crawl, draw_graph
from helpers import async_cache_pages, urls_list
from features import construct_text_df

In [None]:
# Single seed shared by the notebook: it seeds Python's `random` module here and is
# reused as `random_state` by balanced_classes() and LogisticRegression further down.
seed = 0
random.seed(seed)

In [None]:
# Folder names for the training split and the two test splits, each wrapped in a
# single-element list; analyse_easy_hard() hands them to urls_list().
folder_train = ['train-1']
folder_test_easy = ['test-easy']
folder_test_hard = ['test-hard']

In [None]:
def balanced_classes(df, random_state=None):
    """Downsample so that both labels are equally represented.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column holding ``True`` / ``False``.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``. When omitted, the notebook-level
        ``seed`` is used, which is what this function always did before the
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        ``n`` sampled negative rows followed by ``n`` sampled positive rows,
        where ``n`` is the size of the smaller class. The original index
        values are kept.
    """
    if random_state is None:
        # Fall back to the notebook-wide seed only when the caller gave none.
        random_state = seed

    # Slice each class once and reuse the slices for both counting and sampling.
    negatives = df[df.label == False]
    positives = df[df.label == True]

    n_per_class = min(len(positives), len(negatives))

    return pd.concat([
        negatives.sample(n_per_class, random_state=random_state),
        positives.sample(n_per_class, random_state=random_state),
    ])

In [None]:
def _labelled_text_df(pos_urls, neg_urls):
    """Build the text dataframe for positive URLs followed by negative URLs, labelled True/False."""
    labels = [True] * len(pos_urls) + [False] * len(neg_urls)
    return construct_text_df(pos_urls + neg_urls, labels)


def _report_task(title, y_true, y_pred):
    """Print the classification report, confusion matrix and accuracy for one task; return the accuracy."""
    print("=={}==".format(title))
    print(metrics.classification_report(y_true, y_pred))
    print(metrics.confusion_matrix(y_true, y_pred))
    accuracy = metrics.accuracy_score(y_true, y_pred)
    print("Accuracy: {}".format(accuracy))
    return accuracy


def analyse_easy_hard(folder_train, folder_test_easy, folder_test_hard, download_pages=True):
    """Train a text classifier on the training folders and evaluate it on two test sets.

    Parameters
    ----------
    folder_train, folder_test_easy, folder_test_hard
        Passed as-is to ``urls_list``, which returns a (positive URLs,
        negative URLs) pair for each of them.
    download_pages : bool, default True
        When true, every URL of all three splits is handed to
        ``async_cache_pages`` before the dataframes are built.

    Returns
    -------
    tuple
        ``(pipeline, easy_accuracy, hard_accuracy)``: the fitted scikit-learn
        pipeline and the accuracy score on the easy and the hard test set.

    Notes
    -----
    Only the training frame is class-balanced (via ``balanced_classes``); the
    test frames are used as constructed. Progress and evaluation reports are
    printed to stdout.
    """
    train_pos, train_neg = urls_list(folder_train)
    test_pos_easy, test_neg_easy = urls_list(folder_test_easy)
    test_pos_hard, test_neg_hard = urls_list(folder_test_hard)

    if download_pages:
        print("Downloading pages")
        async_cache_pages(
            train_pos + train_neg
            + test_pos_easy + test_neg_easy
            + test_pos_hard + test_neg_hard
        )

    print("Constructing text dataframes")
    train_df = _labelled_text_df(train_pos, train_neg)
    test_easy_df = _labelled_text_df(test_pos_easy, test_neg_easy)
    test_hard_df = _labelled_text_df(test_pos_hard, test_neg_hard)

    train_df_balanced = balanced_classes(train_df)

    # Bag-of-words counts -> tf-idf weighting -> logistic regression.
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(random_state=seed, max_iter=1000)),
    ])

    print("Training the model")
    pipeline.fit(train_df_balanced.visible_text, train_df_balanced.label)

    # Both predictions are computed before any report is printed, so a failure
    # on either test set surfaces before partial results are shown.
    print("Predictions on test sets")
    y_easy_pred = pipeline.predict(test_easy_df.visible_text)
    y_hard_pred = pipeline.predict(test_hard_df.visible_text)

    easy_accuracy = _report_task("Easy task", test_easy_df.label, y_easy_pred)
    hard_accuracy = _report_task("Hard task", test_hard_df.label, y_hard_pred)

    return pipeline, easy_accuracy, hard_accuracy


In [None]:
# Train and evaluate without re-downloading pages; the returned
# (pipeline, easy accuracy, hard accuracy) tuple is shown as the cell output.
analyse_easy_hard(
    folder_train=folder_train,
    folder_test_easy=folder_test_easy,
    folder_test_hard=folder_test_hard,
    download_pages=False,
)

In [None]:
class RootUrlIterator:
    """Endless iterator over root URLs that cycles through a list of query endings.

    Each produced value is ``base + query_beginning + " " + query_end``; after
    the last entry of ``query_ends`` it wraps around to the first one.

    Parameters
    ----------
    base : str
        Prefix of every URL (e.g. a search endpoint ending in ``?q=``).
    query_beginning : str
        Text placed directly after ``base`` in every URL.
    query_ends : sequence of str
        Endings cycled through, one per call to ``next``. The sequence is
        stored by reference, not copied.

    Notes
    -----
    The parts are joined with a literal space and are not URL-encoded here.
    """

    def __init__(self, base, query_beginning, query_ends):
        self.query_index = 0
        self.base = base
        self.query_beginning = query_beginning
        self.query_ends = query_ends

    def __iter__(self):
        # Required by the iterator protocol so the object works in `for`
        # loops and with itertools helpers, not only with next().
        return self

    def __next__(self):
        # Nothing to cycle through: behave like itertools.cycle([]) and stop,
        # rather than failing with an IndexError.
        if len(self.query_ends) == 0:
            raise StopIteration

        next_query_end = self.query_ends[self.query_index]
        # Advance and wrap so the sequence repeats forever.
        self.query_index = (self.query_index + 1) % len(self.query_ends)

        return "{}{} {}".format(self.base, self.query_beginning, next_query_end)

In [None]:
# Subject names appended, one at a time, to the "Online Course" search query.
possible_learning_fields = [
    "Mathematics",
    "Science",
    "Health",
    "Art",
    "Music",
    "Dance",
    "Leadership",
    "Algebra",
    "Life Science",
    "Social Studies",
    "Geography",
    "Programming",
    "History",
    "Biology",
]

# Iterator producing one search URL per subject, wrapping around at the end of the list.
root_iterator = RootUrlIterator(
    base="https://www.google.ch/search?q=",
    query_beginning="Online Course",
    query_ends=possible_learning_fields,
)


In [None]:
# Preview the next 15 URLs, one per line. This consumes them from the shared
# iterator, so later next() calls continue from where this preview stopped.
print(*(next(root_iterator) for _ in range(15)), sep="\n")

In [None]:
# Fit the baseline model on the cached pages and keep the pipeline plus both accuracies.
pipeline_initial, accuracy_initial_easy, accuracy_initial_hard = analyse_easy_hard(
    folder_train=folder_train,
    folder_test_easy=folder_test_easy,
    folder_test_hard=folder_test_hard,
    download_pages=False,
)

In [None]:
# Take the next search URL from the iterator as the root of the crawl.
root_initial = next(root_iterator)
print(root_initial)

# Crawl from the root printed above, using the baseline pipeline.
# This call previously received `random_root`, a name that is not defined
# anywhere in this notebook, so it raised NameError on a fresh kernel.
# NOTE(review): the two positional 3s are BFS_crawl parameters whose meaning is
# not visible here (presumably crawl limits) — confirm against crawler.py.
G_initial = BFS_crawl(root_initial, 3, 3, save=True, pipeline=pipeline_initial)

In [None]:
# TODO: use the Google API instead of crawling the google.ch search-results URL directly.

In [None]:
# Draw the graph returned by BFS_crawl using the pyplot module.
# NOTE(review): print_pos=True presumably also prints/labels node positions — confirm in crawler.py.
draw_graph(G_initial, plt, print_pos=True)

In [None]:
# Single example URL used for the ad-hoc caching experiment in the following cells.
urls = ["http://www.independent.co.uk/"]

In [None]:
# Run the project's page-caching helper on the example URL list.
# NOTE(review): presumably downloads and stores the pages for later get_cached() calls — confirm in helpers.py.
async_cache_pages(urls)

In [None]:
from helpers import get_cached
from features import extract_visible
from bs4 import BeautifulSoup

In [None]:
#extract_visible(BeautifulSoup(get_cached(urls[0]), 'lxml'))