In [5]:
# Mount Google Drive at /content/drive/ so that files under
# '/content/drive/My Drive/...' can be read and written by later cells.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [6]:
# Install goose3; the feature-extraction cell imports `Goose` from it.
!pip install goose3



In [0]:


import pandas as pd
import pickle
from bs4 import BeautifulSoup                          #scrape information from web pages
from goose3 import Goose                               #extract information from news articles
from collections import Counter                        
import string
from joblib import Parallel, delayed
import sys
from tqdm import tqdm                                    

# Words that features() drops from the extracted text (exact match on the
# lower-cased, punctuation-stripped word).
stop_domains = [
    'buzzfeed',
    'clickhole',
    'cnn',
    'wikinews',
    'upworthy',
    'nytimes',
]


# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _heading_stats(soup, tag):
    """Return (count, total_len, avg_len, joined_text) for all `tag` elements in `soup`.

    Lengths are measured on the whitespace-stripped text of each element.
    `avg_len` is the integer 0 (not 0.0) when no element is found.
    """
    texts = [node.get_text().strip() for node in soup.find_all(tag)]
    count = len(texts)
    total_len = sum(len(text) for text in texts)
    avg_len = total_len / count if count else 0
    return count, total_len, avg_len, ' '.join(texts)


def _normalize_text(*parts):
    """Join `parts`, strip ASCII punctuation, lowercase, and drop words in `stop_domains`."""
    combined = ' '.join(parts).translate(_PUNCTUATION_TABLE)
    words = combined.strip().lower().split()
    return ' '.join(word for word in words if word not in stop_domains)


def features(html):
    """Extract structural and textual features from one raw HTML document.

    Parameters
    ----------
    html : str
        Raw HTML of a page.

    Returns
    -------
    list
        18 values, in this order: size (``sys.getsizeof(html)``), html_len,
        number of <a>, <button>, <input>, <ul>, <ol>, lists (ul + ol),
        <h1>, <h2>, total h1 text length, total h2 text length,
        average h1 text length, average h2 text length, number of <img>,
        number of tags, number of distinct tag names, and a normalized text
        string built from the meta description plus h1/h2 text.
        If anything fails, the sentinel ``[-1] * 17 + ["no data"]`` is
        returned instead of raising.
    """
    try:
        soup = BeautifulSoup(html, "lxml")

        # Use Goose as a context manager so its resources are released after
        # every call instead of accumulating across thousands of documents.
        meta_description = ''
        with Goose() as g:
            try:
                goose_article = g.extract(raw_html=html)
            except (TypeError, IndexError):
                goose_article = None
            if goose_article is not None:
                # A missing description must not invalidate the whole row.
                meta_description = goose_article.meta_description or ''

        size = sys.getsizeof(html)
        html_len = len(html)
        count_of_elements_links = len(soup.find_all('a'))
        count_of_elements_buttons = len(soup.find_all('button'))
        count_of_elements_inputs = len(soup.find_all('input'))
        count_of_elements_ul = len(soup.find_all('ul'))
        count_of_elements_ol = len(soup.find_all('ol'))
        count_of_elements_lists = count_of_elements_ol + count_of_elements_ul

        count_of_elements_h1, total_h1_len, avg_h1_len, h1_text = _heading_stats(soup, 'h1')
        count_of_elements_h2, total_h2_len, avg_h2_len, h2_text = _heading_stats(soup, 'h2')

        parser_html_data_text = _normalize_text(meta_description, h1_text, h2_text)

        count_of_elements_images = len(soup.find_all('img'))

        # Walk the full tree once and derive both tag counts from it.
        tag_names = [node.name for node in soup.find_all()]
        count_of_elements_tags = len(tag_names)
        count_of_elements_unique_tags = len(set(tag_names))

        return [size, html_len, count_of_elements_links, count_of_elements_buttons,
                count_of_elements_inputs, count_of_elements_ul, count_of_elements_ol, count_of_elements_lists,
                count_of_elements_h1, count_of_elements_h2, total_h1_len, total_h2_len, avg_h1_len, avg_h2_len,
                count_of_elements_images, count_of_elements_tags, count_of_elements_unique_tags,
                parser_html_data_text]
    except Exception:
        # Best-effort extraction: a document that cannot be processed yields a
        # sentinel row. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate so a run can be stopped.
        return [-1] * 17 + ["no data"]


# Column names for the feature table; the order must match the list returned by features().
FEATURE_COLUMNS = ["size", "html_len", "number_of_links", "number_of_buttons",
                   "number_of_inputs", "number_of_ul", "number_of_ol", "number_of_lists",
                   "number_of_h1", "number_of_h2", "total_h1_len", "total_h2_len",
                   "avg_h1_len", "avg_h2_len",
                   "number_of_images", "number_of_tags", "number_of_unique_tags",
                   "textdata"]

# Number of parallel joblib workers used for feature extraction.
N_JOBS = 30


def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _extract_all(pages):
    """Run features() over every document in `pages` in parallel, showing a progress bar."""
    return Parallel(n_jobs=N_JOBS)(delayed(features)(html) for html in tqdm(pages))


# Clickbait pages: load raw HTML, extract features, save as CSV.
clickbait_html = _load_pickle('/content/drive/My Drive/web_scrap/data/clickbait_html.pkl')
clickbait_features = _extract_all(clickbait_html)
clickbait_features_df = pd.DataFrame(clickbait_features, columns=FEATURE_COLUMNS)
clickbait_features_df.to_csv('/content/drive/My Drive/web_scrap/data/clickbait_website_features.csv', index=False, encoding='utf-8')

# Non-clickbait pages: same pipeline, separate input and output files.
non_clickbait_html = _load_pickle('/content/drive/My Drive/web_scrap/data/non_clickbait_html.pkl')
non_clickbait_features = _extract_all(non_clickbait_html)
non_clickbait_features_df = pd.DataFrame(non_clickbait_features, columns=FEATURE_COLUMNS)
non_clickbait_features_df.to_csv('/content/drive/My Drive/web_scrap/data/non_clickbait_website_features.csv', index=False, encoding='utf-8')




  0%|          | 0/2137 [00:00<?, ?it/s][A[A

  0%|          | 1/2137 [00:04<2:47:57,  4.72s/it][A[A

  2%|▏         | 39/2137 [00:04<1:55:30,  3.30s/it][A[A

  3%|▎         | 60/2137 [00:08<1:21:40,  2.36s/it][A[A

  3%|▎         | 66/2137 [00:11<1:02:23,  1.81s/it][A[A

  3%|▎         | 71/2137 [00:14<49:22,  1.43s/it]  [A[A

  4%|▎         | 75/2137 [00:14<35:14,  1.03s/it][A[A

  4%|▎         | 78/2137 [00:14<26:26,  1.30it/s][A[A

  4%|▎         | 80/2137 [00:15<19:27,  1.76it/s][A[A

  4%|▍         | 82/2137 [00:15<17:36,  1.95it/s][A[A

  4%|▍         | 84/2137 [00:16<14:15,  2.40it/s][A[A

  4%|▍         | 86/2137 [00:16<12:11,  2.80it/s][A[A

  4%|▍         | 87/2137 [00:17<15:54,  2.15it/s][A[A

  4%|▍         | 88/2137 [00:17<12:22,  2.76it/s][A[A

  4%|▍         | 89/2137 [00:17<10:24,  3.28it/s][A[A

  4%|▍         | 91/2137 [00:17<08:38,  3.95it/s][A[A

  4%|▍         | 92/2137 [00:18<10:45,  3.17it/s][A[A

  4%|▍         | 93/2137 [00: