In [1]:
import numpy as np
import pandas as pd

import requests
import csv
import time
from bs4 import BeautifulSoup

import torch
from transformers import BertTokenizer, BertModel

from progressbar import ProgressBar
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
model.eval()

def embed(text):
    
    tokenized_text = tokenizer.tokenize(text)
    
    if len(tokenized_text) > 510:
        tokenized_text = tokenized_text[:510]
        
    tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
    
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(indexed_tokens)

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    
    with torch.no_grad():
        output = model(tokens_tensor, segments_tensors)
        hidden_states = output[2] # tensor of shape 
    
    token_states = torch.stack(hidden_states, dim=0)
    token_states = torch.squeeze(token_states, dim=1) 
    token_embeddings = token_states[-2] 
    
    return torch.mean(token_embeddings, dim=0)


In [3]:
embedding_dim = 768

def content_features_fast(url):
    try:
        page = requests.get(url, timeout=5) # wait 12 second before timeout
        
    except(Exception):
        return None
    
    try: 
        soup = BeautifulSoup(page.text, 'html.parser')
        
    except(Exception):
        return None
    
    # find the p tag
    p = soup.find_all('p', limit=30)
    if (len(p)) == 0:
        return None
    para_embedding = np.zeros(embedding_dim)

    for i in range(len(p)):
        para_embedding += embed(p[i].text).numpy()

    return para_embedding / len(p)

In [5]:
folder = '/dlabdata1/lugeon/'
name = 'websites_10_000_5cat'
ext = '.gz'
data = pd.read_csv(folder + name + ext, header=0, names=['id', 'url', 'cat'])

In [6]:
data

Unnamed: 0,id,url,cat
0,133292,http://www.imdb.com/title/tt0044207/,Arts
1,126451,http://www.missgien.net/misc/films/robin.html,Arts
2,252430,http://www.ppmag.com,Arts
3,261535,http://us.imdb.com/title/tt0103488/,Arts
4,125161,http://www.hollywoodjesus.com/powder.htm,Arts
...,...,...,...
49995,1089710,http://www-sul.stanford.edu/depts/ssrg/misc/ra...,Science
49996,1115352,http://www.bowden-bros.com,Science
49997,1046715,http://www.botany.hawaii.edu/faculty/carr/imag...,Science
49998,1055812,http://chem4823.usask.ca/howto.html,Science


In [7]:
tqdm.pandas()
data['emb'] = data.progress_apply(lambda row: content_features_fast(row.url), axis=1)

  from pandas import Panel
  0%|          | 13/50000 [00:19<21:14:38,  1.53s/it]


KeyboardInterrupt: 

In [76]:
data.emb.isnull().value_counts()

True     26812
False    23188
Name: emb, dtype: int64

In [77]:
data_valid = data[~data.emb.isnull()]
data_valid['emb'] = data_valid.apply(lambda row: row.emb.tolist(), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_valid['emb'] = data_valid.apply(lambda row: row.emb.tolist(), axis=1)


In [78]:
data_valid.head()

Unnamed: 0,id,url,cat,emb
0,133292,http://www.imdb.com/title/tt0044207/,Arts,"[0.19638090891848234, -0.2036611011082476, 0.1..."
1,126451,http://www.missgien.net/misc/films/robin.html,Arts,"[-0.46314648985862733, -0.10842716749757528, 0..."
2,252430,http://www.ppmag.com,Arts,"[-0.0398584817137037, -0.30310366010027273, -0..."
4,125161,http://www.hollywoodjesus.com/powder.htm,Arts,"[0.018674095170960452, -0.03543987311422825, 0..."
6,258127,http://www.kspq.com/,Arts,"[0.009379717521369457, -0.16280843317508698, 0..."


In [79]:
data_valid.to_csv(name + '_emb.csv')