### Import neccessaire library for extracting and pre-processing the urls

In [1]:
import os
from urllib.parse import urlparse, unquote_plus
import pandas as pd
import numpy as np
import glob
import re
import itertools
import nltk
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt
from util import print_evaluation_scores
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense
from sklearn.model_selection import train_test_split
from collections import Counter
import tensorflow as tf

nltk.download("stopwords")
nltk.download("punkt")

2021-11-24 11:58:24.066411: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-24 11:58:24.066435: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[nltk_data] Downloading package stopwords to /home/triet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/triet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Extract raw url into meaningful words

In [2]:
STOPWORDS = set(stopwords.words('english') + stopwords.words('french'))
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
REPLACE_IP_ADDRESS = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')


def extract_url(url):
    """
    Extract URL into meaningful words
    :param url: The web URL
    :return: The extracted string
    """
    parsed_unquoted = urlparse(unquote_plus(url))
    text = parsed_unquoted.netloc + ' ' + parsed_unquoted.path + ' ' + parsed_unquoted.params + ' ' + parsed_unquoted.query
    text = text.replace('\n', ' ').lower()
    text = REPLACE_IP_ADDRESS.sub(' ', text)
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub(' ', text)
    text = re.sub("www ", "", text)
    text = ' '.join([w for w in text.split() if w not in STOPWORDS])
    return text

### Delete rows which have only one occurence

In [3]:
def filter_data(df:pd.DataFrame):
    df = df[~df["url"].duplicated()]
    df["target"] = df["target"].apply(lambda x : tuple(x) if isinstance(x, (np.ndarray, list)) else x)
    df_filtered = df.groupby("target").filter(lambda x : len(x) > 1)
    print("Filter complete")

    return df_filtered

### Group all the labels that appear less then threshold (=200)

In [4]:
def group_less_occur_label(target, counter, threshold=200):
    original_len = len(target)
    new_target = list(label for label in target if counter[label] > threshold)
    if len(new_target) < original_len:
        new_target.append('0ther')

    return new_target

### Read data from the parquet file and extract url

In [5]:
def preprocess_data(data_dir):
    """
    Preprocess data
    :param data_dir: Path of data's directory
    :return: None
    """
    dfs = []
    data_files = glob.glob(data_dir + "*.snappy.parquet")
    for data_file in data_files:
        df = pd.read_parquet(data_file, columns=["url", "target"])
        dfs.append(df)
    dfs = pd.concat(dfs, ignore_index=True)
    dfs["url"] = dfs["url"].apply(extract_url)
    
    return dfs

### Clean the data

In [6]:
df = filter_data(preprocess_data("data/"))
# Group and replace all labels that occur less than threshold
counter = Counter(list(label for target in df["target"].values for label in target))
df["target"] = df["target"].apply(lambda x : group_less_occur_label(x, counter))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"] = df["target"].apply(lambda x : tuple(x) if isinstance(x, (np.ndarray, list)) else x)


Filter complete


### Data exploration

In [7]:
nb_labels = len(set(label for target in df["target"].values for label in target))
print(f"Data shape: {df.shape}")
print(f"Number of distinct labels: {nb_labels}")

Data shape: (41878, 2)
Number of distinct labels: 238


### Split the data to train, validation and test set. Use stratified splits because of class imbalance

In [8]:
test_split = 0.1

# Initial train and test split.
train_df, test_df = train_test_split(
    df,
    test_size=test_split,
    stratify=df["target"].values,
)

# Splitting the train set further into validation
# and new train sets.
val_df = train_df.sample(frac=0.25)
train_df.drop(val_df.index, inplace=True)

print(f"Number of rows in training set: {len(train_df)}")
print(f"Number of rows in validation set: {len(val_df)}")
print(f"Number of rows in test set: {len(test_df)}")

Number of rows in training set: 28268
Number of rows in validation set: 9422
Number of rows in test set: 4188


### Get the actual targets from the encoded version

In [9]:
def inverse_multi_hot(encoded_targets):
    hot_indices = np.argwhere(encoded_targets==1.0)[..., 0]
    return np.take(vocab, hot_indices)

### Multi-hot representation of labels
Using the StringLookup layer in tensorflow

In [10]:
labels = tf.ragged.constant(df["target"].values)
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
lookup.adapt(labels)
vocab = lookup.get_vocabulary()

print("Labels:")
print(vocab)

2021-11-24 11:58:27.380652: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-11-24 11:58:27.380673: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-11-24 11:58:27.380687: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (triet-XPS): /proc/driver/nvidia/version does not exist
2021-11-24 11:58:27.380896: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Labels:
['[UNK]', '0ther', '692', '1494', '1265', '474', '907', '122', '1254', '63', '108', '1119', '184', '381', '1687', '1526', '1686', '925', '531', '1311', '572', '622', '1343', '1573', '358', '1693', '1513', '1187', '909', '408', '1077', '210', '377', '137', '294', '1107', '41', '507', '1546', '1370', '933', '1367', '1094', '1198', '1599', '61', '1721', '908', '270', '1071', '935', '1171', '1533', '1234', '1277', '1193', '937', '211', '1366', '329', '1690', '906', '1143', '1095', '1372', '1146', '34', '22', '1720', '852', '96', '1096', '401', '333', '910', '1259', '1710', '608', '920', '1192', '930', '253', '540', '378', '1692', '1368', '822', '16', '1369', '953', '1348', '1179', '1515', '1111', '1730', '1534', '1781', '1136', '3', '966', '1097', '5182', '1722', '1021', '529', '647', '1142', '5697', '1867', '1602', '105', '1264', '317', '1163', '1549', '78', '997', '374', '1538', '978', '5693', '1106', '1057', '1694', '1292', '1545', '5692', '115', '384', '1049', '1542', '1544', '

An example of transform a set of labels into its multi-hot version and vice-versa

In [11]:
example_target = train_df["target"][0]
print(f"Original target: {example_target}")

multi_hot_target = lookup([example_target])
print(f"Multi hot representation: {multi_hot_target}")

Original target: ['378', '1710', '5529', '96', '0ther']
Multi hot representation: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


### Padding the url to have all the same length

In [12]:
train_df["url"].apply(lambda x : len(x.split(" "))).describe()

count    28268.000000
mean        10.631739
std          5.536528
min          1.000000
25%          7.000000
50%          9.000000
75%         13.000000
max         71.000000
Name: url, dtype: float64

In [13]:
max_len = 30
padding_token = "<pad>"
batch_size = 1028
auto = tf.data.AUTOTUNE

def padding(url, target):
    # Split the given url and calculate its length
    word_splits = tf.strings.split(url, sep=" ")
    url_length = tf.shape(word_splits)[0]

    # Calculate the padding amount
    padding_length = max_len - url_length

    # Check if need to pad or truncate the url
    if padding_length > 0:
        padded_url = tf.pad([url], [[0, padding_length]], constant_values=padding_token)
        padded_url = tf.strings.reduce_join(padded_url, separator="")

    else:
        padded_url = tf.strings.reduce_join(word_splits[:max_len], separator=" ")

    # An extra dimension is needed for vectorization
    return tf.expand_dims(padded_url, -1), target

### Create dataset

In [14]:
def make_dataset(df, is_train=True):
    labels = tf.ragged.constant(df["target"].values)
    multi_hot_labels = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices(
        (df["url"].values, multi_hot_labels)
    )
    # If it is the train dataset, then shuffle it
    dataset = dataset.shuffle(df.shape[0]) if is_train else dataset
    # Pad the url in the dataset
    dataset = dataset.map(padding, num_parallel_calls=auto).cache()

    return dataset.batch(batch_size)

In [15]:
train_dataset = make_dataset(train_df, is_train=True)
test_dataset = make_dataset(test_df, is_train=False)
val_dataset = make_dataset(val_df, is_train=False)

### Dataset preview

In [16]:
text_batch, labels_batch = next(iter(train_dataset))

print(text_batch.shape)

for i, url in enumerate(text_batch[:2]):
    target = labels_batch[i].numpy()[None, ...]
    print(f"Url: {url[0]}")
    print(f"Target: {inverse_multi_hot(target[0])}")

(1028, 1)
Url: b'madame lefigaro fr celebrites nest fille jai elevee thomas markle deplore retrait meghan harry 190120 179219<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
Target: ['0ther' '184']
Url: b'cdiscount com sport skate shop beeper vehicule electrique drift trike rdt100 12 f 1213002 ixi3661546400876 html<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
Target: ['0ther']


2021-11-24 11:58:28.723588: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


### Use Tensorflow TextVectorization to calculate the tf-idf representation of url

In [17]:
# Calculate the number of unique words present in our url
vocabulary_size = train_df["url"].str.split().str.len().max()
print(f"Vocabulary size: {vocabulary_size}")

Vocabulary size: 71


In [18]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=vocabulary_size,
    ngrams=(1, 2),
    output_mode="int",
    output_sequence_length=max_len
)

# Adapt this layer to the train dataset
with tf.device("/CPU:0"):
    text_vectorizer.adapt(train_dataset.map(lambda url, target : url))

# Vectorize all the url in the train, validation and test set
train_dataset = train_dataset.map(
    lambda url, target : (text_vectorizer(url), target),
    num_parallel_calls=auto
).prefetch(auto)

val_dataset = val_dataset.map(
    lambda url, target : (text_vectorizer(url), target),
    num_parallel_calls=auto
).prefetch(auto)

test_dataset = test_dataset.map(
    lambda url, target : (text_vectorizer(url), target),
    num_parallel_calls=auto
).prefetch(auto)

### Create the dense model

In [19]:
def make_dense_model(output_shape):
    simple_model = tf.keras.Sequential([
        Dense(512, activation="relu"),
        Dense(256, activation="relu"),
        Dense(output_shape, activation="sigmoid")
    ])

    return simple_model

### Train dense model

In [20]:
epochs = 20
dense_model = make_dense_model(lookup.vocabulary_size())
dense_model.compile(
    loss="binary_crossentropy",
    optimizer="adam")

history = dense_model.fit(
    train_dataset, epochs=epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Evaluate dense model on val and test sets

In [21]:
val_pred = dense_model.predict(val_dataset)
val_pred = np.array(val_pred) >= 0.5

val_unpack = list(val_dataset)
y_val = tf.concat([val_unpack[i][1] for i in range(len(val_unpack))], axis=0)

print_evaluation_scores(val_pred, y_val)

Accuracy: 0.09435364041604755
Hamming loss: 0.014144764012650886
F1 score macro: 0.10242933402042173
F1 score micro: 0.3926473953169095
F1 score weighted: 0.6310514579432869


In [22]:
test_pred = dense_model.predict(test_dataset)
test_pred = np.array(test_pred) >= 0.5

test_unpack = list(test_dataset)
y_test = tf.concat([test_unpack[i][1] for i in range(len(test_unpack))], axis=0)

print_evaluation_scores(test_pred, y_test)

Accuracy: 0.09789875835721108
Hamming loss: 0.014011940871108127
F1 score macro: 0.10223665986990281
F1 score micro: 0.39544807965860596
F1 score weighted: 0.6416394237688312


### Create the dense model

In [23]:
def make_LSTM_model(input_shape, output_shape):
    # Create an input layer 
    input_url = Input((input_shape, 1), dtype='float32')
    # Propagate the input through a LSTM layer with 128-dimensional hidden state
    X = LSTM(128, return_sequences=False)(input_url)
    # Add drop out with probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X through a Dense layer with number of units equal number of distinct labels(output_shape)
    X = Dense(output_shape, activation='sigmoid')(X)

    # Create the model instance convert input_url to X
    model = Model(input_url, X)

    return model

Now we can have our model

In [24]:
epochs = 20

lstm_model = make_LSTM_model(max_len, lookup.vocabulary_size())
lstm_model.summary()

lstm_model.compile(
    loss="binary_crossentropy",
    optimizer="adam")

history = lstm_model.fit(train_dataset, epochs=epochs)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 30, 1)]           0         
                                                                 
 lstm (LSTM)                 (None, 128)               66560     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 239)               30831     
                                                                 
Total params: 97,391
Trainable params: 97,391
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 

### Evaluate dense model on val and test sets

In [25]:
val_pred = lstm_model.predict(val_dataset)
val_pred = np.array(val_pred) >= 0.5

val_unpack = list(val_dataset)
y_val = tf.concat([val_unpack[i][1] for i in range(len(val_unpack))], axis=0)

print_evaluation_scores(val_pred, y_val)

Accuracy: 0.06421142008066227
Hamming loss: 0.01625635364219236
F1 score macro: 0.0029433220502149628
F1 score micro: 0.2183169268219769
F1 score weighted: 0.7034539700013761


In [26]:
test_pred = lstm_model.predict(test_dataset)
test_pred = np.array(test_pred) >= 0.5

test_unpack = list(test_dataset)
y_test = tf.concat([test_unpack[i][1] for i in range(len(test_unpack))], axis=0)

print_evaluation_scores(test_pred, y_test)

Accuracy: 0.0673352435530086
Hamming loss: 0.016174924969928027
F1 score macro: 0.002961532354160023
F1 score micro: 0.22081047261526615
F1 score weighted: 0.7078062326442456
