# 1) Initialize the project setup

## 1.1) Install the python packages

In [1]:
!pip install html5lib
!pip install bs4
!pip install nltk 
!pip install openpyxl
!pip install pickle-mixin 
!pip install argparse 
!pip install numpy 
!pip install pandas 
!pip install scikit-learn 
!pip install xlrd
!pip install pathlib 
!pip install w3lib



## 1.2) Download the necessary tokenizers

In [2]:
# import the packages
import nltk
# download the tokenizers
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/tobias/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/tobias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 2) Run the experiment source code

## 2.1) Imports, variables

In [3]:
# imports
import os
import re
import csv
import sys
import time
import pickle
import argparse
import numpy as np
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from collections import Counter
from sklearn import preprocessing
from nltk.corpus import stopwords
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.datasets import dump_svmlight_file
from sklearn.svm import SVC

In [45]:
# beautiful soup will show some errors for web pages with parsing errors
# we ignore them for less noise in the code outputs
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="bs4")

In [4]:
# Constants
z1 = 200  # empirical observed standardisation value

## 2.2) Lexica

In [5]:
# Files
lexica_files = {
    "privacy": "lexicon/privacy.txt",
    "contact": "lexicon/contact.txt",
    "stopwords": "lexicon/stopwords.txt",
    "commercial": "lexicon/comm_list.txt"
}

In [6]:
"""
This function loads a lexicon from a file
"""
def load_lexicon_from_file(file_path):
    with open(file_path) as file:
        lexicon = [word.rstrip() for word in file.readlines()]
    return lexicon


class Lexicon:
    def __init__(self, file_paths):
        self.stopwords = set(stopwords.words("english"))
        new_words = load_lexicon_from_file(file_paths["stopwords"])
        self.stopwords = self.stopwords.union(new_words)
        self.commercial = load_lexicon_from_file(file_paths["commercial"])
        self.contact = load_lexicon_from_file(file_paths["contact"])
        self.privacy = load_lexicon_from_file(file_paths["privacy"])


# Load all lexica
lexicon = Lexicon(lexica_files)

## 2.3) Reporting functions for the scores

In [7]:
"""
This function implements the weighted accuracy metric described in Sondhi's study
"""
def weighted_accuracy(bias, tn, tp, fn, fp):
    return (bias * tp + tn) / (bias * (tp + fn) + tn + fp)

In [26]:
"""
This function saves in a file the obtained performance for a concrete cost-factor and feature set
"""
def save_results(dataset, features, results, ts):
    if not os.path.exists("./results"):
        os.makedirs("./results")

    results.to_csv(f"./results/{dataset}_{features}_{ts}.csv", index=False)

## 2.4) Various feature extraction functions
The following functions count occurrences of different types of words

In [9]:
"""
This function calculates the word-based features as their normalized frequency value
"""
def word_features(doc, vectorizer):
    vector = vectorizer.transform([doc])
    doc_to_list = list(vector.toarray()[0])
    maximum = max(doc_to_list)

    if maximum:
        for val in doc_to_list:
            index = doc_to_list.index(val)
            doc_to_list[index] = val / maximum

    return doc_to_list

In [33]:
"""
This function counts the total commercial interest words appearances and returns the normalized frequency total value
"""
def count_commercial_keywords(filename, doc, parser="html5lib"):
    commercial_words = 0

    with open(filename, encoding="utf-8", errors="ignore") as reader:
        soup = BeautifulSoup(reader.read(), parser)
        text = soup.get_text()
        output = text.split(" ")

        for line in output:
            for term in lexicon.commercial:
                if term in line:
                    commercial_words += 1

        doc = doc.split(" ")

    return commercial_words / len(doc)

In [34]:
"""
This function counts the number of commercial links present in a webpage
"""
def count_commercial_links(filename, parser="html5lib"):
    with open(filename, encoding="utf-8", errors="ignore") as reader:
        soup = BeautifulSoup(reader.read(), parser)
        links = Counter([x.get("href") for x in soup.findAll("a")])
        links = links.most_common()
        commercial = 0

        for item in links:
            if item[0]:
                if any(ext in item[0] for ext in lexicon.commercial):
                    commercial += item[1]

    return commercial / z1

In [35]:
"""
This function calculates the link-based features
"""
def count_links(filename, parser="html5lib"):
    with open(filename, encoding="utf-8", errors="ignore") as reader:
        soup = BeautifulSoup(reader.read(), parser)
        links = Counter([x.get("href") for x in soup.findAll("a")])
        links = links.most_common()
        total = 0
        external = 0
        contact = 0
        privacy = 0

        for item in links:
            total += item[1]
            if item[0]:
                if item[0].startswith(("http", "ftp", "www")):
                    external += item[1]
                if any(ext in item[0] for ext in lexicon.contact):
                    contact = 1
                if any(ext in item[0] for ext in lexicon.privacy):
                    privacy = 1

        internal = total - external

    return total / z1, external / z1, internal / z1, contact, privacy  # presence of contact and privacy links are boolean features

In [13]:
"""
This function implements the whole casuistic of feature combinations
"""
def features_calc(docs, corpus, vectorizer, features):
    for filename, doc in zip(docs, corpus):
        doc_features = []

        if features == "link" or features == "comm" or features == "allRem" or features == "allKeep":
            links_counts = count_links(filename)
            doc_features.extend(links_counts)

        if features == "comm" or features == "allRem" or features == "allKeep":
            commercial_links = count_commercial_links(filename)
            commercial_words = count_commercial_keywords(filename, doc)
            doc_features.extend([commercial_links, commercial_words])

        if features == "wordsRem" or features == "wordsKeep" or features == "allRem" or features == "allKeep":
            words = word_features(doc, vectorizer)
            doc_features.extend(words)

        yield doc_features

In [14]:
"""
This function generates the vocabulary for a given corpus
"""
def generate_vocabulary(corpus, min_df):
    vectorizer = CountVectorizer(min_df=min_df)
    vectorizer.fit(corpus)
    return vectorizer

In [15]:
"""
This function normalizes a text to be used as a ML algorithm input
"""
def __normalize_text(line, features):
    line = re.sub('[^a-zA-Z]', ' ', line)  # remove punctuations
    line = line.lower()  # convert to lowercase
    line = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", line)  # remove tags
    line = re.sub("(\\d|\\W)+", " ", line)  # remove special char and digits 
    line = line.split()  # convert string to list

    if features != "wordsKeep" and features != "allKeep":
        line = [word for word in line if not word in lexicon.stopwords]  # remove stopwords

    line = " ".join(line)
    return line

In [43]:
"""
This function extracts clean text from a given HTML file
"""
def preprocess_text(filename, features, parser="html5lib"):
    with open(filename, encoding="utf-8", errors="ignore") as reader:
        soup = BeautifulSoup(reader.read(), parser)
        text = soup.get_text()
        output = text.split("\n")
        lines = []

        for line in output:
            line = __normalize_text(line, features)
            lines.append(line)

        doc = " ".join(lines)
        return doc

In [17]:
"""
This function generates an entire clean corpus from HTML files
"""
def generate_corpus(docs, features):
    corpus = []

    for doc in docs:
        doc = preprocess_text(doc, features)
        corpus.append(doc)

    return corpus

## 2.5) Data loading functions for different data sets

In [18]:
"""
This function loads the CLEF dataset
"""
def data_clef():
    if not os.path.exists("./datasets/CLEF/clef2018collection"):
        print("To perform these experiments you first need to download clef2018collection")
        sys.exit(1)

    X = []
    Y = []

    with open("./datasets/CLEF/CLEF2018_qtrust_20180914.txt", newline="") as assessments:
        reader = csv.reader(assessments, delimiter=" ")
        for row in reader:
            web = row[2]
            rating = int(row[3])

            if rating == 0 or rating == 1 or rating == 2 or rating == 3:  # relabelling process 
                for filename in Path("./datasets/CLEF/clef2018collection").rglob(
                        web):  # this function finds recursively a file in an entire path
                    X.append(filename)
                    break
                Y.append(1)

            elif rating == 7 or rating == 8 or rating == 9 or rating == 10:  # relabelling process 
                for filename in Path("./datasets/CLEF/clef2018collection").rglob(web):
                    X.append(filename)
                    break
                Y.append(-1)

    return np.array(X), np.array(Y)

In [19]:
"""
This function loads the Schwarz dataset
"""
def data_schwarz():
    df = pd.read_excel("./datasets/Schwarz/web_credibility_relabeled.xlsx")
    ratings = df["Likert Rating"]
    urls = df["URL"]
    root = os.getcwd()
    path = "./datasets/Schwarz/CachedPages"
    os.chdir(path)
    cached_pages_dir = os.getcwd()
    X = []
    Y = []

    for url, rating in zip(urls, ratings):
        try:
            url = url.replace("http://", "")
            url = url.split("/")
            if url[-1]:  # this case deals with urls like 'www.adamofficial.com/us/home'
                url = "/".join(url[:-1])
                os.chdir(url)
                f = [f for f in os.listdir() if re.match(url[-1] + '*', f) and os.path.isfile(f)]
            else:
                url = "/".join(url)
                os.chdir(url)
                f = [f for f in os.listdir() if re.match('index*', f) and os.path.isfile(f)]

            X.append(os.path.join(os.getcwd(), f[0]))
            Y.append(rating)
            os.chdir(cached_pages_dir)
        except:
            pass

    os.chdir(root)
    return np.array(X), np.array(Y)

In [20]:
"""
This function loads the Sondhi dataset
"""
def data_sondhi():
    path1 = "./datasets/Sondhi/reliable"
    root = os.getcwd()
    os.chdir(path1)
    arr1 = os.listdir(".")
    path2 = "../unreliable"
    os.chdir(path2)
    arr2 = os.listdir('.')
    X = []
    Y = []

    for rel, unrel in zip(arr1, arr2):
        os.chdir("../reliable")
        X.append("./datasets/Sondhi/reliable/" + rel)
        Y.append(-1)
        os.chdir("../unreliable")
        X.append("./datasets/Sondhi/unreliable/" + unrel)
        Y.append(1)

    os.chdir(root)
    return np.array(X), np.array(Y)

## 2.6) Model training function

In [30]:
def train(dataset="Sondhi", features="link", dump=True, standard=True, cost_factors=[1, 2, 3], seed=1, show_iterations=False):
    if dataset == "Sondhi":
        X, Y = data_sondhi()
        n = 5
        min_df = 1

    elif dataset == "Schwarz":
        X, Y = data_schwarz()
        n = 2
        min_df = 0.5

    elif dataset == "CLEF":
        X, Y = data_clef()
        n = 5
        min_df = 0.4

    else:
        print("Unknown dataset")
        return

    np.random.seed(seed)  # reproducibility seed
    skf = StratifiedKFold(n_splits=n)  # stratified k-fold: preserves the percentage of samples for each class
    ts = str(time.time())
    print(f"Training on {dataset} with standard={standard}, features={features}")
    print("EXPERIMENT ID: ", ts)  # we use the timestamp as experiment id

    # store the performance results in a dataframe
    results = pd.DataFrame()
    """
    For each cost-factor, we perform a n-fold cross validation for the feature set previously selected
    """
    for cost_factor in cost_factors:

        accuracies, f1_micro, f1_rel, f1_unrel = [], [], [], []
        it = 1

        for train_index, test_index in skf.split(X, Y):

            data_train = X[train_index]
            corpus_train = generate_corpus(data_train, features)
            vectorizer = generate_vocabulary(corpus_train, min_df)  # for each fold we reset vocabulary associated to training set

            if dump:
                if not os.path.exists('./models'):
                    os.makedirs('./models')

                pickle.dump(vectorizer, open(
                    f"models/vocabulary_{dataset}_{features}_it{it}_cost_fact{cost_factor}_{ts}.pkl", "wb"))

            data_train = features_calc(data_train, corpus_train, vectorizer, features)
            target_train = Y[train_index]

            if standard:
                list_data_train = list(data_train)
                scaler_x = preprocessing.StandardScaler().fit(list_data_train)

                if dump:
                    pickle.dump(scaler_x, open(
                        f"models/scaler_{dataset}_{features}_it{it}_cost_fact{cost_factor}_{ts}.pkl", "wb"))

                data_train = scaler_x.transform(list_data_train)

            elif not standard:
                data_train = np.array(list(data_train))
                nsamples, nx = data_train.shape
                data_train = data_train.reshape((nsamples, nx))

            if not os.path.exists('./train_data'):
                os.makedirs('./train_data')

            dump_svmlight_file(data_train, target_train, f"train_data/train_{ts}.txt")

            data_test = X[test_index]
            corpus_test = generate_corpus(data_test, features)
            data_test = features_calc(data_test, corpus_test, vectorizer, features)
            target_test = Y[test_index]

            if standard:
                data_test = scaler_x.transform(list(data_test))

            elif not standard:
                data_test = np.array(list(data_test))
                nsamples, nx = data_test.shape
                data_test = data_test.reshape((nsamples, nx))

            dump_svmlight_file(data_test, target_test, f"train_data/test_{ts}.txt")

            if show_iterations:
                print("Training it=", it, "cost-factor=", cost_factor)
                
            clf = SVC(gamma="auto", class_weight={-1:1, 1:cost_factor})
            clf.fit(data_train, target_train)

            if dump:
                filename = f"models/model_{dataset}_{features}_it{it}_cost_fact{cost_factor}_{ts}.dat"
                pickle.dump(clf, open(filename, "wb"))

            if show_iterations:
                print("Predicting it=", it, "cost-factor=", cost_factor)
                
            predictions = clf.predict(data_test)

            tn, fp, fn, tp = confusion_matrix(target_test, predictions).ravel()

            accuracies.append(weighted_accuracy(cost_factor, tn, tp, fn, fp) * 100)
            f1_micro.append(f1_score(target_test, predictions, average="micro"))  # micro: calculates metrics totally by counting the total true positives, false negatives and false positives
            cl = f1_score(target_test, predictions, average=None)  # none: returns scores for each class
            f1_rel.append(cl[0])
            f1_unrel.append(cl[1])
            it += 1

        # persist the results
        entry = pd.DataFrame([{
            "dataset": dataset, 
            "features": features, 
            "standardization": standard,
            "cost factor": cost_factor, 
            "weighted accuracy": np.mean(accuracies), 
            "f1-score": np.mean(f1_micro), 
            "credible f1-score": np.mean(f1_rel), 
            "non-credible f1-score": np.mean(f1_unrel)
        }])
        
        results = pd.concat([results, entry], axis=0)
            
        print(f"Training finished with cost_factor={cost_factor}")
        print("The weighted accuracy is", np.mean(accuracies))
        print("The f1-score is", np.mean(f1_micro))
        print("The credible f1-score is", np.mean(f1_rel))
        print("The non-credible f1-score is", np.mean(f1_unrel))
        
    save_results(dataset, features, results, ts)

# 3) Run the experiment

## Experiment parameters
There exist three options for the dataset:
 - Sondhi
 - Schwarz
 - CLEF
 
Different sets of features can be used:
 - link
 - comm 
 - wordsRem
 - wordsKeep
 - allRem
 - allKeep
 
The training can be done with and without saving the models in the ./models folder:
 - True (with saving)
 - False (without saving)
 
Standardization can be performed
 - True (with standardization)
 - False (without standardization)

In [22]:
seed = 1
features_options = ["link", "comm", "wordsRem", "wordsKeep", "allRem", "allKeep"]
dump = False

## 3.1) Reproduce the Sondhi dataset experiments

In [23]:
dataset = "Sondhi"

### 3.1.1) Reproduce Sondhi results with standardization

In [24]:
standard = True
for features in features_options:
    train(dataset, features, dump, standard, seed=seed)

Training on Sondhi with standard=True, features=link
EXPERIMENT ID:  1673797809.9101937
Training finished with cost_factor=1
The weighted accuracy is 73.61111111111111
The f1-score is 0.736111111111111
The credible f1-score is 0.690005120327701
The non-credible f1-score is 0.7701595904847937
Training finished with cost_factor=2
The weighted accuracy is 79.81481481481481
The f1-score is 0.7333333333333333
The credible f1-score is 0.6684118655068441
The non-credible f1-score is 0.7765810528937455
Training finished with cost_factor=3
The weighted accuracy is 85.13888888888889
The f1-score is 0.7416666666666667
The credible f1-score is 0.6682052398400034
The non-credible f1-score is 0.7882645154898563
Training on Sondhi with standard=True, features=comm
EXPERIMENT ID:  1673798052.6726968
Training finished with cost_factor=1
The weighted accuracy is 76.94444444444444
The f1-score is 0.7694444444444445
The credible f1-score is 0.7575114622963615
The non-credible f1-score is 0.780044808992177

### 3.1.2) Reproduce Sondhi results without standardization

In [None]:
standard = False
for features in features_options:
    train(dataset, features, dump, standard, seed=seed)

## 3.2) Reproduce the Schwarz dataset experiments

In [31]:
dataset = "Schwarz"

### 3.2.1) Reproduce Schwarz results with standardization

In [46]:
standard = True
for features in features_options:
    train(dataset, features, dump, standard, seed=seed)

Training on Schwarz with standard=True, features=link
EXPERIMENT ID:  1673803119.711432
Training finished with cost_factor=1
The weighted accuracy is 93.75
The f1-score is 0.9375
The credible f1-score is 0.9676989676989677
The non-credible f1-score is 0.0
Training finished with cost_factor=2
The weighted accuracy is 88.26135105204872
The f1-score is 0.9375
The credible f1-score is 0.9676989676989677
The non-credible f1-score is 0.0
Training finished with cost_factor=3
The weighted accuracy is 83.399209486166
The f1-score is 0.9375
The credible f1-score is 0.9676989676989677
The non-credible f1-score is 0.0
Training on Schwarz with standard=True, features=comm
EXPERIMENT ID:  1673803212.5468662
Training finished with cost_factor=1
The weighted accuracy is 93.75
The f1-score is 0.9375
The credible f1-score is 0.9676989676989677
The non-credible f1-score is 0.0
Training finished with cost_factor=2
The weighted accuracy is 88.26135105204872
The f1-score is 0.9375
The credible f1-score is 0

### 3.2.2) Reproduce Schwarz results without standardization

In [None]:
standard = False
for features in features_options:
    train(dataset, features, dump, standard, seed=seed)

# 4) Run an experiment with a different seed

In [47]:
seed = 15012023
train("Schwarz", "link", False, True, seed=seed)

Training on Schwarz with standard=True, features=link
EXPERIMENT ID:  1673803872.2660673
Training finished with cost_factor=1
The weighted accuracy is 93.75
The f1-score is 0.9375
The credible f1-score is 0.9676989676989677
The non-credible f1-score is 0.0
Training finished with cost_factor=2
The weighted accuracy is 88.26135105204872
The f1-score is 0.9375
The credible f1-score is 0.9676989676989677
The non-credible f1-score is 0.0
Training finished with cost_factor=3
The weighted accuracy is 83.399209486166
The f1-score is 0.9375
The credible f1-score is 0.9676989676989677
The non-credible f1-score is 0.0
