Page Title

In [1]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [3]:
import tensorflow as tf
import pandas as pd

In [4]:
URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz",
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')

In [5]:
### remove unlabeled reviews ###
# The shutil module offers a number of high-level
# operations on files and collections of files.
import os
import shutil
# Create main directory path ("/aclImdb")
main_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
# Create sub directory path ("/aclImdb/train")
train_dir = os.path.join(main_dir, 'train')
# Remove unsup folder since this is a supervised learning task
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)
# View the final train folder
print(os.listdir(train_dir))

['labeledBow.feat', 'neg', 'pos', 'unsupBow.feat', 'urls_neg.txt', 'urls_pos.txt', 'urls_unsup.txt']


In [6]:
# We create a training dataset and a validation
# dataset from our "aclImdb/train" directory with a 80/20 split.
train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2,
    subset='training', seed=123)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2,
    subset='validation', seed=123)

for i in train.take(1):
  train_feat = i[0].numpy()
  train_lab = i[1].numpy()

train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
train['DATA_COLUMN'] = train['DATA_COLUMN'].str.decode("utf-8")
train.head()

for j in test.take(1):
  test_feat = j[0].numpy()
  test_lab = j[1].numpy()

test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
test['DATA_COLUMN'] = test['DATA_COLUMN'].str.decode("utf-8")
test.head()


Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,I can't believe that so much talent can be was...,0
1,This movie blows - let's get that straight rig...,0
2,"The saddest thing about this ""tribute"" is that...",0
3,I'm only rating this film as a 3 out of pity b...,0
4,Something surprised me about this movie - it w...,1


In [7]:
InputExample(guid=None,
             text_a="Hello, world",
             text_b=None,
             label=1)

InputExample(guid=None, text_a='Hello, world', text_b=None, label=1)

In [8]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
  train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)

  validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)
  
  return train_InputExamples, validation_InputExamples

  train_InputExamples, validation_InputExamples = convert_data_to_examples(train, 
                                                                           test, 
                                                                           'DATA_COLUMN', 
                                                                           'LABEL_COLUMN')
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            #pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            padding='max_length', # because above is deprecated
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [9]:
train_InputExamples, validation_InputExamples = convert_data_to_examples(
    train, test, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(
    list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(
    list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

In [10]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(
                  from_logits=True),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2


In [1]:
model.save('MLModel/bertiment', save_format="h5")

NameError: name 'model' is not defined

In [8]:
import os

In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")
driver = webdriver.Chrome(options=options, executable_path=r'C:/Users/thilo/OneDrive/Dokumente/UniversitätLeipzig/DataScience3/InformationRetrieval/irlab/data/chromedriver')

  driver = webdriver.Chrome(options=options, executable_path=r'C:/Users/thilo/OneDrive/Dokumente/UniversitätLeipzig/DataScience3/InformationRetrieval/irlab/data/chromedriver')


In [11]:
result_ids_con = ['If66787cad4a7f247', 'I5267c4a222aad283','I5ad3b457d1b923b1', 'I2db1aa14b7807aed', 'I2b91de90a27d0506', 'I2090f7e152b056ed', 'I1106058d9d4025c9', 'I046ddd15dc82845a', 'I8ccb8618e57ceaec', 'Icc511540b95aee35']
result_ids_pro = ['I680736496d979c9e', 'I74c480d452e7787c', 'I904a6b8f9f436788', 'I0bba050b1349e96e', 'I6982d2a3cfce7e41', 'I963f4855e811efcd', 'I1dfcdb482b33207a', 'I5a9ecd0b71dd3960', 'I26e599e89ba6bf98', 'I2d9b1996f5c349f7']

In [9]:
# make dictionary with all topics
import xml.etree.ElementTree as ET
tree = ET.parse('topics.xml')
topics = tree.findall('topic')

topicsDic = {}

for topic in topics:
    title = topic.find('title').text
    number = topic.find('number').text
    topicsDic[number] = title

In [None]:
data_list = []

data_dir = path_to_images + '/images/'
image_files = [f for f in os.listdir(data_dir)]
for f in image_files:
    subdir = data_dir + f
    subdir_files = [f for f in os.listdir(subdir)]
    #print("\n", f)

    for f2 in subdir_files:
        print("   ", subdir_files.index(f2), " von ", len(subdir_files))
        subsubdir = subdir + "/" + f2 + "/pages/"
        subsubdir_files = [f for f in os.listdir(subsubdir)]
        #print("\n", f2)

        for f3 in subsubdir_files:
            with open(subsubdir + f3 + "/page-url.txt") as f:
                this_link = f.read()
                driver.get(this_link)
                try:
                    elem = driver.find_element(By.XPATH, "/html/body")
                except:
                    elem = driver.find_element(By.XPATH, "/html")
                
            inner = elem.text
            #print(inner)
            try:
                with open(subsubdir + "/" + f3 + '/snapshot/HTMLbody.txt', 'w') as s:
                    s.write(inner)
            except:
                print("Error saving")
                #data_list.append(f2 + '\t' + inner)

    #         # save doc to txt-file
    #         filename = os.path.join(subsubsubdir + "text_pp.txt"  )
    #         print(filename)
    #         f = open(filename, "w")
    #         f.write(doc)
    #         f.close()   
            if(subdir_files.index(f2) == 3):
                break
    break
#print(data_list)