In [None]:
# When running in Google Colab

!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0


In [None]:
# When running in Google Colab:
# mount Google Drive at /content/drive so that files stored under
# /content/drive/MyDrive can be read by the cells below.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
"""
Step 1: Importing all the libraries 

Importing the libraries which are necessary for data processing
as well as for neural network models.
"""

import pandas as pd
import glob
import os
import random
import numpy as np
from tkinter import filedialog as fd

# 'transformers' is the library which provides pre-trained models
# we are using BertTokenizer for tokenizing the sentences
# we are going to use tensorflow version of a BertModel i.e., TFBertModel
from transformers import BertTokenizer, TFBertModel
from sklearn import preprocessing
from keras.utils import to_categorical
import tensorflow as tf

# Seeding Python's, NumPy's and TensorFlow's random number generators (all
# with the value 6) so that repeated runs give the same accuracy value.
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here presumably affects only child processes and not this running
# kernel -- confirm whether it has to be set before the kernel is launched.
os.environ['PYTHONHASHSEED']='0'
random.seed(6)
np.random.seed(6)
tf.random.set_seed(6)

In [None]:
"""
Step 2: Creating train, dev and test datasets 

Loading all the 64 datasets taken from https://github.com/Linguistic-Data-Science-Lab/German_EO_verbs/tree/main/annotations 
and saved in local path. Concatenating it into a single dataset using pandas.
After concatenating, we select 5 specific columns, i.e., Verb, Token_ID, Sentence, non-psych and not_of_interest.
Rows whose 'not_of_interest' value is 'x' or 'X' are removed, and the 'not_of_interest' column is then dropped.
Next, the 'non-psych' column is relabelled ('x' becomes "non-psych", blank becomes "psych") and the dataset is shuffled.
After shuffling, it is split into train, dev and test datasets in the ratio of 40:30:30.
"""

# Loading the 64 datasets from local path
#path = "C:\Velsadhana\Masterscourse\Linguistics Data Science\Sum sem 2022\Research project 1\German_EO_verbs-main\German_EO_verbs-main\datasets"                  
#datasets = glob.glob(os.path.join(path, "*.csv"))     
#all_df = (pd.read_csv(ds,sep=";") for ds in datasets)

# Concatenating it into a single big dataframe
#concat_df = pd.concat(all_df, ignore_index=True)
# Saving the concated df in our local path
#concat_df.to_excel('concat_df.xlsx', index=False)

# First and foremost thing is to save the concatenated dataset in our local path manually !!
# This dataset is available in https://github.com/velsadhana/Research-project1-Identifying-psych-verbs/tree/main/Datasets
# (Please note that I have removed one long sentence from the dataset, as it is a duplicate and has a wrong Token_ID.)

def create_df(path="/content/drive/MyDrive/Research project 1/concat_df.xlsx"):
    """Load the concatenated annotation sheet and split it into train/dev/test.

    Parameters
    ----------
    path : str
        Location of the concatenated Excel file. The default is the Google
        Drive location used when running in Colab. On a local machine pass
        your own path, e.g. ``create_df(fd.askopenfilename())`` to pick the
        file through a dialog box.

    Returns
    -------
    list
        ``[train_df, dev_df, test_df]``: the shuffled rows split 40:30:30,
        each with the columns Verb, Token_ID, Sentence and non-psych.
    """
    concat_df = pd.read_excel(path)

    # selecting only particular columns
    temp_df = concat_df[['Verb', 'Token_ID', 'Sentence', 'non-psych', 'not_of_interest']]

    # deleting rows where 'not_of_interest' column == x & X, then deleting the
    # column itself; .copy() gives an independent frame so the assignments
    # below never write into a view of temp_df (no SettingWithCopyWarning)
    of_interest = ~temp_df['not_of_interest'].isin(['x', 'X'])
    df = temp_df.loc[of_interest].drop(columns='not_of_interest').copy()

    # updating the 'non-psych' column values as psych in place of blank or non-psych in place of 'x'
    df.loc[df["non-psych"] == "x", "non-psych"] = "non-psych"
    # assigning the result back instead of a chained fillna(inplace=True),
    # which is not guaranteed to modify df (it does nothing under Copy-on-Write)
    df["non-psych"] = df["non-psych"].fillna("psych")

    # shuffling the dataframe
    shuffle_df = df.sample(frac=1, random_state=1)

    # splitting the shuffled dataframe into train, dev and test data
    # slicing train, dev and test in the ratio of 40:30:30 respectively
    # (positional slicing instead of np.split, which relies on the deprecated
    # DataFrame.swapaxes)
    n_rows = len(shuffle_df)
    train_end = int(.4 * n_rows)
    dev_end = int(.7 * n_rows)
    train_df = shuffle_df.iloc[:train_end]
    dev_df = shuffle_df.iloc[train_end:dev_end]
    test_df = shuffle_df.iloc[dev_end:]

    # counting the size of each dataset
    print("train_df size", len(train_df))
    print("dev_df size", len(dev_df))
    print("test_df size", len(test_df))

    # counting the no. of verbs in each dataset
    n1 = len(pd.unique(train_df['Verb']))
    n2 = len(pd.unique(dev_df['Verb']))
    n3 = len(pd.unique(test_df['Verb']))
    print("verb count in train, dev & test", n1, n2, n3)
    return [train_df, dev_df, test_df]


In [None]:
"""
Step 3: Tokenizing the sentence using BERT tokenizer

Tokenizing each sentence with the BertTokenizer class and the "bert-base-german-cased" model, and
storing the input_ids and the tokenized words in separate lists.
"""

def tokenize(sentence, max_input_ids=420):
    """Tokenize sentences with the German BERT tokenizer and pad the ids.

    Parameters
    ----------
    sentence : iterable
        The sentences to tokenize; each item is passed to the tokenizer.
    max_input_ids : int
        Fixed length of every row of input ids. Shorter sequences are padded
        with the tokenizer's padding id; longer ones are truncated by the
        tokenizer so that every row has exactly this length.

    Returns
    -------
    list
        ``[inpIds, token_wrds]`` where ``inpIds`` is a numpy array with one
        padded row of input ids per sentence and ``token_wrds`` is a list
        holding, per sentence, the (unpadded) tokens of those ids.
    """
    # using "bert-base-german-cased" model, since the dataset is in German
    tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

    # padding id taken from the tokenizer instead of a hard-coded 0
    pad_id = tokenizer.pad_token_id

    # lists for storing the input_ids & tokenized words
    inp_ids = []
    token_wrds = []

    for text in sentence:
        # truncation keeps over-long sentences at max_input_ids tokens;
        # without it such a row would be longer than the others and the
        # rows could not be stacked into one rectangular array
        tokns = tokenizer(text, truncation=True, max_length=max_input_ids)
        temp_ids = list(tokns["input_ids"])
        inp_ids.append(temp_ids)
        token_wrds.append(tokenizer.convert_ids_to_tokens(temp_ids))

    # padding the input ids
    input_ids_pad = [ids + [pad_id] * (max_input_ids - len(ids)) for ids in inp_ids]

    # converting the input_ids to numpy arrays inorder to pass it into bert layer.
    inpIds = np.array(input_ids_pad)
    return [inpIds, token_wrds]



In [None]:
"""
Step 4: Creating output labels i.e., psych and non-psych which are labelled as 1 and 0 respectively.

The labels are encoded with scikit-learn's LabelEncoder, which maps the string labels
(psych/non-psych) to integers and returns them as a NumPy array.
"""

def get_out_labels(labels):
    """Encode the string labels as integers: non-psych -> 0, psych -> 1.

    Parameters
    ----------
    labels : array-like of str
        The "psych" / "non-psych" label of every sample.

    Returns
    -------
    numpy.ndarray
        One integer label per entry of ``labels``.

    Raises
    ------
    ValueError
        Raised by the encoder if ``labels`` contains a value other than
        "psych" or "non-psych".
    """
    label = preprocessing.LabelEncoder()
    # Fit on the fixed label set rather than on the data itself, so the mapping
    # is the same for every split. Fitting on the data would encode an input
    # that holds only "psych" as 0 instead of 1.
    label.fit(["non-psych", "psych"])
    outLabels = label.transform(labels)
    print("no. of output labels", len(outLabels))
    print("Sample output labels", outLabels[0:3])
    return outLabels

In [None]:
"""
Step 5: Building the model2 

This is a standard approach when using BERT model. Here, Bert will be the intermediate layer between input and output layer.

Instead of passing bert embeddings to the input layer, we have to pass sentence input_ids here. To the output layer, we have 
to pass only the CLS(sentence) embeddings obtained from bert.
"""

# Building the model2 

def build_model2(max_input_ids=420, learning_rate=0.001, pad_token_id=0):
  """Build and compile the BERT-based binary classifier (model2).

  The model takes padded input ids, runs them through the pre-trained
  "bert-base-german-cased" model and feeds the embedding at position 0
  (the [CLS] position) into a single sigmoid unit.

  Parameters
  ----------
  max_input_ids : int
      Length of the padded input-id sequences fed to the model.
  learning_rate : float
      Learning rate of the RMSprop optimizer. The default keeps the
      notebook's original value.
      NOTE(review): fine-tuning BERT is usually done with much smaller
      rates (around 2e-5 to 5e-5) -- consider lowering it.
  pad_token_id : int
      Id that was used to pad the input ids; positions holding this id are
      masked out of BERT's attention.

  Returns
  -------
  tf.keras.Model
      The compiled model.
  """

  # input layer which receives input_ids from tokens as input.
  input_tensor = tf.keras.layers.Input(shape=(max_input_ids,), dtype=tf.int32)

  # attention mask: 1 for real tokens, 0 for padding. Without it BERT treats
  # every position as a real token, so the [CLS] embedding would also attend
  # to all padding positions.
  attention_mask = tf.cast(tf.not_equal(input_tensor, pad_token_id), tf.int32)

  # bert layer, which receives tensors from input layer and generate bert embeddings.
  bertModel = TFBertModel.from_pretrained("bert-base-german-cased")
  bert_output = bertModel(input_tensor, attention_mask=attention_mask)
  # bert_output[0] holds the per-token embeddings; position 0 is [CLS]
  cls = bert_output[0][:, 0, :]

  # classifier layer which receives CLS embeddings from bert layer and generate output labels
  classifier = tf.keras.layers.Dense(1, activation="sigmoid")(cls)

  # defining the model
  model2 = tf.keras.models.Model(input_tensor, classifier)

  # compiling the model
  model2.compile(
          optimizer=tf.keras.optimizers.RMSprop(learning_rate=learning_rate),
          loss="binary_crossentropy",
          metrics=["accuracy"])

  model2.summary()
  return model2
