In [None]:
# Installing the transformers library (needed when running in Google Colab)

!pip install transformers


In [None]:
# When running in Google Colab: mount Google Drive under /content/drive so
# that files stored there can be accessed through ordinary file paths.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
"""
Step 1: Importing all the libraries 

Importing the libraries which are necessary for data processing
as well as for neural network models.
"""
# pandas, glob, os & numpy for pre-processing the dataset
import pandas as pd
import glob
import os
import numpy as np

# for generating random numbers
import random

# needed only when running in local machine 
from tkinter import filedialog as fd

# 'transformers' is the library which provides pre-trained models
# we are using BertTokenizer for tokenizing the sentences
# we are going to use tensorflow version of a BertModel i.e., TFBertModel
from transformers import BertTokenizer, TFBertModel

# for creating output labels
from sklearn import preprocessing

# for creating keras model 
import tensorflow as tf

# for displaying model accuracy in graph
import matplotlib.pyplot as plt

# for saving and loading model and Bert embeddings
import pickle

# Seeding the random number generators (Python's `random`, NumPy and
# TensorFlow) with fixed values so that repeated runs give the same accuracy.
# NOTE(review): PYTHONHASHSEED is presumably only honoured when it is set
# before the Python interpreter starts, so assigning it here likely has no
# effect on the already-running kernel -- confirm.
os.environ['PYTHONHASHSEED']='0'
random.seed(6)
np.random.seed(6)
tf.random.set_seed(6)

In [None]:

# Step 2: Creating train, dev and test datasets 

# Loading the 64 datasets taken from https://github.com/Linguistic-Data-Science-Lab/German_EO_verbs/tree/main/annotations and save in the local path
# Load the saved datasets from local path.
#path = "C:\Velsadhana\Masterscourse\Linguistics Data Science\Sum sem 2022\Research project 1\German_EO_verbs-main\German_EO_verbs-main\datasets"                  
#datasets = glob.glob(os.path.join(path, "*.csv"))     
#all_df = (pd.read_csv(ds,sep=";") for ds in datasets)

# Concatenating it into a single big dataframe
#concat_df = pd.concat(all_df, ignore_index=True)
# Saving the concated df in our local path
#concat_df.to_excel('concat_df.xlsx', index=False)

def create_df(input_path="/content/drive/MyDrive/Research project 1/concat_df.xlsx",
              output_dir="/content/drive/MyDrive/Research project 1"):
    """
    Build the train, dev and test datasets from the concatenated annotation file.

    The concat_df file has to be saved manually first. It is available in
    https://github.com/velsadhana/Research-project1-Identifying-psych-verbs/tree/main/Datasets
    (Please note that one long sentence was removed from the dataset, as it is a
    duplicate and has a wrong Token_ID.)

    The function reads the concat_df file and keeps 5 columns, i.e. Verb, Token_ID,
    Sentence, non-psych and not_of_interest. Rows whose 'not_of_interest' value is
    'x' or 'X' are removed and the column is dropped. The 'non-psych' column is then
    rewritten to hold the label text ('non-psych' where it was 'x', 'psych' where it
    was blank). The rows are shuffled with a fixed random_state and split into
    train, dev and test in the ratio 40:30:30. Every intermediate frame is also
    written to an Excel file inside `output_dir`.

    Input:
        input_path: path of the concatenated Excel file (defaults to the Colab
                    Google Drive location used by this notebook).
        output_dir: directory that receives df.xlsx, shuffle_df.xlsx, train_df.xlsx,
                    dev_df.xlsx and test_df.xlsx (defaults to the same Drive folder).
    Output: returns a list containing 3 dataframes, i.e. train, dev and test
    """

    # for running in colab the defaults can be used as they are; on a local
    # machine pass the location of the concatenated dataset as `input_path`
    concat_df = pd.read_excel(input_path)

    # selecting only particular columns
    temp_df = concat_df[['Verb','Token_ID', 'Sentence', 'non-psych','not_of_interest']]

    # deleting rows where 'not_of_interest' column == x & X, then deleting the
    # 'not_of_interest' column itself. The explicit .copy() makes `df` an
    # independent frame, so the assignments below neither trigger
    # SettingWithCopyWarning nor silently operate on a temporary object.
    keep_rows = ~temp_df['not_of_interest'].isin(['x', 'X'])
    df = temp_df.loc[keep_rows].drop(columns='not_of_interest').copy()

    # updating the 'non-psych' column values: 'non-psych' in place of 'x' and
    # 'psych' in place of blank. fillna is assigned back instead of being
    # called in place on a column selection, which does not update the frame
    # when pandas Copy-on-Write is active.
    df.loc[df["non-psych"] == "x", "non-psych"] = "non-psych"
    df["non-psych"] = df["non-psych"].fillna("psych")

    # saving it as excel
    df.to_excel(os.path.join(output_dir, 'df.xlsx'), index=False)

    # shuffling the dataframe
    shuffle_df = df.sample(frac = 1,random_state=1)
    # saving the shuffled data frame into excel file
    shuffle_df.to_excel(os.path.join(output_dir, 'shuffle_df.xlsx'), index=False)

    # splitting the shuffled dataframe into train, dev and test data in the
    # ratio of 40:30:30 respectively. Positional slicing is used because
    # np.split on a DataFrame is deprecated.
    n_rows = len(shuffle_df)
    train_end = int(.4*n_rows)
    dev_end = int(.7*n_rows)
    train_df = shuffle_df.iloc[:train_end]
    dev_df = shuffle_df.iloc[train_end:dev_end]
    test_df = shuffle_df.iloc[dev_end:]

    # saving into excel files
    train_df.to_excel(os.path.join(output_dir, 'train_df.xlsx'),index=False)
    dev_df.to_excel(os.path.join(output_dir, 'dev_df.xlsx'),index=False)
    test_df.to_excel(os.path.join(output_dir, 'test_df.xlsx'),index=False)

    # counting the size of each dataset
    print("train_df size: ", len(train_df))
    print("dev_df size: ", len(dev_df))
    print("test_df size: ", len(test_df))

    # counting the no. of verbs in each dataset
    n1=len(pd.unique(train_df['Verb']))
    n2=len(pd.unique(dev_df['Verb']))
    n3=len(pd.unique(test_df['Verb']))
    print("verb count in train, dev & test: ", n1,n2,n3)
    return [train_df, dev_df, test_df]


In [None]:

# Step 3: Tokenizing the sentence using BERT tokenizer

def tokenize(sentence):
    """
    This function tokenizes every sentence using the BertTokenizer class with the
    "bert-base-german-cased" model and stores the input_ids and the tokenized
    words in separate lists. The input_ids are right-padded with the tokenizer's
    padding id to the length of the longest sentence.

    Input: list of sentences
    Output: returns a list containing a 2d numpy array of padded input_ids and
            the (unpadded) tokenized words of all sentences. For an empty input
            an array with zero rows and an empty word list are returned.
    """

    # using "bert-base-german-cased" model, since the dataset is in German
    tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

    # raw tokenizer output, input_ids and tokenized words of every sentence
    tokens = [tokenizer(text) for text in sentence]
    inp_ids = [list(encoded["input_ids"]) for encoded in tokens]
    token_wrds = [tokenizer.convert_ids_to_tokens(ids) for ids in inp_ids]

    # nothing to pad or to display when no sentence was given
    if not inp_ids:
        return [np.empty((0, 0), dtype=int), token_wrds]

    # padding the input ids with the tokenizer's own padding id (falls back to
    # 0, the value used so far, if the tokenizer does not define one)
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0
    input_ids_max= max(map(len,inp_ids))
    print("Maximum no. of tokens",input_ids_max)
    input_ids_pad = [ids + [pad_id]*(input_ids_max-len(ids)) for ids in inp_ids]

    # showing one sample; the 7th sentence is used when there are that many,
    # otherwise the last one, so that small inputs do not raise an IndexError
    sample = min(6, len(tokens) - 1)
    print("Output of Bert tokenizer for sample sentence:", tokens[sample])
    print("Input_ids: ",input_ids_pad[sample])
    print("Tokenized words of input_ids: ",token_wrds[sample])

    # converting the input_ids to numpy arrays in order to pass it into bert layer.
    inpIds=np.array(input_ids_pad)
    return [inpIds,token_wrds]



In [None]:
# Step 4: Creating output labels i.e., psych and non-psych which are labelled as 1 and 0 respectively.

def get_out_labels(labels):
    """
    This function converts the string labels (psych/non-psych) into a numpy array
    of integers using LabelEncoder, i.e. 1 for psych and 0 for non-psych.

    The encoder is fitted on the two known classes rather than on `labels` itself.
    The mapping is therefore the same for every dataset, even one that happens to
    contain only a single class (fitting on the data would then encode that class
    as 0 regardless of which one it is).

    Input: list of 'non-psych' column values ("psych" / "non-psych")
    Output: returns a numpy array of output labels
    Raises: whatever LabelEncoder.transform raises for a label other than the two
            known classes (a ValueError in scikit-learn).
    """

    label = preprocessing.LabelEncoder()
    # classes are stored in sorted order: "non-psych" -> 0, "psych" -> 1
    label.fit(["non-psych", "psych"])
    outLabels = label.transform(labels)
    print("no. of output labels", len(outLabels))
    print("Sample output labels", outLabels[0:3])
    return outLabels

In [None]:
# Step 5: Finding the position of target verb in tokenized words 

# Step 5.1: Getting verb data such as verbs, verb count, previous & next words of a verb

def get_verb_data(sentence, token_id):
    """
    This function gets the data of the verb whose position = tokenID-1 (in the
    whitespace-split sentence) and stores the data in individual lists.

    For a verb that occurs more than once in its sentence, the previous and the
    next word of the target occurrence are recorded so that it can be told apart
    from the other occurrences later. "[CLS]" stands in for the previous word when
    the verb is the first word, and "[SEP]" stands in for the next word when the
    verb is the last word (previously that case raised an IndexError).

    Input: list of sentences, list of values of 'Token_ID' column
    Output: returns a list containing
            - the verbs,
            - the count of the verb in its sentence,
            - [previous, next] word pairs for repeated verbs that are not in first position,
            - [previous, next] word pairs for repeated verbs that are in first position,
            - the words of every split sentence.
    """
    # list for storing the verbs
    vrbs = []
    # list to store no. of occurences of a verb in each sentence
    vrb_cnt = []
    # list to store the previous & next words of a verb when it occurs more than once in a sentence
    pn_lst=[]
    # list to store the previous & next words of a verb when it occurs more
    # than once as well as in first position in a sentence
    pn1_lst=[]
    # list to store the words list of every sentence
    wrds_cnt=[]

    # finding the verbs whose position = tokenID-1 and storing it in list
    for (text, tok_id) in zip(sentence, token_id):
        wrds_lst = text.split()
        vrb_idx = tok_id-1
        temp_vrb=wrds_lst[vrb_idx]
        vrbs.append(temp_vrb)

        # finding the verb count
        temp_cnt= wrds_lst.count(temp_vrb)
        vrb_cnt.append(temp_cnt)

        # finding previous and next words of a verb when its count is >1
        if temp_cnt>1 and vrb_idx>=0:
            # the verb may be the last word: there is no next word then, so the
            # "[SEP]" marker is used, mirroring "[CLS]" for the first position
            next_wrd = wrds_lst[tok_id] if tok_id < len(wrds_lst) else "[SEP]"
            if vrb_idx>0:
                pn_lst.append([wrds_lst[vrb_idx-1],next_wrd])
            else:
                pn1_lst.append(["[CLS]",next_wrd])
        wrds_cnt.append(wrds_lst)

    return [vrbs,vrb_cnt,pn_lst,pn1_lst,wrds_cnt]

In [None]:
# Step 5.2: Finding the sequenced position.

def find_verb_pos(token_words,verbs):
    """
    Locate, for every sentence, the token positions that may belong to its verb.

    A token is a candidate when its text (with the '##' word-piece marker removed)
    is a substring of the verb. If the verb itself is among the candidate tokens,
    the positions of all tokens equal to the verb are returned. Otherwise the
    first candidate token that (marker included) is a substring of the verb is
    taken as the start, and every candidate position from there on that is part
    of a pair of directly adjacent positions is returned in ascending order.

    Input: list of tokenized words (2nd output of step3), list of verbs (1st output of step5.1)
    Output: returns a list containing, per sentence, the list of candidate positions
            of the verb in the tokenized words (empty when nothing matches)
    """
    seq_position = []

    for tokens, verb in zip(token_words, verbs):
        # (position, token) pairs of all sub-word candidates; enumerate keeps
        # the index of duplicate tokens apart
        candidates = [(idx, tok) for idx, tok in enumerate(tokens)
                      if tok.replace('##', '') in verb]
        cand_tokens = [tok for _, tok in candidates]
        cand_pos = [idx for idx, _ in candidates]

        if verb in cand_tokens:
            # the verb was kept as a single token: take every exact occurrence
            chosen = [idx for idx, tok in candidates if tok == verb]
        else:
            chosen = []
            for start, tok in enumerate(cand_tokens):
                if tok not in verb:
                    continue
                # collect every position that takes part in an adjacent pair,
                # starting from the first token that is a piece of the verb
                tail = cand_pos[start:]
                adjacent = set()
                for left, right in zip(tail, tail[1:]):
                    if left + 1 == right:
                        adjacent.update((left, right))
                chosen = sorted(adjacent)
                break
        seq_position.append(chosen)

    return seq_position


In [None]:
# Sub function of below Step 5.3.

def find_exact_verb(pn_list,verb_count,seq_pos,token_words):
    """
    Pick the token positions of the target occurrence of a repeated verb.

    `seq_pos` is cut into `verb_count` equally sized consecutive slices, one per
    occurrence of the verb. The slices are checked in order: a slice is accepted
    when the token just before it is a substring of the expected previous word
    and the token just after it is a substring of the expected next word ('##'
    markers removed before comparing). If no slice matches, the last slice that
    was checked is returned.

    Input: [previous word, next word] of the target verb, no. of occurrences of
           the verb, candidate positions of the verb in the tokenized words,
           tokenized words of the sentence
    Output: returns the slice of `seq_pos` belonging to the chosen occurrence
    """
    # number of token positions per occurrence of the verb
    width = int(len(seq_pos) / verb_count)

    def neighbours_match(first, last):
        # token before the slice against the expected previous word ...
        before = token_words[seq_pos[first] - 1].replace('##', '')
        if before not in pn_list[0]:
            return False
        # ... and token after the slice against the expected next word
        after = token_words[seq_pos[last - 1] + 1].replace('##', '')
        return after in pn_list[1]

    start, stop = 0, width
    checked = 1
    # move on to the next slice until one matches or all slices were checked
    while not neighbours_match(start, stop) and checked < verb_count:
        start, stop = stop, stop + width
        checked += 1
    return seq_pos[start:stop]


In [None]:
# Step 5.3: Finding and getting the position of verbs which we need, if the same verb occurs more than once

def get_verb_pos(verb_count,token_id,seq_pos,token_words,pn_list,pn1_list):
    """
    This function finds the position of the target verb in the tokenized words of
    every sentence.

    When the same verb repeats in a sentence, the target occurrence is found by
    matching the previous and next words of the verb: those words are already
    available in a list, and the occurrence whose neighbours match them is the
    required verb (done by find_exact_verb). When the verb occurs once but has
    several candidate positions, only the first run of directly adjacent positions
    is kept. Otherwise the candidate positions are used as they are.

    Input: list of verb counts(2nd o/p of step5.1), list of values of 'Token_ID column', list of sequenced position(o/p of step5.2),
           list of tokenized words (2nd o/p of step3), list of previous and next words(3rd o/p of step5.1),
           list of list of previous and next words for verb which occupies 1st position in sentence(4th o/p of step5.1).
    Output: returns a list containing position of target verbs in tokenized words
            (an empty list when no sentence is given)
    """
    actual_pos=[]
    # running indices into pn_list / pn1_list: these lists only hold an entry
    # for the sentences that take the corresponding branch below
    val=0
    val1=0

    for count,tok_id,positions,words in zip(verb_count,token_id,seq_pos,token_words):
        if count>1 and tok_id-1 > 0:
            # repeated verb, target not in first position
            actual_pos.append(find_exact_verb(pn_list[val],count,positions,words))
            val=val+1
        elif count>1 and tok_id-1 == 0:
            # repeated verb, target is the first word of the sentence
            actual_pos.append(find_exact_verb(pn1_list[val1],count,positions,words))
            val1=val1+1
        elif len(positions) > 1:
            # keep only the first run of adjacent positions
            # NOTE(review): if the first two positions are not adjacent the
            # result is an empty list, i.e. the sentence ends up without a verb
            # position -- confirm that this is intended.
            run=[]
            for left,right in zip(positions, positions[1:]):
                if left+1 != right:
                    break
                run=sorted(set(run+[left,right]))
            actual_pos.append(run)
        else:
            actual_pos.append(positions)

    # showing one sample; the 7th sentence is used when there are that many,
    # otherwise the last one, so that small inputs do not raise an IndexError
    if actual_pos:
        sample = min(6, len(actual_pos) - 1)
        print("position of verb in tokenized words:", actual_pos[sample])
    return actual_pos



In [None]:
# Step 6: Processing step 3 & step 5 (5.1, 5.2, 5.3) at a time.

def get_bert_input(sentence,token_id):
    """
    Run step 3 and step 5 (5.1, 5.2, 5.3) in one go instead of calling them one by one.

    The sentences are tokenized, the verb data is collected, the candidate
    positions of each verb are searched in the tokenized words and finally the
    positions of the target verbs are selected.

    Input: list of sentences, list of values of 'Token_ID' column
    Output: returns a list containing input_ids and verb position in tokenized words
    """
    # step 3: padded input_ids and tokenized words
    input_ids, token_words = tokenize(sentence)

    # step 5.1: verbs, their counts and their neighbouring words
    verbs, verb_counts, prev_next, prev_next_first, _ = get_verb_data(sentence, token_id)

    # step 5.2 and 5.3: candidate positions, then the target verb positions
    candidate_pos = find_verb_pos(token_words, verbs)
    verb_pos = get_verb_pos(
        verb_counts, token_id, candidate_pos, token_words, prev_next, prev_next_first
    )
    return [input_ids, verb_pos]

In [None]:
# Step 7: Getting the Bert embeddings

def get_bert(bert_input_ids, batch_size=None, pad_token_id=0):
  """
  This function runs the "bert-base-german-cased" model and gets the whole
  embeddings from it.

  An attention mask is built from the input_ids (1 for real tokens, 0 for
  padding) and passed to the model. Without it the model treats the padding
  positions as real tokens, so the embeddings of a sentence would depend on how
  much padding was appended to it.

  Input:
    bert_input_ids: numpy arrays of padded input_ids (1st output of step6)
    batch_size: optional no. of sentences per forward pass. With the default
                (None) all sentences are passed at once, as before; a positive
                integer processes them in chunks, which needs less memory.
    pad_token_id: id that was used for padding the input_ids (0 by default)
  Output: last hidden state output of bert, which is a 3d tensor containing
          embeddings of all sentences
  Raises: ValueError if batch_size is given but is not positive
  """
  if batch_size is not None and batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")

  bertModel = TFBertModel.from_pretrained("bert-base-german-cased")

  # 1 where there is a real token, 0 where the sequence was padded
  attention_mask = (np.asarray(bert_input_ids) != pad_token_id).astype("int32")
  n_rows = attention_mask.shape[0]

  # single forward pass over all sentences
  if batch_size is None or n_rows == 0:
    bert_output = bertModel(bert_input_ids, attention_mask=attention_mask)
    return bert_output[0]

  # chunked forward passes; the last hidden states are joined again afterwards
  chunks = []
  for start in range(0, n_rows, batch_size):
    stop = start + batch_size
    bert_output = bertModel(bert_input_ids[start:stop],
                            attention_mask=attention_mask[start:stop])
    chunks.append(bert_output[0])
  l_h_s = tf.concat(chunks, axis=0)
  return l_h_s
  

In [None]:
# Step 8: Selecting only sentence and verb embeddings from the Bert embeddings 

def get_sent_and_verb_embed(bert_embedding,no_of_sentences,verb_pos):
  """
  This function retrieves only the CLS embedding and the verb embeddings from the
  whole bert embedding.

  For every sentence the embedding at position 0 (CLS) is taken, followed by the
  embeddings at the positions of the verb. Sentences with fewer verb positions
  than the longest one are padded when the ragged result is converted to a
  dense tensor.

  Input:
    bert_embedding: 3d tensor containing bert embedding (o/p of step7)
    no_of_sentences: either the no. of sentences as an integer or a sequence
                     with one entry per sentence (its length is used)
    verb_pos: position of verb in tokenized words (2nd o/p of step6)
  Output: 3d tensor containing CLS embedding and verb embedding
  """
  # accept both a plain count and the sequence of sentences itself
  if isinstance(no_of_sentences, (int, np.integer)):
    n_sentences = int(no_of_sentences)
  else:
    n_sentences = len(no_of_sentences)

  temp_tensors=[]
  for i in range(n_sentences):
    # CLS embedding first, then one embedding per verb position
    rows=[bert_embedding[i,0,:]]
    rows.extend(bert_embedding[i,p,:] for p in verb_pos[i])
    temp_tensors.append(tf.stack(rows,axis=0))
  ragged_embed=tf.ragged.stack(temp_tensors,axis=0)

  sentVerbEmbd=ragged_embed.to_tensor()

  # showing one sample; the 7th sentence is used when there are that many,
  # otherwise the last one, so that small inputs do not raise an IndexError
  if n_sentences > 0:
    sample = min(6, n_sentences - 1)
    print("bert embedding shape",sentVerbEmbd[sample].shape)
    print("sample embedding",sentVerbEmbd[sample])
  return sentVerbEmbd

In [None]:
# Step 9: Building the model.

def build_model1(max_tokens=6, hidden_size=768, learning_rate=0.001):
    """
    This function builds the neural network model for psych verbs classification
    using the keras API of the tensorflow library.

    The model takes the sentence (CLS) and verb embeddings from bert, flattens
    them and feeds them into a single sigmoid unit. It is compiled with the
    RMSprop optimizer, binary cross-entropy loss and the accuracy metric.

    Input:
        max_tokens: no. of embedding rows per sentence, i.e. the CLS embedding
                    plus the maximum no. of verb pieces (default 6)
        hidden_size: size of one embedding vector (default 768)
        learning_rate: learning rate of the RMSprop optimizer (default 0.001)
    Output: model, that contains input, flattening and output layer.
    """
    # input layer which receives sentence and verb embeddings(from bert) as input.
    inPut = tf.keras.Input(shape=(max_tokens,hidden_size), dtype=tf.float32)

    # flattening the input
    flatten=tf.keras.layers.Flatten()(inPut)

    # classifier layer
    outPut = tf.keras.layers.Dense(1, activation="sigmoid")(flatten)

    # defining the model
    model1 = tf.keras.models.Model(inPut,outPut)

    # compiling the model
    model1.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=learning_rate),
            loss="binary_crossentropy",
            metrics=["accuracy"])

    model1.summary()
    return model1
