In [1]:
import pandas as pd
import numpy  as np

import json
import os

from tqdm import tqdm

### Loading lists of first and last Arabic names

In [2]:
# Unpack the stored pair of name collections: first names, then last names.
# NOTE(review): allow_pickle=True can execute arbitrary code while unpickling;
# only load 'list_names.npy' from a trusted source.
lFirstName, lLastName = np.load('list_names.npy', allow_pickle=True)
# Show how many entries each collection holds.
tuple(len(lNames) for lNames in (lFirstName, lLastName))

(24224, 3069)

### Loading dataset JSONL files to DF

In [3]:
def load_ner_folder(folder_path, progress=None):
    """Read every token/label file under ``folder_path`` into one token-level DataFrame.

    Each non-blank line contributes its first whitespace-separated field as the
    token and its second field as the label. A blank line closes the current
    sentence. A sentence still open at the end of a file is kept as well
    (previously it was silently dropped when the file did not end with a blank
    line).

    Parameters
    ----------
    folder_path : str
        Root directory; it is walked recursively with ``os.walk``.
    progress : callable, optional
        Wrapper applied to each directory's sorted file list (e.g. ``tqdm``).
        ``None`` iterates the plain list.

    Returns
    -------
    pd.DataFrame
        Columns ``sentence`` (the token), ``label`` and ``senIdx``. ``senIdx``
        is a running counter that is incremented at every blank line (so the
        first sentence gets 1 and an empty sentence still consumes a number).
        The index holds each token's position inside its own sentence.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two whitespace-separated fields.
    """
    lWords, lLabel, lSenIdx, lTokIdx = [], [], [], []
    iSentence = 0

    def store_sentence(lSenWords, lSenLabel, senIdx):
        # Accumulate plain lists and build the frame once at the end instead of
        # growing a DataFrame sentence by sentence (quadratic, and
        # DataFrame.append no longer exists in pandas >= 2.0).
        lWords.extend(lSenWords)
        lLabel.extend(lSenLabel)
        lSenIdx.extend([senIdx] * len(lSenWords))
        lTokIdx.extend(range(len(lSenWords)))

    for root, dirs, files in os.walk(folder_path, topdown=True):
        # Sorting in place makes the traversal (and therefore senIdx) reproducible.
        dirs.sort()
        files.sort()
        for file in (files if progress is None else progress(files)):
            file_name = os.path.join(root, file)
            lSenWords, lSenLabel = [], []
            with open(file_name, encoding="utf8") as f:
                for line in f:
                    lLine = line.split()
                    if lLine:
                        lSenWords.append(lLine[0])
                        lSenLabel.append(lLine[1])
                        continue
                    # Blank line: the sentence collected so far is complete.
                    iSentence += 1
                    store_sentence(lSenWords, lSenLabel, iSentence)
                    lSenWords, lSenLabel = [], []
            if lSenWords:
                # File ended without a closing blank line: keep the last sentence.
                iSentence += 1
                store_sentence(lSenWords, lSenLabel, iSentence)

    return pd.DataFrame(
        {'sentence': lWords, 'label': lLabel, 'senIdx': lSenIdx},
        index=lTokIdx,
    )


# One-off rebuild of the cached token-level frame; switch the guard to True to
# regenerate the pickle from the raw files.
if False:
    folder_path = 'ar/' #'he/'
    dfData      = load_ner_folder(folder_path, progress=tqdm)
    dfData.to_pickle('dfArData.pkl') #'dfHeData.pkl' for hebrew, 'dfArData.pkl' for arabic

### Loading DF of NER labeled sentences

In [5]:
# Restore the cached token-level frame written by the loading cell.
# NOTE(review): unpickling runs arbitrary code; only read a pickle you produced yourself.
dfData = pd.read_pickle('dfArData.pkl')
# Preview the first three token rows.
dfData.head(n=3)

Unnamed: 0,sentence,label,senIdx
0,"""",O,1
1,نحن,O,1
2,نشرح,O,1


### Building a NER dataset only for names

In [6]:
# Distinct sentence ids, in order of first appearance in dfData.
lSentences = dfData['senIdx'].unique().tolist()
# Number of distinct sentences.
len(lSentences)

30778

In [7]:
def build_person_sentences(dfData, progress=None):
    """Collapse the token-level frame into one row per sentence that mentions a person.

    A sentence is kept when at least one of its labels contains the substring
    ``"PER"``. The four flag columns use exact matches against the label list,
    so a composite label such as ``"I-TTL|B-PER"`` keeps the sentence but does
    not set ``bper``.

    Parameters
    ----------
    dfData : pd.DataFrame
        Token-level frame with columns ``sentence``, ``label`` and ``senIdx``.
    progress : callable, optional
        Wrapper applied to the groupby iterator (e.g. ``tqdm``). ``None``
        iterates without a progress display.

    Returns
    -------
    pd.DataFrame
        One row per kept sentence, in order of first appearance of ``senIdx``,
        with columns ``senIdx`` (int), ``sentence`` (list of tokens), ``label``
        (list of labels) and the 0/1 integer flags ``sper``, ``bper``, ``iper``,
        ``eper``.
    """
    lOutColumns = ['senIdx', 'sentence', 'label', 'sper', 'bper', 'iper', 'eper']
    # A single groupby pass replaces re-filtering the whole frame once per
    # sentence; sort=False keeps the sentences in order of first appearance.
    groups  = dfData.groupby('senIdx', sort=False)
    records = []
    for i, dfSen in (groups if progress is None else progress(groups)):
        label = dfSen.label.tolist()
        # check if there are person entities in the sentence
        if not any("PER" in s for s in label):
            continue
        records.append({
            'senIdx'   : int(i),
            'sentence' : dfSen.sentence.tolist(),
            'label'    : label,
            'sper'     : int('S-PER' in label),
            'bper'     : int('B-PER' in label),
            'iper'     : int('I-PER' in label),
            'eper'     : int('E-PER' in label),
        })
    # Building the frame once from the records keeps integer dtypes (row-wise
    # append produced floats) and works on pandas >= 2.0, where
    # DataFrame.append was removed.
    return pd.DataFrame(records, columns=lOutColumns)


# One-off rebuild of the cached person-sentence frame; switch the guard to True
# to regenerate the pickle from dfData.
if False:
    dfPersonSentences = build_person_sentences(dfData, progress=tqdm)
    dfPersonSentences.to_pickle('dfArPerSen.pkl') #'dfHePerSen.pkl' for hebrew, 'dfArPerSen.pkl' for arabic

In [16]:
# Column order used for the person-sentence frame from here on.
lColumns = ['senIdx', 'sentence', 'sper', 'bper', 'iper', 'eper', 'label']

# Reload the cached frame, put the columns in that order and replace the stored
# index with a fresh 0..n-1 range.
# NOTE(review): unpickling runs arbitrary code; only read a pickle you produced yourself.
dfPersonSentences = (
    pd.read_pickle('dfArPerSen.pkl')
      .loc[:, lColumns]
      .reset_index(drop=True)
)
dfPersonSentences.head(n=3)

Unnamed: 0,senIdx,sentence,sper,bper,iper,eper,label
0,5.0,"[رئيس, ال, مجلس،, عباس, أبو, عواد،, متفائلا, :...",0.0,1.0,1.0,1.0,"[B-TTL, I-TTL, E-TTL, B-PER, I-PER, E-PER, O, ..."
1,6.0,"[يقف, عباس, ابو, عواد, ،, رئيس, ال, مجلس, ال, ...",0.0,1.0,1.0,1.0,"[O, B-PER, I-PER, E-PER, O, B-TTL, I-TTL|B-ORG..."
2,8.0,"[و, بشر, عواد, ال, عائلة, أن, ه, ب, الاضافة, ا...",1.0,0.0,0.0,0.0,"[O, O, S-PER, O, O, O, O, O, O, O, O, O, O, O,..."


In [14]:
# Mean of each 0/1 flag, i.e. the share of person-sentences containing that tag.
# The row is selected by its label: a positional .iloc[1] would silently return
# a different statistic if the flag columns were ever not numeric.
dfPersonSentences[['sper','bper','iper','eper']].describe().loc['mean']

sper    0.340090
bper    0.612941
iper    0.203859
eper    0.613915
Name: mean, dtype: float64

In [54]:
# Keep only unambiguous names: first names that never occur as a last name and
# vice versa. Membership is tested against sets, so each lookup is O(1) instead
# of a linear scan of the other list; order and duplicates are unchanged.
sFirstName    = set(lFirstName)
sLastName     = set(lLastName)
lUpdFirstName = [x for x in lFirstName if x not in sLastName ]
lUpdLastName  = [x for x in lLastName  if x not in sFirstName]
len(lUpdFirstName), len(lUpdLastName)

(21719, 564)

In [67]:
def labeling(l,lUpdFirstName,lUpdLastName):
    """Tag each token of ``l`` as first name, last name or other.

    Parameters
    ----------
    l : iterable
        Tokens of one sentence.
    lUpdFirstName : container
        Tokens to tag ``'F'``; checked first.
    lUpdLastName : container
        Tokens to tag ``'L'``; only checked when the token is not a first name.

    Returns
    -------
    list of str
        One of ``'F'``, ``'L'`` or ``'O'`` per token, in input order.
    """
    def tag(token):
        if token in lUpdFirstName:
            return 'F'
        if token in lUpdLastName:
            return 'L'
        return 'O'

    return list(map(tag, l))

# Tag every token of every sentence as first name ('F'), last name ('L') or other ('O').
# The name lists are converted to sets once here so each token lookup inside
# labeling is O(1) rather than a linear scan of the name list.
dfPersonSentences['name_label'] = dfPersonSentences.sentence.apply(
    labeling, args=(set(lUpdFirstName), set(lUpdLastName),))
dfPersonSentences

Unnamed: 0,senIdx,sentence,sper,bper,iper,eper,label,name_label
0,5.0,"[رئيس, ال, مجلس،, عباس, أبو, اسكندر, متفائلا, ...",0.0,1.0,1.0,1.0,"[B-TTL, I-TTL, E-TTL, B-PER, I-PER, E-PER, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,6.0,"[يقف, عباس, ابو, عواد, ،, رئيس, ال, مجلس, ال, ...",0.0,1.0,1.0,1.0,"[O, B-PER, I-PER, E-PER, O, B-TTL, I-TTL|B-ORG...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,8.0,"[و, بشر, عواد, ال, عائلة, أن, ه, ب, الاضافة, ا...",1.0,0.0,0.0,0.0,"[O, O, S-PER, O, O, O, O, O, O, O, O, O, O, O,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,10.0,"[هذه, اديك, زيارة, هي, جزء, من, حملة, ', ال, ج...",0.0,1.0,0.0,1.0,"[O, O, O, O, O, O, O, O, B-MISC, I-MISC, I-MIS...","[F, L, O, O, O, O, F, O, O, O, O, O, O, F, O, ..."
4,29.0,"["", عندما, بدأ, ال, إغلاق, توقف, ال, ناس, عن, ...",0.0,1.0,0.0,1.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, F, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...,...,...,...,...
5126,30710.0,"[تم, توثيق, ال, حادث،, و, حظي, ب, استنكارات, و...",0.0,1.0,0.0,1.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[F, O, O, O, O, F, O, O, O, O, O, F, O, O, O, ..."
5127,30714.0,"[شارك, في, ال, زيارة, أيضا, نائب, ال, قائد, ال...",0.0,1.0,1.0,1.0,"[O, O, O, O, O, B-TTL, B-TTL|I-TTL, I-TTL|I-TT...","[O, O, O, O, O, F, O, O, O, O, O, O, O, O, O, ..."
5128,30743.0,"[أورلي, سيلفينجر, ،, ال, مدير, ال, عام, "", بطي...",0.0,1.0,0.0,1.0,"[B-PER, E-PER, O, B-TTL, I-TTL, I-TTL, E-TTL, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
5129,30766.0,"[في, رياض, ال, أطفال, يستخدم, "", بطيرم, "", قصة...",1.0,0.0,0.0,0.0,"[O, O, O, O, O, O, S-ORG, O, O, O, O, O, S-PER...","[O, O, O, O, O, O, O, O, F, F, O, O, O, O, O, ..."
