In [None]:
import spacy
import pandas as pd
import numpy as np



In [None]:
df = pd.read_csv("Social_media_profiles.csv")

In [None]:
# Load the English language model
nlp = spacy.load("en_core_web_sm")

def filter_names(text):
    """Return the set of PERSON entity strings that spaCy finds in ``text``.

    Parameters
    ----------
    text : str
        Text to run through the notebook-level ``nlp`` pipeline. ``None`` and
        float NaN (the missing-value marker pandas uses in object columns) are
        treated as "no text" and yield an empty set instead of raising inside
        spaCy.

    Returns
    -------
    set of str
        The surface text of every entity labelled ``"PERSON"``. Because the
        result is a set, repeated mentions collapse and order is not kept.
    """
    # Missing cells reach this function when it is used with Series.apply;
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return set()
    doc = nlp(text)
    return {entity.text for entity in doc.ents if entity.label_ == "PERSON"}



In [None]:
text = "John and Mary went to the park. They met with Mr. Smith."
names = filter_names(text)
print(names)

{'Mary', 'John', 'Smith'}


In [None]:
df.head()

Unnamed: 0,Person_id,Account_Name,Bio,Profile_photo_address,Verified,Ground_truth_label
0,1,Jeff Bezos,Amazon. Blue Origin. Washington Post. Bezos Ea...,https://pbs.twimg.com/profile_images/159155831...,True,Real
1,2,Jeff Bezos Private ,Amazon. Blue Origin. Washington Post. Bezos Ea...,https://pbs.twimg.com/profile_images/170164714...,False,Fake
2,3,Elon Musk,,https://pbs.twimg.com/profile_images/178004448...,True,Real
3,4,Jennifer Widom,Dean @StanfordEng,http://cs.stanford.edu/people/widom/photos/por...,True,Real
4,5,Lisa Randall,Physicist and author. Dark Matter and the Dino...,https://pbs.twimg.com/profile_images/476919936...,True,Real


In [None]:
df['Account_Name'].apply(filter_names)

0            {Jeff Bezos}
1                      {}
2             {Elon Musk}
3        {Jennifer Widom}
4          {Lisa Randall}
5            {Yann LeCun}
6             {Elon Musk}
7            {Yann LeCun}
8                      {}
9                      {}
10                     {}
11    {Leonardo DiCaprio}
12          {Max Tegmark}
13            {Andrew Ng}
14          {Max Tegmark}
15            {Andrew Ng}
16      {Geoffrey Hinton}
17    {Leonardo DiCaprio}
18       {Demis Hassabis}
19                     {}
20           {Fei-Fei Li}
21        {Daphne Koller}
22        {Daphne Koller}
Name: Account_Name, dtype: object

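Rows whose account name contains extra non-name text (e.g. row 1, "Jeff Bezos Private") come back as an empty set, so the real name is lost along with the extra text.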

In [None]:
test_case = df['Account_Name'][1]

In [None]:
type(test_case)

str

In [None]:
doc = nlp(test_case)
# Print every entity spaCy detects in the problematic account name, whatever
# its label (no PERSON filter here), to see what the model recognises.
for entity in doc.ents:
    print(entity)


In [None]:
doc

Jeff Bezos Private 

In [None]:
doc.ents

()

In [None]:
test_case

'Jeff Bezos Private \uea00'

In [None]:
doc = nlp('Jeff Bezos Private')

In [None]:
doc.ents

(Jeff Bezos Private,)

In [None]:
doc = nlp(text)

In [None]:
doc.ents

(John, Mary, Smith)

In [None]:
text = "John Smith and Mary Sue went to the park. They met with Mr. Smith."

In [None]:
doc = nlp(text)
doc.ents

(John Smith, Mary Sue, Smith)

Clearly the text needs to be broken down into individual tokens.

In [None]:
def filter_names(text):
    """Return the text of every token that is part of a PERSON entity.

    NOTE: this redefines (and therefore shadows) the entity-level
    ``filter_names`` from the earlier cell, which returned a set of whole
    entity strings; this version returns individual tokens in document order.

    Parameters
    ----------
    text : str
        Text to run through the notebook-level ``nlp`` pipeline. ``None`` and
        float NaN (the missing-value marker pandas uses in object columns) are
        treated as "no text" and yield an empty list instead of raising inside
        spaCy.

    Returns
    -------
    list of str
        Token texts whose ``ent_type_`` is ``"PERSON"``, in order of
        appearance; duplicates are kept.
    """
    # Missing cells reach this function when it is used with Series.apply;
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return [token.text for token in nlp(text) if token.ent_type_ == "PERSON"]

text = "Dr. Ray Patel is a professor at the university."
names = filter_names(text)
print(names)


['Ray', 'Patel']


In [None]:
nlp(text)

Dr. Ray Patel is a professor at the university.

In [None]:
for token in doc:
    print(token)

John
Smith
and
Mary
Sue
went
to
the
park
.
They
met
with
Mr.
Smith
.


In [None]:
text = "John Smith and Mary Sue went to the park. They met with Mr. Smith."

In [None]:
filter_names(text)

['John', 'Smith', 'Mary', 'Sue', 'Smith']

This clearly seems more promising

In [None]:
df['Account_Name'].apply(filter_names)

0            [Jeff, Bezos]
1                       []
2             [Elon, Musk]
3        [Jennifer, Widom]
4          [Lisa, Randall]
5            [Yann, LeCun]
6             [Elon, Musk]
7            [Yann, LeCun]
8                       []
9                       []
10                      []
11    [Leonardo, DiCaprio]
12          [Max, Tegmark]
13            [Andrew, Ng]
14          [Max, Tegmark]
15            [Andrew, Ng]
16      [Geoffrey, Hinton]
17    [Leonardo, DiCaprio]
18       [Demis, Hassabis]
19                      []
20       [Fei, -, Fei, Li]
21        [Daphne, Koller]
22        [Daphne, Koller]
Name: Account_Name, dtype: object

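The same problem remains: the rows that came back empty with the entity-level version are still empty with the token-level version.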

In [None]:
test_case

'Jeff Bezos Private \uea00'

In [None]:
doc = nlp(test_case)

In [None]:
for i in doc:
    print(i)

Jeff
Bezos
Private



In [None]:
for i in doc:
    print(i.ent_type_)







In [None]:
doc = nlp('Jeff Bezos took a walk')

In [None]:
for i in doc:
    print(i)

Jeff
Bezos
took
a
walk


In [None]:
for i in doc:
    print(i.ent_type_)

PERSON
PERSON





Clearly there is some strange discrepancy here.

In [None]:
test_case  # bare expression in the middle of a cell: evaluated but not displayed
doc = nlp(test_case)
# First pass: print the text of each token spaCy produced for the account name.
for i in doc:
    print(i)
# Second pass: print each token's entity type; a blank line means the token
# was not assigned to any entity.
for i in doc:
    print(i.ent_type_)

Jeff
Bezos
Private







In [None]:
doc = nlp("Jeff Bezos Private \uea00")
for i in doc:
    print(i)
for i in doc:
    print(i.ent_type_)



Jeff
Bezos
Private







In [None]:
test_case

'Jeff Bezos Private \uea00'

In [None]:
from name_filtering import return_name_list

In [None]:
return_name_list('Official Jeff Bezos')

'jeffbezos'

In [None]:
df['Account_Name'].apply(return_name_list)

0       jeffbezos
1       jeffbezos
2                
3        jennifer
4     lisarandall
5            yann
6                
7            yann
8                
9                
10               
11       leonardo
12            max
13         andrew
14            max
15         andrew
16               
17       leonardo
18               
19               
20      fei-feili
21         daphne
22         daphne
Name: Account_Name, dtype: object

In [None]:
df['Account_Name']

0                  Jeff Bezos
1        Jeff Bezos Private 
2                   Elon Musk
3              Jennifer Widom
4                Lisa Randall
5                  Yann LeCun
6           Elon Musk (Memes)
7              Dr. Yann LeCun
8                     BEYONCÉ
9         Ariana Grande Today
10              Ariana Grande
11          Leonardo DiCaprio
12                Max Tegmark
13                  Andrew Ng
14                Max Tegmark
15                  Andrew Ng
16            Geoffrey Hinton
17          Leonardo DiCaprio
18             Demis Hassabis
19    Demis Hassabis Official
20                 Fei-Fei Li
21              Daphne Koller
22    Professor Daphne Koller
Name: Account_Name, dtype: object

So far NER seems like a very poor way to go. Next I will try using a rule-based system on a dataset of first and last names.

In [None]:
from names_dataset import NameDataset, NameWrapper

In [None]:
nd = NameDataset()

In [None]:
print(NameWrapper(nd.search('Widom')).describe)

, 


In [None]:
'Hassabis' in nd.last_names

False

In [None]:
'Widom' in nd.last_names

False

It would seem that many first and last names are simply not in the corpus. That leaves other options to consider, such as going through all the official verified accounts and adding their first and last names to the corpus.

The real problem is if these accounts are just associated with a username, how do we extract first and last name from that? As it turns out, doing this with NER is much harder than one would hope. One could think about using an LLM and that would probably work very well but it would be expensive.

Probably the best solution is to require verified users to provide their first and last name.

It will be useful to again look at the sample data:

In [None]:
df

Unnamed: 0,Person_id,Account_Name,Bio,Profile_photo_address,Verified,Ground_truth_label
0,1,Jeff Bezos,Amazon. Blue Origin. Washington Post. Bezos Ea...,https://pbs.twimg.com/profile_images/159155831...,True,Real
1,2,Jeff Bezos Private ,Amazon. Blue Origin. Washington Post. Bezos Ea...,https://pbs.twimg.com/profile_images/170164714...,False,Fake
2,3,Elon Musk,,https://pbs.twimg.com/profile_images/178004448...,True,Real
3,4,Jennifer Widom,Dean @StanfordEng,http://cs.stanford.edu/people/widom/photos/por...,True,Real
4,5,Lisa Randall,Physicist and author. Dark Matter and the Dino...,https://pbs.twimg.com/profile_images/476919936...,True,Real
5,6,Yann LeCun,Professor at NYU. Chief AI Scientist at Meta.\...,https://pbs.twimg.com/profile_images/148357786...,True,Real
6,7,Elon Musk (Memes),This account is not affiliated with the real E...,https://pbs.twimg.com/profile_images/174140309...,False,Fan
7,8,Dr. Yann LeCun,Head of AI at Meta,https://upload.wikimedia.org/wikipedia/commons...,False,Fake
8,9,BEYONCÉ,,https://pbs.twimg.com/profile_images/177011918...,True,Real
9,10,Ariana Grande Today,the #1 fan source for the latest ariana grande...,https://pbs.twimg.com/profile_images/176614057...,False,Fan


I think that a cross join is a very powerful option here; the problem is how expensive it is. With a cross join we can get all combinations, and then use a WHERE clause (or equivalent) to filter the results down to those where the usernames match, or use a regex to keep only close matches. Because that is expensive, I'm going to write a different program for extracting first and last names from non-verified users, and assume that first and last names are provided for the verified users.
By far the best approach is:

1) Split the username into substrings on spaces, underscores, or any other chosen delimiter.

2) Only retain those substrings that are in the list of first and/or last names of verified users.

3) Whichever substring comes first is recognized as first name, and whichever one is last is the last name

4) Most likely everything should be in lower-case in order to run this

In [None]:
from name_filtering import split_string

In [None]:
split_string('sfdfsfd sdfsdf_sdf')

['sfdfsfd', 'sdfsdf', 'sdf']

In [None]:
#creating a function to extract

In [None]:
def extract_names2(word_list, known_names=None):
    """Keep only the substrings of a username that are known names.

    Parameters
    ----------
    word_list : iterable of str
        Substrings of a username (for example the output of ``split_string``).
    known_names : container of str, optional
        Names to keep. When omitted, the notebook-level ``name_list`` is used;
        that global is not defined anywhere in this notebook, so it must be
        created before calling the function without this argument.

    Returns
    -------
    list of str
        The elements of ``word_list`` found in the known names, in their
        original order (so the first element is the candidate first name and
        the last element the candidate last name).
    """
    if known_names is None:
        # Backward-compatible fallback to the global the original code relied on.
        known_names = name_list
    first_last = [word for word in word_list if word in known_names]
    # The original built this list but never returned it, so callers got None.
    return first_last


In [None]:
'Jeff Bezos Private '.lower()

'jeff bezos private \uea00'

Further testing using the new paradigm will proceed in a new notebook: "testing custom rule-based method for name extraction". Furthermore, the dataset will be updated to reflect the decision to obtain first and last names from verified users through user input.
