In [1]:
import os
import pandas as pd
import numpy as np
import nltk

### Using corpus of names

In [2]:
def format_names(female_fn='../names/female.txt', male_fn='../names/male.txt'):
    """Load the female and male name lists into one lower-cased DataFrame.

    Each file is read with ``pd.read_csv(..., skiprows=4)``: the first four
    lines are skipped and the next line is consumed as the column header,
    which is then renamed to ``FemaleNames`` / ``MaleNames``.

    Parameters
    ----------
    female_fn : str, optional
        Path to the female names file. Defaults to the path that used to be
        hard-coded, so existing ``format_names()`` calls behave as before.
    male_fn : str, optional
        Path to the male names file. Same default behaviour as above.

    Returns
    -------
    pandas.DataFrame
        Columns ``FemaleNames`` and ``MaleNames`` aligned on the row index
        (via ``combine_first``), with every string value lower-cased and
        non-string values (e.g. NaN padding) left untouched.
    """
    males = pd.read_csv(male_fn, skiprows=4)
    males = males.rename(columns={males.columns[0]: 'MaleNames'})

    females = pd.read_csv(female_fn, skiprows=4)
    females = females.rename(columns={females.columns[0]: 'FemaleNames'})

    merged = females.combine_first(males)

    def _lower(value):
        # Only strings are lower-cased; NaN padding passes through unchanged.
        return value.lower() if isinstance(value, str) else value

    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    return merged.apply(lambda column: column.map(_lower))

In [12]:
# Build the lower-cased names table once so the cells below can reuse it.
merged = format_names()

In [3]:
def check_name_sex(name, merged):
    """Classify ``name`` by looking it up in the male and female name columns.

    Parameters
    ----------
    name : str
        Name to look up; it is lower-cased before the comparison.
    merged : pandas.DataFrame
        Table with ``MaleNames`` and ``FemaleNames`` columns.

    Returns
    -------
    str
        ``'ambiguous'`` if the name is in both columns, ``'male'`` or
        ``'female'`` if it is in exactly one, ``'unknown'`` otherwise.
        The verdict is also printed.
    """
    lowered = name.lower()
    in_male = bool(merged['MaleNames'].eq(lowered).any())
    in_female = bool(merged['FemaleNames'].eq(lowered).any())

    # Verdict keyed by (found among male names, found among female names).
    verdicts = {
        (True, True): 'ambiguous',
        (True, False): 'male',
        (False, True): 'female',
        (False, False): 'unknown',
    }
    val = verdicts[(in_male, in_female)]

    print('The name {0} was decided to be {1}'.format(lowered, val))
    return val
        

In [6]:
# Names used to compare the two gender-lookup approaches.
benchmark_names = [
    'john',
    'mary',
    'jean',
    'ali',
    'susan',
    'mamta',
    'harry',
    'hermione',
    'ron',
    'peter',
    'marvin',
    'julie',
    'jeremy',
    'salvador',
]

In [7]:
# Classify every benchmark name against the names table; each call prints
# its own verdict, so the return value is not needed here.
for candidate in benchmark_names:
    check_name_sex(candidate, merged)

The name john was decided to be male
The name mary was decided to be female
The name jean was decided to be ambiguous
The name ali was decided to be ambiguous
The name susan was decided to be female
The name mamta was decided to be unknown
The name harry was decided to be male
The name hermione was decided to be female
The name ron was decided to be male
The name peter was decided to be male
The name marvin was decided to be male
The name julie was decided to be ambiguous
The name jeremy was decided to be male
The name salvador was decided to be male


### Using gender guesser

In [8]:
import gender_guesser.detector as gender
d = gender.Detector()

# Each benchmark name is capitalised before it is handed to the detector;
# the printed line keeps the original lower-case spelling.
report_line = "The name {0} was decided to be {1}"
for candidate in benchmark_names:
    guess = d.get_gender(candidate.capitalize())
    print(report_line.format(candidate, guess))

The name john was decided to be male
The name mary was decided to be mostly_female
The name jean was decided to be male
The name ali was decided to be male
The name susan was decided to be female
The name mamta was decided to be female
The name harry was decided to be male
The name hermione was decided to be unknown
The name ron was decided to be male
The name peter was decided to be male
The name marvin was decided to be male
The name julie was decided to be female
The name jeremy was decided to be male
The name salvador was decided to be male


### Reading from bioRxiv

In [1]:
from bs4 import BeautifulSoup
import requests
import re

In [2]:
# Link to the article page that the cells below download and parse.
page_link = 'https://t.co/EtiFK4DUZe'

scratch space

In [3]:
import pandas as pd

# Download the article page and collect all of its <meta> tags.
# The URL comes from `page_link` rather than a second hard-coded copy, and
# the request carries the same 5-second timeout that extract_name uses so a
# stalled connection cannot hang the kernel.
page_response = requests.get(page_link, timeout=5)
# Stop here on an HTTP error instead of parsing an error page.
page_response.raise_for_status()
page_content = BeautifulSoup(page_response.content, "html.parser")
all_meta_info = page_content.find_all("meta")
listify = list(all_meta_info)

In [5]:
def words_in_line(l):
    """Return every double-quoted substring in the string form of ``l``.

    ``l`` is converted with ``str()`` first, so any object can be passed.
    The surrounding quotes are not included in the returned strings.
    """
    quoted = re.compile(r'"(.*?)"')
    return quoted.findall(str(l))
    
# One list of quoted strings per <meta> tag, in the same order as `listify`.
vals = [words_in_line(tag) for tag in listify]

In [14]:
# For every tag whose quoted strings include the wanted meta name, keep the
# first quoted string of that tag.
date_info = [
    quoted[0]
    for quoted in vals
    if "DC.Date" in quoted
]
title_info = [
    quoted[0]
    for quoted in vals
    if "citation_title" in quoted
]

In [19]:
# Display the parsed page to inspect its markup.
page_content

<!DOCTYPE html>

<html dir="ltr" lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:mml="http://www.w3.org/1998/Math/MathML">
<head prefix="og: http://ogp.me/ns# article: http://ogp.me/ns/article# book: http://ogp.me/ns/book#">
<!--[if IE]><![endif]-->
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<link href="//stats.g.doubleclick.net" rel="dns-prefetch"/>
<link href="//scholar.google.com" rel="dns-prefetch"/>
<link href="//d33xdlntwy0kbs.cloudfront.net" rel="dns-prefetch"/>
<link href="//www.google-analytics.com" rel="dns-prefetch"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=3, minimum-scale=1, user-scalable=yes" name="viewport"/>
<link href="https://www.biorxiv.org/sites/default/files/images/favicon.ico" rel="shortcut icon" type="image/vnd.microsoft.icon"/>
<meta content="article" name="type"/>
<meta content="/biorxiv/early/2019/04/06/600668.atom" name="HW.identifier"/>
<meta content="biorxiv;600668v1" name="HW.pisa"/>
<meta content="t

In [51]:
# First quoted string of every tag that mentions 'DC.Contributor'.
allnames = [
    quoted[0]
    for quoted in vals
    if 'DC.Contributor' in quoted
]

In [52]:
# Show the collected contributor names.
allnames

['Xuhui Luo', 'Mingkuang Wang', 'Guiping Hu', 'Boqi Weng']

end of scratch space

In [55]:
def extract_name(page_link):
    """Return the first name of the first contributor listed on a page.

    The page is downloaded and parsed, and the first
    ``<meta name="DC.Contributor" content="...">`` tag is read. The first
    whitespace-separated token of its ``content`` value is returned.

    Parameters
    ----------
    page_link : str
        URL of the article page.

    Returns
    -------
    str
        First token of the first contributor's name, in its original case.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no ``DC.Contributor`` meta tag or the tag carries
        no name.
    """
    page_response = requests.get(page_link, timeout=5)
    # Fail loudly on HTTP errors instead of parsing an error page.
    page_response.raise_for_status()
    page_content = BeautifulSoup(page_response.content, "html.parser")

    # Select the tag by its name attribute rather than by position: a fixed
    # index into the list of <meta> tags breaks as soon as a page emits a
    # different number of tags. `attrs=` is required because `name` is
    # already the keyword for the tag name in find().
    contributor = page_content.find("meta", attrs={"name": "DC.Contributor"})
    if contributor is None:
        raise ValueError(
            'No DC.Contributor meta tag found at {0}'.format(page_link))

    # Read the attribute directly instead of regex-parsing the tag's string
    # form, which depends on the order the attributes are serialised in.
    name_parts = contributor.get("content", "").split()
    if not name_parts:
        raise ValueError(
            'DC.Contributor meta tag at {0} has no name'.format(page_link))
    return name_parts[0]

In [16]:
# First name extracted from the page behind `page_link`.
name_link = extract_name(page_link)

In [53]:
def retweet_or_no(link):
    """Return True when the name extracted from ``link`` is labelled 'female'.

    The names table is rebuilt with ``format_names()`` on every call, the
    first name is pulled from the page with ``extract_name``, and the label
    comes from ``check_name_sex`` (which also prints its verdict). Every
    label other than ``'female'`` yields False.
    """
    names_table = format_names()
    first_name = extract_name(link)
    return check_name_sex(first_name, names_table) == 'female'

In [56]:
# Retweet decision for the page behind `page_link`: True only when the
# extracted first name is labelled 'female'.
tweet = retweet_or_no(page_link)
print(tweet)

The name xuhui was decided to be unknown
False
