In [1]:
import yaml
import requests
from bs4 import BeautifulSoup
from pymed import PubMed
import urllib.request
import requests
import gdown
import os
import pandas as pd
from PIL import Image

### Publications

In [2]:
# url = 'https://weissmanlab.ucsf.edu/publications/publications.html'
# page = requests.get(url)

# soup = BeautifulSoup(page.text, 'html.parser')

# old_website_pubs = [(a.get('href'), a.text) for pub_div in soup.find_all(id='publication') for a in pub_div.find_all('a')]

In [2]:
# Ask PubMed for every record matching the PI's name (at most 500 hits) and
# materialise the lazy result so it can be counted and re-iterated.
pubmed = PubMed(email="lenail@mit.edu", tool="MyTool")
pubmed_hits = pubmed.query("Jonathan Weissman", max_results=500)
results = [hit for hit in pubmed_hits]
len(results)

252

In [4]:
# Flatten each PubMed result into a plain dict of the fields the site needs.
pubmed_pubs = []

for result in results:
    try:
        pubmed_pubs.append(
            dict(title=result.title,
                authors=[str(author['firstname'])+' '+str(author['lastname']) for author in result.authors],
                publication_date=result.publication_date.strftime("%d/%m/%y"),
                publication_year=result.publication_date.strftime("%Y"),
                # pubmed_id / doi can hold several newline-separated values; keep the first.
                pubmed_id=result.pubmed_id.split('\n')[0],
                abstract=result.abstract,
                doi=result.doi.split('\n')[0],
                journal=result.journal))

    except Exception as exc:
        # Records that lack a field (or hold None where a string/date is expected)
        # are skipped, not fatal. Say why, then dump whatever is available so the
        # entry can be reviewed by hand. getattr() with a default keeps the report
        # itself from raising when the missing attribute is the cause of the failure.
        print(f'Skipping record ({type(exc).__name__}: {exc})')
        for field in ('title', 'authors', 'publication_date', 'pubmed_id', 'abstract', 'doi', 'journal'):
            print(getattr(result, field, None))

In [5]:
# PubMed's verbose journal titles paired with the short display names used on the site.
_JOURNAL_ALIASES = (
    ('bioRxiv : the preprint server for biology', 'biorxiv'),
    ('Methods in molecular biology (Clifton, N.J.)', 'Methods in molecular biology'),
    ('Current biology : CB', 'Current biology'),
    ('Science (New York, N.Y.)', 'Science'),
    ('Proceedings of the National Academy of Sciences of the United States of America', 'PNAS'),
    ('Molecular & cellular proteomics : MCP', 'Molecular & cellular proteomics'),
)
replacements = dict(_JOURNAL_ALIASES)

In [6]:
# Swap verbose journal titles for their short form; titles without an alias are left as they are.
for pub in pubmed_pubs:
    pub['journal'] = replacements.get(pub['journal'], pub['journal'])

In [7]:
# Turn the list of per-publication dicts into a DataFrame (one row per publication).
# NOTE(review): this rebinds `pubmed_pubs` from a list to a DataFrame, so the cells
# above that iterate it as dicts must not be re-run after this one without rebuilding the list.
pubmed_pubs = pd.DataFrame.from_records(pubmed_pubs)

In [8]:
# Load the publication list currently on the site and flatten it into a DataFrame.
with open('_data/publications.yml', 'r') as publications_file:
    previous_pub_records = yaml.safe_load(publications_file)
old_pubs = pd.json_normalize(previous_pub_records)

#### New DOIs:

In [9]:
# PubMed rows whose DOI is not yet present in the site's publication list.
doi_already_on_site = pubmed_pubs['doi'].isin(old_pubs['doi'])
pubmed_pubs[~doi_already_on_site]


Unnamed: 0,title,authors,publication_date,publication_year,pubmed_id,abstract,doi,journal
0,FET fusion oncoproteins disrupt physiologic DN...,"[Asmin Tulpule, Shruti Menon, Marcus Breese, Y...",03/07/23,2023,37398210,The genetic principle of synthetic lethality i...,10.21203/rs.3.rs-2869150/v1,Research square


#### Updates to existing IDs:

In [10]:
# Tag every row with its origin so the two versions of a shared DOI can be told apart below.
pubmed_pubs['update'] = 'current'
old_pubs['update'] = 'previous'

# Stack the rows whose DOI exists in both sources (PubMed rows first, then site rows).
df = pd.concat([pubmed_pubs[pubmed_pubs.doi.isin(old_pubs.doi)],old_pubs[old_pubs.doi.isin(pubmed_pubs.doi)]]).reset_index(drop=True)
# keep=False discards every row that has an exact twin on the compared fields, so what is
# left are the rows that differ between 'current' and 'previous'. The comparison runs on a
# string-cast copy (presumably so list-valued columns such as `authors` can be compared),
# and the surviving index labels are used to display the original, un-cast rows.
df.loc[df.astype(str).drop_duplicates(subset=['title', 'authors', 'publication_date', 'publication_year', 'pubmed_id', 'abstract', 'doi', 'journal'], keep=False).index].sort_values(by='doi')

Unnamed: 0,title,authors,publication_date,publication_year,pubmed_id,abstract,doi,journal,update
333,Combinatorial genetics in liver repopulation a...,"[Kirk J Wangensteen, Yue J Wang, Zhixun Dou, A...",02/11/17,2017,29091290,Clustered regularly interspaced short palindro...,10.1002/hep.29626,Hepatology,previous
82,Combinatorial genetics in liver repopulation a...,"[Kirk J Wangensteen, Yue J Wang, Zhixun Dou, A...",02/11/17,2017,29091290,Clustered regularly interspaced short palindro...,10.1002/hep.29626,"Hepatology (Baltimore, Md.)",current
362,Targeting the AAA ATPase p97 as an Approach to...,"[Daniel J Anderson, Ronan Le Moigne, Stevan Dj...",12/11/15,2015,26555175,p97 is a AAA-ATPase with multiple cellular fun...,10.1016/j.ccell.2015.10.002,Cancer cell,previous
111,Targeting the AAA ATPase p97 as an Approach to...,"[Daniel J Anderson, Ronan Le Moigne, Stevan Dj...",12/11/15,2015,26555175,p97 is a AAA-ATPase with multiple cellular fun...,10.1016/j.ccell.2015.10.002,Cancer cell,current
406,Ribosome profiling provides evidence that larg...,"[Mitchell Guttman, Pamela Russell, Nicholas T ...",03/07/13,2013,23810193,Large noncoding RNAs are emerging as an import...,10.1016/j.cell.2013.06.009,Cell,previous
...,...,...,...,...,...,...,...,...,...
368,Pharmacological dimerization and activation of...,"[Carmela Sidrauski, Jordan C Tsai, Martin Kamp...",16/04/15,2015,25875391,The general translation initiation factor eIF2...,10.7554/eLife.07314,eLife,previous
113,Paradoxical resistance of multiple myeloma to ...,"[Diego Acosta-Alvear, Min Y Cho, Thomas Wild, ...",04/09/15,2015,26327694,"Hallmarks of cancer, including rapid growth an...",10.7554/eLife.08153,eLife,current
364,Paradoxical resistance of multiple myeloma to ...,"[Diego Acosta-Alvear, Min Y Cho, Thomas Wild, ...",04/09/15,2015,26327694,"Hallmarks of cancer, including rapid growth an...",10.7554/eLife.08153,eLife,previous
81,The Human Cell Atlas.,"[Aviv Regev, Sarah A Teichmann, Eric S Lander,...",06/12/17,2017,29206104,The recent advent of methods for high-throughp...,10.7554/eLife.27041,eLife,current


#### Removal of IDs:

In [11]:
# Site entries whose DOI no longer shows up in the PubMed results.
doi_still_on_pubmed = old_pubs['doi'].isin(pubmed_pubs['doi'])
old_pubs[~doi_still_on_pubmed]

Unnamed: 0,title,authors,publication_date,publication_year,pubmed_id,abstract,doi,journal,update


#### For now: keep the existing entries unchanged and add only the publications with new DOIs

In [12]:
# Newly found PubMed rows go first, followed by the existing site entries untouched.
new_pubs = pubmed_pubs[~pubmed_pubs.doi.isin(old_pubs.doi)]
pubmed_pubs = pd.concat([new_pubs, old_pubs]).drop('update', axis=1)

# One dict per publication, leaving out fields whose value prints as 'nan'.
pubs_records = []
for record in pubmed_pubs.to_dict(orient='records'):
    pubs_records.append({field: value for field, value in record.items() if str(value) != 'nan'})


In [13]:
# Serialise first: if yaml.dump raises, publications.yml has not been opened for
# writing yet and therefore is not left truncated.
publications_yaml = yaml.dump(pubs_records, default_flow_style=False, sort_keys=False)
# The context manager guarantees the handle is flushed and closed.
with open('_data/publications.yml', 'w') as publications_file:
    publications_chars_written = publications_file.write(publications_yaml)
# Displayed as the cell output: number of characters written.
publications_chars_written

395437

### Previous Site

In [3]:
people_url = 'https://weissmanlab.ucsf.edu/people/people.html'
alum_url = 'https://weissmanlab.ucsf.edu/people/alumni.html'
# Bound the wait and fail loudly on HTTP errors: without the status check an error
# page would be parsed silently and yield an empty roster further down.
people_page = requests.get(people_url, timeout=30)
people_page.raise_for_status()
alum_page = requests.get(alum_url, timeout=30)
alum_page.raise_for_status()

people_soup = BeautifulSoup(people_page.text, 'html.parser')
alum_soup = BeautifulSoup(alum_page.text, 'html.parser')

def get_text(elem, id):
    """Return the text of the descendant of `elem` found via `elem.find(id=id)`.

    Text fragments are joined with '|'. An empty string is returned when
    no such descendant is found.
    """
    match = elem.find(id=id)
    return match.get_text(separator='|') if match else ''

# Current members: one dict per 'peo_grid' element on the people page.
people = []
for person in people_soup.find_all(id='peo_grid'):
    people.append({
        'name': get_text(person, 'peo_name').replace('|', ' '),
        'pos': get_text(person, 'peo_pos'),
        'email': get_text(person, 'peo_email'),
        'img': person.find('img').get('src'),
    })

# Alumni: same idea, except the first 'peo_grid' element is left out and a
# missing <img> is recorded as None.
alums = []
for person in alum_soup.find_all(id='peo_grid')[1:]:
    portrait_tag = person.find('img')
    alums.append({
        'name': get_text(person, 'peo_name').replace('|', ' '),
        'pos': get_text(person, 'alum_pos'),
        'now': get_text(person, 'alum_current_2line'),
        'img': portrait_tag.get('src') if portrait_tag else None,
    })

# First/last name are the first and last space-separated tokens of the display name;
# the alum flag is stored as the strings 'false' / 'true'.
for roster, alum_flag in ((people, 'false'), (alums, 'true')):
    for p in roster:
        name_tokens = p['name'].split(' ')
        p['first_name'] = name_tokens[0]
        p['last_name'] = name_tokens[-1]
        p['alum'] = alum_flag

In [4]:
# Copy each current member's portrait from the old site into the local image folder,
# keeping the relative path given in the page's <img src>.
for person in people:
    portrait_src = person['img']
    urllib.request.urlretrieve('https://weissmanlab.ucsf.edu/people/'+portrait_src, 'assets/img/people/'+portrait_src)

In [7]:
# Copy each alum's portrait from the old site and store only the bare file name.
for person in alums:
    if not person['img']:
        # The scrape records None when an alum has no <img>; concatenating None into
        # the URL would raise TypeError. Store '' instead, the value rename_img
        # already treats as "no image".
        person['img'] = ''
        continue

    portrait_filename = person['img'].split('/')[-1]
    urllib.request.urlretrieve('https://weissmanlab.ucsf.edu/people/'+person['img'], 'assets/img/people/'+portrait_filename)

    person['img'] = portrait_filename

In [8]:
# One roster: current members followed by alumni. The id is first name + last name, no separator.
people = [*people, *alums]
for member in people:
    member['id'] = member['first_name']+member['last_name']

In [9]:
def rename_img(p):
    """Point p['img'] at 'assets/img/people/<id>.<ext>', renaming the file on disk if present.

    Entries whose image is '' or 'weibezahn.jpg' get an empty image path and
    nothing on disk is touched. Otherwise the extension is taken from the
    current file name, the downloaded file (if it exists) is renamed after
    p['id'], and p['img'] is set to the new path whether or not a file was renamed.
    Mutates `p` in place and returns None.
    """
    if p['img'] in ('weibezahn.jpg', ''):
        p['img'] = ''
        return

    extension = p['img'].split('.')[-1]
    change_from = 'assets/img/people/' + p['img']
    change_to = 'assets/img/people/' + p['id'] + '.' + extension
    if os.path.exists(change_from):
        os.rename(change_from, change_to)
    p['img'] = change_to

# Rename every downloaded portrait after its owner's id (rename_img mutates each dict in place).
for p in people:
    rename_img(p)

In [13]:
# Snapshot the scraped roster so the survey steps can start from a file instead of re-scraping.
old_site_people = pd.DataFrame(people)
old_site_people.to_csv('old_site_people.csv', index=False)

### Survey

In [15]:
# Reload the roster exported from the old site. From here on `people` is a DataFrame,
# not the list of dicts built in the "Previous Site" section.
people = pd.read_csv('old_site_people.csv')

In [29]:
# Survey answers live in a Google Sheet; the '/export?format=csv' suffix is appended so it can be read as CSV.
survey_sheet_url = 'https://docs.google.com/spreadsheets/d/1uxl0DQpqHvNdgI92UVx1Vyt4TQhkgt74cPp1dEn3Gjw'
survey_responses = pd.read_csv(survey_sheet_url+'/export?format=csv')

In [30]:
# Rename the form's question headers to the roster's column names and keep only those columns.
survey_column_names = {
    'Name': 'name',
    'Role': 'pos',
    'Email': 'email',
    'Portrait / head shot': 'img',
}
survey_responses = survey_responses.rename(columns=survey_column_names)
survey_responses = survey_responses[list(survey_column_names.values())]

In [32]:
# Derive first/last name (first and last space-separated token) and the id built from them.
survey_name_tokens = survey_responses['name'].str.split(' ')
survey_responses['first_name'] = survey_name_tokens.str[0]
survey_responses['last_name'] = survey_name_tokens.str[-1]
survey_responses['id'] = survey_responses['first_name']+survey_responses['last_name']
# Every survey row is flagged as not an alum.
survey_responses['alum'] = False

In [46]:
# Download each respondent's portrait from Google Drive into the site's people folder.
# The loop variable is `person_id`, not `id`, so the builtin id() is not shadowed in
# the kernel namespace for the cells that follow.
for person_id, url in survey_responses.set_index('id')['img'].items():
    print(person_id, url)
    # Rows without an uploaded portrait hold NaN (a float) here, not a URL; skip them.
    if isinstance(url, str):
        # NOTE(review): the file is always saved with a .jpg suffix, whatever its real format.
        change_to = 'assets/img/people/'+person_id+'.jpg'
        # The form stores '...open?id=<file id>'; gdown wants the 'uc?id=<file id>' form.
        drive_file_id = url.split('id=')[-1]
        gdown.download("https://drive.google.com/uc?id={}".format(drive_file_id), change_to, quiet=False)

RichardShe https://drive.google.com/open?id=1pAqQi52Pr1B0xIEtMr3XaAas1hVMW08w


Downloading...
From: https://drive.google.com/uc?id=1pAqQi52Pr1B0xIEtMr3XaAas1hVMW08w
To: /Users/alex/Documents/weissmanlab/assets/img/people/RichardShe.jpg
100%|██████████| 588k/588k [00:00<00:00, 15.7MB/s]


LakshmiMiller-Vedam https://drive.google.com/open?id=1pAkanvnqJmd1D9i7Onl_6_lWLmg5nugY


Downloading...
From: https://drive.google.com/uc?id=1pAkanvnqJmd1D9i7Onl_6_lWLmg5nugY
To: /Users/alex/Documents/weissmanlab/assets/img/people/LakshmiMiller-Vedam.jpg
100%|██████████| 95.3k/95.3k [00:00<00:00, 3.84MB/s]


LukeKoblan https://drive.google.com/open?id=1IsF8g-d_w7A4XTUmI_2t21q2jGv0EKTf


Downloading...
From: https://drive.google.com/uc?id=1IsF8g-d_w7A4XTUmI_2t21q2jGv0EKTf
To: /Users/alex/Documents/weissmanlab/assets/img/people/LukeKoblan.jpg
100%|██████████| 2.34M/2.34M [00:00<00:00, 15.1MB/s]


ZebulonLevine https://drive.google.com/open?id=12ColnJvOtz18CTVoc9p1HeKm4R3hjHV7


Downloading...
From: https://drive.google.com/uc?id=12ColnJvOtz18CTVoc9p1HeKm4R3hjHV7
To: /Users/alex/Documents/weissmanlab/assets/img/people/ZebulonLevine.jpg
100%|██████████| 11.4k/11.4k [00:00<00:00, 25.4MB/s]


JosephReplogle https://drive.google.com/open?id=1ycj11mfsZUJiOY9RFpyb7svJ9n9vCGSm


Downloading...
From: https://drive.google.com/uc?id=1ycj11mfsZUJiOY9RFpyb7svJ9n9vCGSm
To: /Users/alex/Documents/weissmanlab/assets/img/people/JosephReplogle.jpg
100%|██████████| 968k/968k [00:00<00:00, 7.15MB/s]


XiaojieQiu https://drive.google.com/open?id=14JxH-Gx2PkGrvIdslelK6CgbahXjb7L7


Downloading...
From: https://drive.google.com/uc?id=14JxH-Gx2PkGrvIdslelK6CgbahXjb7L7
To: /Users/alex/Documents/weissmanlab/assets/img/people/XiaojieQiu.jpg
100%|██████████| 16.5k/16.5k [00:00<00:00, 16.5MB/s]


ReubenSaunders https://drive.google.com/open?id=10yqkD80mUAibxDNiJWbtOaT9azrITN72


Downloading...
From: https://drive.google.com/uc?id=10yqkD80mUAibxDNiJWbtOaT9azrITN72
To: /Users/alex/Documents/weissmanlab/assets/img/people/ReubenSaunders.jpg
100%|██████████| 398k/398k [00:00<00:00, 6.58MB/s]


MattJones https://drive.google.com/open?id=1rl68JZd9UYCybIeylNaRMxtVB4ktF3HJ


Downloading...
From: https://drive.google.com/uc?id=1rl68JZd9UYCybIeylNaRMxtVB4ktF3HJ
To: /Users/alex/Documents/weissmanlab/assets/img/people/MattJones.jpg
100%|██████████| 561k/561k [00:00<00:00, 8.76MB/s]


SarahSchumacher https://drive.google.com/open?id=1WJ5Kl7qaNTYB1kAHIrqmdt9fOOtlsKF3


Downloading...
From: https://drive.google.com/uc?id=1WJ5Kl7qaNTYB1kAHIrqmdt9fOOtlsKF3
To: /Users/alex/Documents/weissmanlab/assets/img/people/SarahSchumacher.jpg
100%|██████████| 117k/117k [00:00<00:00, 4.44MB/s]


TessaBertozzi https://drive.google.com/open?id=1TQJiPteaOk8HthDPapEFRKNCALgAHSZS


Downloading...
From: https://drive.google.com/uc?id=1TQJiPteaOk8HthDPapEFRKNCALgAHSZS
To: /Users/alex/Documents/weissmanlab/assets/img/people/TessaBertozzi.jpg
100%|██████████| 2.70M/2.70M [00:00<00:00, 14.7MB/s]


AnneOdera https://drive.google.com/open?id=1BPMTR0JQd6V5aXAZXH9SZZyEUm54-6_d


Downloading...
From: https://drive.google.com/uc?id=1BPMTR0JQd6V5aXAZXH9SZZyEUm54-6_d
To: /Users/alex/Documents/weissmanlab/assets/img/people/AnneOdera.jpg
100%|██████████| 274k/274k [00:00<00:00, 7.57MB/s]


AlexLeNail https://drive.google.com/open?id=1V8zAqzjdeIdUFCUeWB1lfqZI7ZRyx0la


Downloading...
From: https://drive.google.com/uc?id=1V8zAqzjdeIdUFCUeWB1lfqZI7ZRyx0la
To: /Users/alex/Documents/weissmanlab/assets/img/people/AlexLeNail.jpg
100%|██████████| 857k/857k [00:00<00:00, 10.9MB/s]


YiChen nan
AtharvOak https://drive.google.com/open?id=1AW-fszVxhPoQbhk56kCzeOxANBBjE7M0


Downloading...
From: https://drive.google.com/uc?id=1AW-fszVxhPoQbhk56kCzeOxANBBjE7M0
To: /Users/alex/Documents/weissmanlab/assets/img/people/AtharvOak.jpg
100%|██████████| 626k/626k [00:00<00:00, 8.41MB/s]


CristenMuresan https://drive.google.com/open?id=14lJw4f2eAHCqmJ3z6O_zqd8CxpfBG9am


Downloading...
From: https://drive.google.com/uc?id=14lJw4f2eAHCqmJ3z6O_zqd8CxpfBG9am
To: /Users/alex/Documents/weissmanlab/assets/img/people/CristenMuresan.jpg
100%|██████████| 1.12M/1.12M [00:00<00:00, 9.71MB/s]


GayathriMuthukumar https://drive.google.com/open?id=1acceshE6cnXyvun9ehg_5EQhADp-gn26


Downloading...
From: https://drive.google.com/uc?id=1acceshE6cnXyvun9ehg_5EQhADp-gn26
To: /Users/alex/Documents/weissmanlab/assets/img/people/GayathriMuthukumar.jpg
100%|██████████| 5.27M/5.27M [00:00<00:00, 17.1MB/s]


KaterinaPopova https://drive.google.com/open?id=1pUpF7EhZn8E7RWAQJMKqZkDf5Z65PT9s


Downloading...
From: https://drive.google.com/uc?id=1pUpF7EhZn8E7RWAQJMKqZkDf5Z65PT9s
To: /Users/alex/Documents/weissmanlab/assets/img/people/KaterinaPopova.jpg
100%|██████████| 120k/120k [00:00<00:00, 4.43MB/s]


YuanchengLu https://drive.google.com/open?id=1uH4d9gunYYbGys_pbbOdjgG-ZFjv8Zyx


Downloading...
From: https://drive.google.com/uc?id=1uH4d9gunYYbGys_pbbOdjgG-ZFjv8Zyx
To: /Users/alex/Documents/weissmanlab/assets/img/people/YuanchengLu.jpg
100%|██████████| 266k/266k [00:00<00:00, 6.95MB/s]


KatieYost https://drive.google.com/open?id=15BWQLjHXdd5f-6h_klvKl87e1t4Kx0gU


Downloading...
From: https://drive.google.com/uc?id=15BWQLjHXdd5f-6h_klvKl87e1t4Kx0gU
To: /Users/alex/Documents/weissmanlab/assets/img/people/KatieYost.jpg
100%|██████████| 742k/742k [00:00<00:00, 10.5MB/s]

JeffHussmann nan





In [57]:
# Ids that appear both on the old site and in the survey.
set(people['id']).intersection(survey_responses['id'])

{'JeffHussmann',
 'JosephReplogle',
 'KaterinaPopova',
 'LakshmiMiller-Vedam',
 'MattJones',
 'ReubenSaunders',
 'RichardShe',
 'XiaojieQiu'}

In [58]:
# Survey rows are stacked first, so keep='first' retains the survey entry whenever an id is in both sources.
survey_then_old_site = pd.concat((survey_responses, people))
people = survey_then_old_site.drop_duplicates(subset=['id'], keep='first')

In [61]:
# The image column is dropped from the merged roster.
people = people.drop(columns=['img'])

In [63]:
# Save the merged (survey + old site) roster without the DataFrame index.
people.to_csv('old_site_and_survey_people.csv', index=False)

In [64]:
# Copy the merged roster to the system clipboard.
# NOTE(review): presumably for pasting into the people spreadsheet; this needs a
# clipboard backend and may fail on a headless machine — confirm before Run All.
pd.DataFrame(people).to_clipboard()

### Trim down lab photos to max size

In [4]:
# Shrink every file in the people image folder in place so it fits within 400x400.
# `os` comes from the notebook's import cell.
for img_path in os.listdir('assets/img/people/'):
    try:
        # The context manager closes the underlying file handle once the image is saved.
        with Image.open('assets/img/people/'+img_path) as image:
            image.thumbnail((400, 400))
            image.save('assets/img/people/'+img_path)
    except Exception as exc:
        # Best effort: report files that cannot be read or re-saved (with the reason)
        # and carry on. Unlike a bare `except:`, this does not swallow KeyboardInterrupt.
        print(img_path, exc)

### Update People from spreadsheet

In [15]:
# id must be both the publication first/last, and the name of the image. Name is just the display name.

In [5]:
# The people roster is maintained in a Google Sheet and read through its CSV export.
url = 'https://docs.google.com/spreadsheets/d/1-Eju9h1XovEBoBv0DGpxh92GYsZ8bzGkxRdRgUb7hvg'
people = pd.read_csv(url+'/export?format=csv')
# Trim surrounding whitespace in every object-dtype column.
text_columns = people.select_dtypes(['object']).columns
people[text_columns] = people[text_columns].apply(lambda column: column.str.strip())

In [6]:
# Load the roster currently on the site and flatten it into a DataFrame.
with open('_data/people.yml', 'r') as people_file:
    previous_people_records = yaml.safe_load(people_file)
old_people_data = pd.json_normalize(previous_people_records)

#### New IDs:

In [7]:
# Spreadsheet rows whose id is not yet in the site's people.yml.
id_already_on_site = people['id'].isin(old_people_data['id'])
people[~id_already_on_site]


Unnamed: 0,name,pos,email,id,alum,now
12,Josh Abraham,Post-doc,kabraham@wi.mit.edu,JoshAbraham,False,


#### Updates to existing IDs:

In [9]:
# Tag every row with its origin so both versions of a shared id can be told apart.
people['update'] = 'current'
old_people_data['update'] = 'previous'

# Rows present in both sources; keep=False then removes every pair that agrees on
# all compared fields, leaving only the entries that changed.
shared_current = people[people.id.isin(old_people_data.id)]
shared_previous = old_people_data[old_people_data.id.isin(people.id)]
(
    pd.concat([shared_current, shared_previous])
    .drop_duplicates(subset=['name', 'pos', 'email', 'alum', 'now'], keep=False)
    .sort_values(by='id')
)


Unnamed: 0,name,pos,email,id,alum,now,update


#### Removal of IDs:

In [10]:
# Site entries whose id is no longer in the spreadsheet.
id_still_in_sheet = old_people_data['id'].isin(people['id'])
old_people_data[~id_still_in_sheet]

Unnamed: 0,name,pos,email,id,alum,now,update


#### Write updated sheet to yaml

In [11]:
# Remove the helper column, then build one dict per person without the null-valued fields.
people = people.drop(columns=['update'])
people_records = []
for row in people.to_dict(orient='records'):
    people_records.append({field: value for field, value in row.items() if pd.notnull(value)})


In [12]:
class MyDumper(yaml.SafeDumper):
    """SafeDumper variant that writes an extra line break at the outermost indent level."""
    # HACK: insert blank lines between top-level objects
    # inspired by https://stackoverflow.com/a/44284819/3786245
    def write_line_break(self, data=None):
        """Write the normal line break, then one more when the indent stack has length 1."""
        super().write_line_break(data)

        # `self.indents` is the emitter's indent stack; a length of 1 is taken here
        # to mean the emitter is back at the top-level sequence.
        if len(self.indents) == 1:
            super().write_line_break()

In [13]:
# Serialise first: if yaml.dump raises, people.yml has not been opened for writing
# yet and therefore is not left truncated.
people_yaml = yaml.dump(people_records, Dumper=MyDumper, default_flow_style=False, sort_keys=False)
# The context manager guarantees the handle is flushed and closed.
with open('_data/people.yml', 'w') as people_file:
    people_chars_written = people_file.write(people_yaml)
# Displayed as the cell output: number of characters written.
people_chars_written


19112