# Pandas Merge and Apply

In [60]:
import pandas as pd

In [61]:
# Two small tables keyed by DOI. Their DOI sets overlap only partially:
# "10.1038/s41598-019-52881-4" appears only in the journal table and
# "10.21105/joss.01006" only in the author table, so the join types used
# further down produce visibly different results.
journal_rows = [
    ("10.1371/journal.pcbi.1004668", "PLoS Comput Biol"),
    ("10.21105/joss.01035", "JOSS"),
    ("10.12688/f1000research.18866.2", "F1000Res"),
    ("10.1038/s41598-019-52881-4", "Sci Rep"),
    ("10.1186/s12859-019-3171-0", "BMC Bioinformatics"),
]
doi_df_journal = pd.DataFrame(journal_rows, columns=["DOI", "journal"])

first_author_rows = [
    ("10.1371/journal.pcbi.1004668", "Blischak"),
    ("10.21105/joss.01035", "Sparks"),
    ("10.21105/joss.01006", "Granger"),
    ("10.12688/f1000research.18866.2", "Thang"),
    ("10.1186/s12859-019-3171-0", "Chen"),
]
doi_df_first_author = pd.DataFrame(
    first_author_rows, columns=["DOI", "first_author"])

In [62]:
doi_df_first_author

Unnamed: 0,DOI,first_author
0,10.1371/journal.pcbi.1004668,Blischak
1,10.21105/joss.01035,Sparks
2,10.21105/joss.01006,Granger
3,10.12688/f1000research.18866.2,Thang
4,10.1186/s12859-019-3171-0,Chen


In [63]:
doi_df_journal

Unnamed: 0,DOI,journal
0,10.1371/journal.pcbi.1004668,PLoS Comput Biol
1,10.21105/joss.01035,JOSS
2,10.12688/f1000research.18866.2,F1000Res
3,10.1038/s41598-019-52881-4,Sci Rep
4,10.1186/s12859-019-3171-0,BMC Bioinformatics


In [64]:
# Inner join (the default when `how` is not given):
# keep only the rows whose DOI appears in BOTH DataFrames.

doi_df_first_author.merge(doi_df_journal, on='DOI')

Unnamed: 0,DOI,first_author,journal
0,10.1371/journal.pcbi.1004668,Blischak,PLoS Comput Biol
1,10.21105/joss.01035,Sparks,JOSS
2,10.12688/f1000research.18866.2,Thang,F1000Res
3,10.1186/s12859-019-3171-0,Chen,BMC Bioinformatics


In [65]:
# Left join (how='left')
# Every row of the LEFT DF (doi_df_first_author) is kept; DOIs that have no
# match in the right DF (doi_df_journal) get NaN in the right DF's columns.

doi_df_first_author.merge(doi_df_journal, on='DOI', how='left') 

Unnamed: 0,DOI,first_author,journal
0,10.1371/journal.pcbi.1004668,Blischak,PLoS Comput Biol
1,10.21105/joss.01035,Sparks,JOSS
2,10.21105/joss.01006,Granger,
3,10.12688/f1000research.18866.2,Thang,F1000Res
4,10.1186/s12859-019-3171-0,Chen,BMC Bioinformatics


In [66]:
doi_df_journal

Unnamed: 0,DOI,journal
0,10.1371/journal.pcbi.1004668,PLoS Comput Biol
1,10.21105/joss.01035,JOSS
2,10.12688/f1000research.18866.2,F1000Res
3,10.1038/s41598-019-52881-4,Sci Rep
4,10.1186/s12859-019-3171-0,BMC Bioinformatics


In [67]:
# Right join (how='right')
# Every row of the RIGHT DF (doi_df_journal) is kept; DOIs that have no
# match in the left DF (doi_df_first_author) get NaN in the left DF's columns.

doi_df_first_author.merge(doi_df_journal, on='DOI', how='right') 

Unnamed: 0,DOI,first_author,journal
0,10.1371/journal.pcbi.1004668,Blischak,PLoS Comput Biol
1,10.21105/joss.01035,Sparks,JOSS
2,10.12688/f1000research.18866.2,Thang,F1000Res
3,10.1038/s41598-019-52881-4,,Sci Rep
4,10.1186/s12859-019-3171-0,Chen,BMC Bioinformatics


In [68]:
# Outer join (how='outer')
# Union of the DOIs from both DFs; whichever side has no match is NaN.

doi_df_first_author.merge(doi_df_journal, on='DOI', how='outer') 

Unnamed: 0,DOI,first_author,journal
0,10.1371/journal.pcbi.1004668,Blischak,PLoS Comput Biol
1,10.21105/joss.01035,Sparks,JOSS
2,10.21105/joss.01006,Granger,
3,10.12688/f1000research.18866.2,Thang,F1000Res
4,10.1186/s12859-019-3171-0,Chen,BMC Bioinformatics
5,10.1038/s41598-019-52881-4,,Sci Rep


## Merge with differently named key columns

In [69]:
# NOTE: this cell rebinds doi_df_journal and doi_df_first_author from the
# previous section. The journal table is unchanged; the author table now
# calls its key column "article_doi" instead of "DOI".
journal_rows = [
    ("10.1371/journal.pcbi.1004668", "PLoS Comput Biol"),
    ("10.21105/joss.01035", "JOSS"),
    ("10.12688/f1000research.18866.2", "F1000Res"),
    ("10.1038/s41598-019-52881-4", "Sci Rep"),
    ("10.1186/s12859-019-3171-0", "BMC Bioinformatics"),
]
doi_df_journal = pd.DataFrame(journal_rows, columns=["DOI", "journal"])

first_author_rows = [
    ("10.1371/journal.pcbi.1004668", "Blischak"),
    ("10.21105/joss.01035", "Sparks"),
    ("10.21105/joss.01006", "Granger"),
    ("10.12688/f1000research.18866.2", "Thang"),
    ("10.1186/s12859-019-3171-0", "Chen"),
]
doi_df_first_author = pd.DataFrame(
    first_author_rows, columns=["article_doi", "first_author"])

In [70]:
doi_df_first_author

Unnamed: 0,article_doi,first_author
0,10.1371/journal.pcbi.1004668,Blischak
1,10.21105/joss.01035,Sparks
2,10.21105/joss.01006,Granger
3,10.12688/f1000research.18866.2,Thang
4,10.1186/s12859-019-3171-0,Chen


In [71]:
doi_df_journal

Unnamed: 0,DOI,journal
0,10.1371/journal.pcbi.1004668,PLoS Comput Biol
1,10.21105/joss.01035,JOSS
2,10.12688/f1000research.18866.2,F1000Res
3,10.1038/s41598-019-52881-4,Sci Rep
4,10.1186/s12859-019-3171-0,BMC Bioinformatics


In [72]:
# The key column has a different name in each DF, so name it per side:
# left_on  = key column in the left DF  (doi_df_first_author)
# right_on = key column in the right DF (doi_df_journal)
# Both key columns (article_doi and DOI) are kept in the result.

doi_df_first_author.merge(doi_df_journal, left_on='article_doi', right_on='DOI', how='inner') 

Unnamed: 0,article_doi,first_author,DOI,journal
0,10.1371/journal.pcbi.1004668,Blischak,10.1371/journal.pcbi.1004668,PLoS Comput Biol
1,10.21105/joss.01035,Sparks,10.21105/joss.01035,JOSS
2,10.12688/f1000research.18866.2,Thang,10.12688/f1000research.18866.2,F1000Res
3,10.1186/s12859-019-3171-0,Chen,10.1186/s12859-019-3171-0,BMC Bioinformatics


## Apply

In [73]:
# Author strings are written "<last name> <given name(s)>"; "Thang" has no
# given name, which exercises the single-token case in the cells below.
author_rows = [
    ("10.1371/journal.pcbi.1004668", "Blischak J."),
    ("10.21105/joss.01035", "Sparks Adam H."),
    ("10.21105/joss.01006", "Granger Nicolas"),
    ("10.12688/f1000research.18866.2", "Thang"),
    ("10.1186/s12859-019-3171-0", "Chen Danze"),
]
doi_df_first_author_2 = pd.DataFrame(
    author_rows, columns=["DOIs", "first_author"])

In [74]:
doi_df_first_author_2

Unnamed: 0,DOIs,first_author
0,10.1371/journal.pcbi.1004668,Blischak J.
1,10.21105/joss.01035,Sparks Adam H.
2,10.21105/joss.01006,Granger Nicolas
3,10.12688/f1000research.18866.2,Thang
4,10.1186/s12859-019-3171-0,Chen Danze


In [75]:
# str.split() without arguments splits on whitespace; [0] picks the first token
'Rosalind Franklin Franklin'.split()[0]

'Rosalind'

In [76]:
# Preview only: apply() calls the lambda once per value and returns a new
# Series; nothing is assigned, so the DataFrame itself is unchanged.
doi_df_first_author_2['first_author'].apply(
    lambda author: author.split()[0])

0    Blischak
1      Sparks
2     Granger
3       Thang
4        Chen
Name: first_author, dtype: object

In [77]:
# Store the first whitespace-separated token of each author string in a new
# 'last_name' column (the author strings here start with the last name).
doi_df_first_author_2['last_name'] = doi_df_first_author_2['first_author'].apply(
    lambda author: author.split()[0])

In [78]:
doi_df_first_author_2

Unnamed: 0,DOIs,first_author,last_name
0,10.1371/journal.pcbi.1004668,Blischak J.,Blischak
1,10.21105/joss.01035,Sparks Adam H.,Sparks
2,10.21105/joss.01006,Granger Nicolas,Granger
3,10.12688/f1000research.18866.2,Thang,Thang
4,10.1186/s12859-019-3171-0,Chen Danze,Chen


In [79]:
# Preview only: everything after the first token, re-joined with single
# spaces. A name with a single token (e.g. "Thang") yields an empty string.
doi_df_first_author_2['first_author'].apply(
    lambda author: ' '.join(author.split()[1:]))

0         J.
1    Adam H.
2    Nicolas
3           
4      Danze
Name: first_author, dtype: object

In [80]:
# [1:] keeps every token except the first one
'Sparks Adam H.'.split()[1:]

['Adam', 'H.']

In [81]:
# str.join() glues the list items together, using the string it is called on as separator
'---'.join(['Adam', 'H.', 'blub', 'bla'])

'Adam---H.---blub---bla'

In [82]:
# Store everything after the first token in a new 'first_name' column;
# rows whose author string has only one token get an empty string.
doi_df_first_author_2['first_name'] = doi_df_first_author_2['first_author'].apply(
    lambda author: ' '.join(author.split()[1:]))

In [83]:
doi_df_first_author_2

Unnamed: 0,DOIs,first_author,last_name,first_name
0,10.1371/journal.pcbi.1004668,Blischak J.,Blischak,J.
1,10.21105/joss.01035,Sparks Adam H.,Sparks,Adam H.
2,10.21105/joss.01006,Granger Nicolas,Granger,Nicolas
3,10.12688/f1000research.18866.2,Thang,Thang,
4,10.1186/s12859-019-3171-0,Chen Danze,Chen,Danze


In [84]:
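# Option 1 for removing the original column: drop() returns a new DataFrame.
# Left commented out because later cells still read 'first_author'.
# doi_df_first_author_2 = doi_df_first_author_2.drop(columns='first_author')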

In [85]:
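# Option 2 for removing the original column: del removes it in place.
# Left commented out because later cells still read 'first_author'.
# del doi_df_first_author_2['first_author']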

In [86]:
doi_df_first_author_2

Unnamed: 0,DOIs,first_author,last_name,first_name
0,10.1371/journal.pcbi.1004668,Blischak J.,Blischak,J.
1,10.21105/joss.01035,Sparks Adam H.,Sparks,Adam H.
2,10.21105/joss.01006,Granger Nicolas,Granger,Nicolas
3,10.12688/f1000research.18866.2,Thang,Thang,
4,10.1186/s12859-019-3171-0,Chen Danze,Chen,Danze


In [87]:
# Write the table to a CSV file; the relative path resolves against the
# current working directory.
# NOTE: with the default index=True the row index is written as an extra,
# unnamed first column; pass index=False if that column is not wanted.
doi_df_first_author_2.to_csv('doi_first_last_name.csv')

## Named function vs. lambda

In [88]:
# named function
def clean_string(author):
    """Return ``author`` without its first whitespace-separated token.

    The remaining tokens are re-joined with single spaces. A string with
    fewer than two tokens yields an empty string.
    """
    tokens = author.split()
    remaining_tokens = tokens[1:]
    return ' '.join(remaining_tokens)

# apply() accepts any callable: here the named function itself is passed
# (no parentheses), and apply() calls it once per value.
doi_df_first_author_2['func_result'] = doi_df_first_author_2['first_author'].apply(clean_string)


# The same logic written inline as an anonymous function (lambda);
# it produces the same values as the 'func_result' column.
doi_df_first_author_2['lambda_result'] = doi_df_first_author_2['first_author'].apply(
    lambda author: ' '.join(author.split()[1:]))

In [89]:
doi_df_first_author_2

Unnamed: 0,DOIs,first_author,last_name,first_name,func_result,lambda_result
0,10.1371/journal.pcbi.1004668,Blischak J.,Blischak,J.,J.,J.
1,10.21105/joss.01035,Sparks Adam H.,Sparks,Adam H.,Adam H.,Adam H.
2,10.21105/joss.01006,Granger Nicolas,Granger,Nicolas,Nicolas,Nicolas
3,10.12688/f1000research.18866.2,Thang,Thang,,,
4,10.1186/s12859-019-3171-0,Chen Danze,Chen,Danze,Danze,Danze
