In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, StringType, StructType, StructField
import re

In [0]:
df = spark.read.option("multiline", True).json('dbfs:/user/dblpv13/dblpv13.0.json.gz')

In [0]:
# finds the country names in a list of strings
# modified to only use the first element of the list
# uses regex to remove punctuation from the string and to match the given names of the countries
# can also match the name/abbreviation of the state in the US, in this case it will return the country name 'United States'
# can match the abbreviations 'UK' and 'USA'

def getCountry(s):
    if s is None:
        return None
    arr = []
    countries = ["Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua & Deps", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina", "Burundi", "Cambodia", "Cameroon", "Canada", "Cape Verde", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo", "Congo Democratic Republic", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czech Republic", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "East Timor", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Ethiopia", "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-bissau", "Guyana", "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Ivory Coast", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "South Korea", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg", "Macedonia", "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar", "Burma", "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "Norway", "Romania", "Pakistan", "Palau", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Oman", "Russia", "Rwanda", "St Kitts & Nevis", "St Lucia", "Saint Vincent & The Grenadines", "Samoa", "San Marino", "Sao Tome & Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Swaziland", "Sweden", "Switzerland", "Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Togo", "Tonga", "Trinidad & Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe"]
    state_names = ["alaska", "alabama", "arkansas", "american samoa", "arizona", "california", "colorado", "connecticut", "district ", "of columbia", "delaware", "florida", "georgia", "guam", "hawaii", "iowa", "idaho", "illinois", "indiana", "kansas", "kentucky", "louisiana", "massachusetts", "maryland", "maine", "michigan", "minnesota", "missouri", "mississippi", "montana", "north carolina", "north dakota", "nebraska", "new hampshire", "new jersey", "new mexico", "nevada", "new york", "ohio", "oklahoma", "oregon", "pennsylvania", "puerto rico", "rhode island", "south carolina", "south dakota", "tennessee", "texas", "utah", "virginia", "virgin islands", "vermont", "washington", "wisconsin", "west virginia", "wyoming"]
    states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']
    
    for i in s:
        if i["org"] is None:
            arr.append(None)
            break
        sent = re.sub("[^a-zA-Z -]", "", i["org"])
        x = None
        for j in countries:
            x = re.search(j.lower(), sent.lower())
            if x is not None:
                if j.lower() == 'india':
                    x = re.search('indiana', sent.lower())
                    if x is not None:
                        arr.append("United States")
                elif j.lower() == 'georgia':
                    x = re.search('USA', sent)
                    if x is not None:
                        arr.append("United States")
                else:
                    arr.append(j)
                break
        if x is None:
            x = re.search("USA", sent)
            if x is not None:
                arr.append("United States")
                break
        if x is None:
            x = re.search("UK", sent)
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            x = re.search("england", sent.lower())
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            x = re.search("scotland", sent.lower())
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            x = re.search("wales", sent.lower())
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            for j in states:
                x = re.search(j, sent)
                if x is not None:
                    arr.append("United States")
                    break
        if x is None:
            for j in state_names:
                x = re.search(j, sent.lower())
                if x is not None:
                    arr.append("United States")
                    break
        break
                    
    if len(arr) > 0:
        return arr[0]
    else:
        return None

getCountryUDF = udf(getCountry)

In [0]:
# Organization (affiliation of the first author)
# ID - authors.orgid
# Name - authors.org
# Country - getCountryUDF(F.arrays_zip("authors.org"))
def organization(df):
    new_df = df.select(F.col("authors.orgid").getItem(0).alias("org_id"),
                       F.col("authors.org").getItem(0).alias("org_name"),
                       (getCountryUDF(F.arrays_zip("authors.org"))).alias("org_country"))
    new_df = new_df.na.drop("all")
    return new_df

org_df = organization(df)

In [0]:
new_df = org_df.select('org_id', 'org_name', 'org_country').distinct()
new_df = new_df.withColumn('ID', F.monotonically_increasing_id())

#display(new_df.limit(10))

In [0]:
df2 = df.withColumn("org_id", F.col("authors.orgid").getItem(0)).withColumn("org_name", F.col("authors.org").getItem(0))
#display(df2.limit(10))

In [0]:
final_df = df2.join(new_df.select('ID', 'org_id', 'org_name'), on=['org_id', 'org_name'])
final_df = final_df.drop('org_id', 'org_name')

display(final_df.limit(10))

_id,abstract,authors,doi,fos,isbn,issn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,venue,volume,year,ID
53e99785b7602d9701f40583,"Summary: We implemented a software tool called GENESIS for three different genome rearrangement problems: Sorting a unichromosomal genome by weighted reversals and transpositions (SwRT), sorting a multichromosomal genome by reversals, translocations, fusions and fissions (SRTl), and sorting a multichromosomal genome by weighted reversals, translocations, fusions, fissions and transpositions (SwRTTl). Availability: Source code can be obtained by the authors, or use the web interface http://www.uni-ulm.de/in/theo/research/genesis.html Contact: simon.gog@uni-ulm.de","List(List(53f43018dabfaeb22f42a3cf, null, null, null, Simon Gog, null, null, null, null, Univ Ulm, Inst Theoret Comp Sci, D-89069 Ulm, Germany, null, 5f71b2aa1c455f439fe3d5f8, List(Univ Ulm, Inst Theoret Comp Sci, D-89069 Ulm, Germany), null, null), List(53f42cfadabfaedf43510fe2, null, null, null, Martin Bader, null, null, null, null, Univ Ulm, Inst Theoret Comp Sci, D-89069 Ulm, Germany, null, 5f71b2aa1c455f439fe3d5f8, List(Univ Ulm, Inst Theoret Comp Sci, D-89069 Ulm, Germany), null, null), List(53f42c6edabfaedd74d2898b, null, null, null, Enno Ohlebusch, null, null, null, null, Univ Ulm, Inst Theoret Comp Sci, D-89069 Ulm, Germany, null, 5f71b2aa1c455f439fe3d5f8, List(Univ Ulm, Inst Theoret Comp Sci, D-89069 Ulm, Germany), null, null))",10.1093/bioinformatics/btn026,,,Handbook of Hope,5.0,"List(software tool, unichromosomal genome, web interface, source code, different genome rearrangement problem, multichromosomal genome, html Contact, weighted reversal)",en,,712,711,,,GENESIS,"List(http://doi.ieeecomputersociety.org/10.1109/SERVICES-I.2009.108, http://www.webofknowledge.com/)","List(53a7284920f7420be8baedee, null, null, null, null, null, null, SERVICES I, null, null, null, null, null)",24.0,2008,42949674833
53e99785b7602d9701f440df,,"List(List(53f46a0adabfaec09f24e389, null, Petra.Stoerig@uni-duesseldorf.de, 5b869f98e1cd8e14a3b1e5b9, Petra Stoerig, null, null, null, null, Institute of Experimental Psychology II, Heinrich-Heine-University, Duesseldorf 40225, Germany, null, 5f71b2ce1c455f439fe3e62a, null, null, null), List(53f4682bdabfaefedbb8fa5e, null, Alan.Cowey@psy.ox.ac.uk, 5b86b6ede1cd8e14a34deaa0, Alan Cowey, null, null, null, null, Department of Experimental Psychology, University of Oxford, Oxford OX1 3UD, UK, null, 5f71b2841c455f439fe3c6ab, null, null, null))",10.1016/j.cub.2007.07.016,,,0960-9822,19.0,List(),en,0.0,4,R822,,,Blindsight.,"List(http://www.scholarpedia.org/article/Blindsight, http://www.ncbi.nlm.nih.gov/pubmed/17925204?report=xml&format=text)","List(555036c97cea80f954155dd5, null, null, null, null, null, null, Scholarpedia, null, null, null, null, 0)",17.0,2007,42949676253
53e99792b7602d9701f58126,"E-government has emerged not merely as a specialization in public administration, but as a transformative force affecting all levels and functions of government. This volume, written by a mix of practitioners and researchers, provides an overview of the management challenges and issues involved in seeking a new form of governance -- digital government. An initial introductory section presents the hopes for e-government and outlines its history in the United States and globally. Section II, ""The New Face of Government,"" examines FirstGov, the premiere example of e-government and surveys its political implications. Section III, ""Issues in Digital Governance,"" discusses such management challenges as privacy rights, e-procurement, e-commerce, and ethics in e-government. Section IV, ""Preparing for Digital Government,"" discusses data warehousing and related prerequisites for e-government, including the education and training of the public service. Finally, Section V, ""The Future of E-Government,"" discusses the digital divide, citizen participation, and factors that will determine the eventual success or failure of the e-government model.","List(List(null, null, null, 5b869d53e1cd8e14a3a1ddb6, P L Giorgi, null, null, null, null, Dipartimento di Matematica e Applicazioni, Universita degli Studi di Napoli Complesso Monte S., Angelo Via Cintia, 80126 Naples, Italy, null, 5f71b2811c455f439fe3c5cb, null, null, null), List(null, null, null, 5b86b607e1cd8e14a347658e, N Oggiano, null, null, null, null, Dipartimento di Matematica Seconda, Universita' degli Studi di Napoli, Via Vivaldi, 43 81100 Caserta, Italy, null, 5f71b6951c455f439fe5904b, null, null, null), List(56017c7b45cedb3395e628b1, null, null, 5b86b607e1cd8e14a347658e, A Kantar, null, null, null, null, Dipartimento di Matematica Seconda, Universita' degli Studi di Napoli, Via Vivaldi, 43 81100 Caserta, Italy, null, 5f71b6951c455f439fe5904b, null, null, null), List(56017c7b45cedb3395e628b4, null, null, 5b869d53e1cd8e14a3a1ddb6, A Corrias, null, null, null, null, Dipartimento di Matematica e Applicazioni, Universita degli Studi di Napoli Complesso Monte S., Angelo Via Cintia, 80126 Naples, Italy, null, 5f71b2811c455f439fe3c5cb, null, null, null))",10.1155/S0962935194000669,,,0962-9351,7.0,"List(section iii, section iv, management challenge, section ii, digital governance, digital divide, e-government model, discusses data warehousing, digital government, section v)",en,6.0,S3,S3,,,Preface.,"List(http://dx.doi.org/10.1155/S0962935194000669, http://jmlr.org/proceedings/papers/v38/lebanon15.html, http://www.ncbi.nlm.nih.gov/pubmed/23601432?report=xml&format=text, http://dl.acm.org/citation.cfm?id=2583610.2583657&coll=DL&dl=GUIDE&CFID=685818890&CFTOKEN=77058333&preflayout=flat, http://dl.acm.org/citation.cfm?id=1705506.1705589&coll=DL&dl=GUIDE&CFID=687163793&CFTOKEN=30852066&preflayout=flat, http://www.ncbi.nlm.nih.gov/pubmed/19853181?report=xml&format=text, http://dx.doi.org/10.1023/B:APCS.0000049330.42565.e2, http://www.ncbi.nlm.nih.gov/pubmed/23932103?report=xml&format=text, http://www.ncbi.nlm.nih.gov/pubmed/23211465?report=xml&format=text, http://www.ncbi.nlm.nih.gov/pubmed/25727707?report=xml&format=text, http://dl.acm.org/citation.cfm?id=2773784&COLL=DL&DL=ACM&preflayout=flat, http://dl.acm.org/citation.cfm?id=1959886.1960198&coll=DL&dl=GUIDE&CFID=521904122&CFTOKEN=70368633&preflayout=flat, http://dl.acm.org/citation.cfm?id=2653507.2654201&coll=DL&dl=GUIDE&CFID=522612554&CFTOKEN=93798862&preflayout=flat, http://www.ncbi.nlm.nih.gov/pubmed/24418611?report=xml&format=text, http://www.ncbi.nlm.nih.gov/pubmed/18475600?report=xml&format=text, http://dx.doi.org/10.1007/BF02072881, http://link.springer.com/article/10.1007/BF02072881, http://dx.doi.org/10.1007/BF02069112, http://link.springer.com/article/10.1007/BF02069112, http://dx.doi.org/10.1007/BF02063703, http://link.springer.com/article/10.1007/BF02063703, http://dx.doi.org/10.1007/BF02065714, http://link.springer.com/article/10.1007/BF02065714, http://dx.doi.org/10.1007/BF02064563, http://link.springer.com/article/10.1007/BF02064563, http://dx.doi.org/10.1007/BF02172104, http://link.springer.com/article/10.1007/BF02172104, http://dx.doi.org/10.1016/S0083-6729(16)30014-0, http://www.ncbi.nlm.nih.gov/pubmed/27125748?report=xml&format=text, http://dx.doi.org/10.1016/j.disopt.2016.09.002, http://doi.ieeecomputersociety.org/10.1109/TVCG.2016.2599300, https://doi.org/10.3233/FI-2017-1457, http://dx.doi.org/10.1002/net.21613, http://dl.acm.org/citation.cfm?id=2885345&preflayout=flat, https://doi.org/10.1016/j.endm.2018.01.001, https://doi.org/10.1007/BF02205445, https://www.ncbi.nlm.nih.gov/pubmed/27513583, http://www.webofknowledge.com/)","List(53a72d1920f7420be8c59b84, null, null, null, null, null, null, AISTATS, null, null, null, null, 0)",3.0,1994,8589942605
53e99796b7602d9701f61ea3,,"List(List(53f42e86dabfaee43ebd36a0, null, null, 5b86b6c7e1cd8e14a34ccada, Josephine Anstey, null, null, null, null, Univ. of Illinois at Chicago, Chicago, null, 5f71b29c1c455f439fe3d0e5, null, null, null))",10.1145/280953.281335,,1-58113-046-5,,,List(Guerilla VR),en,0.0,129,128,,,Guerilla VR,"List(http://dx.doi.org/10.1145/280953.281335, http://doi.acm.org/10.1145/280953.281335, db/conf/siggraph/siggraph1998aa.html#Anstey98, https://doi.org/10.1145/280953.281335)","List(null, null, null, null, null, null, null, SIGGRAPH Abstracts and Applications, null, null, null, null, 10)",,1998,17179870591
53e9979bb7602d9701f631e2,,"List(List(53f4399ddabfaeecd697ca71, null, null, null, Lawrence Lowe, null, null, null, null, Hewlett-Packard Ltd., South Queensferry, West Lothian, Scotland, null, 5f71b47f1c455f439fe4a655, null, null, null))",10.1016/0141-9331(79)90029-2,,,Microprocessors and Microsystems,3.0,List(),en,0.0,143,141,,,Digital troubleshooting,List(http://dx.doi.org/10.1016/0141-9331(79)90029-2),"List(53a7310b20f7420be8d1c829, null, null, null, null, null, null, Microprocessors and Microsystems - Embedded Hardware Design, null, null, null, null, 11)",3.0,1979,42949698417
53e9979bb7602d9701f68c03,"The common historically reductionist past of evolutionary biology and traditional social sciences such as economics has led way to a nascent holistic perspective of nonlinear science that is capable of describing multiple levels of reality. We propose a novel language for describing human behavior and social phenomena, set within a general theory of collective behavior and structure formation, with a resulting architecture that can be broadly applied. This work represents the blue print for a Multi-Agent Systems (MAS) design language in which agency is granted in a quantitative, rather than the traditional qualitative way. The relevant agents in the proposed system are intermediate in the sense that they are both influenced by an upper level with its own degree of agency, while at the same time they are determined by relatively independent subcomponents that must be ‘subdued’ into acceptable behavior. Any observed action is considered to be the result of the interplay of multiple distinguishable actors. We put forward this language as a basis for the construction of large-scale simulations and for the description of complex social phenomena.","List(List(53f436d0dabfaee1c0aa1287, null, esuarez@trinity.edu, 5b8698fbe1cd8e14a383b21f, Eugenio Dante Suarez, null, 544bd9b445ce266baf0c4902, null, null, Trinity University Department of Business Administration One Trinity Place San Antonio TX USA 78212, null, 5f71b2ad1c455f439fe3d751, null, null, 18241051), List(53f38339dabfae4b34a0295a, null, null, null, Antonio Rodríguez Díaz, null, null, null, null, null, null, null, null, null, 3480864), List(53f43166dabfaee02ac93c0b, null, puga@uabc.mx, null, Manuel Castañón-puga, null, 544bd9d245ce266baf278949, null, null, Baja California Autonomous University Chemistry and Engineering Faculty Calzada Tecnológico 14418, Mesa de Otay Tijuana Baja California México 22390, null, null, null, null, 18124677))",10.1007/978-3-540-70812-4_15,,,,,"List(distributed agency, social simulation, multi-agent systems, levels of reality.)",en,4.0,293,269,,,Fuzzy Agents,List(http://dx.doi.org/10.1007/978-3-540-70812-4_15),"List(5390a21620f70186a0e5e13d, null, null, null, null, null, null, Soft Computing for Hybrid Intelligent Systems, null, null, null, null, 0)",,2008,42949690042
53e9979eb7602d9701f6d619,"In program debugging, reproducibility of bugs is a key requirement. Unfortunately, bugs in concurrent programs are notoriously difficult to reproduce because bugs due to concurrency happen under very specific thread schedules and the likelihood of taking such corner-case schedules during regular testing is very low. We propose concurrent breakpoints, a light-weight and programmatic way to make a concurrency bug reproducible. We describe a mechanism that helps to hit a concurrent breakpoint in a concurrent execution with high probability. We have implemented concurrent breakpoints as a light-weight library for Java and C/C++ programs. We have used the implementation to deterministically reproduce several known non-deterministic bugs in real-world concurrent Java and C/C++ programs with almost 100% probability.","List(List(53f450d6dabfaefedbb3c259, null, null, 5b86b6cce1cd8e14a34cf6f7, Chang-Seo Park, null, null, null, null, University of California Berkeley, Berkeley, CA, USA, null, 5f71b2841c455f439fe3c6b7, null, null, null), List(53f4ebf6dabfaee4dc8b8c01, null, null, 5b86b6cce1cd8e14a34cf6f7, Koushik Sen, null, null, null, null, University of California Berkeley, Berkeley, CA, USA, null, 5f71b2841c455f439fe3c6b7, null, null, null))",10.1145/2145816.2145880,,,0362-1340,8.0,"List(real-world concurrent Java, concurrent program, corner-case schedule, high probability, key requirement, light-weight library, concurrent breakpoint, concurrent execution, concurrency bug reproducible, concurrent breakpoints)",en,14.0,332,331,//static.aminer.org/pdf/20170130/pdfs/ppopp/ktpiukebcycrz018homqarhamdxn6gpv.pdf,,Concurrent breakpoints,"List(http://dx.doi.org/10.1145/2145816.2145880, http://doi.acm.org/10.1145/2145816.2145880, https://static.aminer.org/pdf/20170130/pdfs/index.txt, https://dl.acm.org/doi/abs/10.1145/2370036.2145880)","List(53a728b620f7420be8bb65ff, null, null, null, null, null, null, PPOPP, null, null, null, null, 0)",47.0,2012,25769812991
53e9979eb7602d9701f6eea1,"Principles of Population Genetics (3rd edn). Daniel L. Hartl and Andrew G. Clark. Sinauer Associates, Inc., Sunderland, Massachusetts. 1997. Pp. 542. Price £39.95, hardback. ISBN 0 87893 306 9. There is demand for a more widespread understanding of population genetics. Many a 'health scare of the week' highlights the lack of public understanding of population processes. Areas ranging from law enforcement through agricultural and medical practice to biological conservation now require some expertise in population genetics. Does this book promise to satisfy the demand? Star qualities are immediately apparent: enthusiasm for the subject, a clear overview, and a universal style. Such qualities did not arise overnight. The two previous editions were considered to be the leading modern text for advanced undergraduates and beginning postgraduates. However, I feel that the presentation is too ambitious for most undergraduates, especially for those with no background. For such, Professor Hartl's Primer in Population Genetics (2nd edn) from the same publisher will provide a more accessible introduction. To make life simpler, education tends towards telling a linear story. Population genetics on the other hand resembles a multidimensional jigsaw puzzle: its language is a colourful amalgam of physical mechanisms, mathematical models, and statistical descriptions. This text deals with the conflict by dividing the subject into its major processes and applications. Nonetheless, the authors try to emphasize connectedness: for example, between inbreeding and genetic drift. Another recurring theme is the current deficit in our understanding of the relationship between genetic and phenotypic variation. The third edition introduces many changes in structure and content. The effect is to give a better flow from background concepts through processes to applications. The genetic system at the centre of attention is the classical gene in an outbreeding diploid. Nonetheless, glimpses are provided of other systems, such as selfing plants, transposable elements and bacterial chromosomes. The first two chapters provide the genetic and statistical background. The resolution of the conflict between the biometrical and Mendelian views of inheritance by Sir Ronald Fisher is inserted to give the subject a historical perspective. Chapters three to seven form the core of the book, and describe the main influences on genetic variation, including the relationship between gene and genotype, population subdivision, inbreeding, mutation, recombination and migration. Chapter six provides a panorama of Darwinian selection, from single gene models, through the mutation-selection balance and interactions with genetic drift, to all the complexities of age-structured populations and epistasis. The long-standing debate about the relative roles of selection and drift in adaptive evolution is referred to without the reader becoming enmeshed, and Kimura's neutral theory is accorded the role of universal null hypothesis in this context. The reader is given a taste of the diffusion theory which identified the equilibrium states of the classical literature, and which led to the latest coalescence models. Chapters eight and nine deal with two major applications of population genetic theory: molecular evolution and quantitative genetics. The use of insights from the neutral theory to interpret protein and DNA sequence data is described, as are the methods for gene tree construction. Important applications of gene trees are described: for hypothesis tests of the direction of organismal trait evolution, for estimation of long-term migration rates, and for demonstrating selection at the gene level over evolutionary time scales. Quantitative genetics is given the standard presentation: the definition of heritability, and the estimation of components of genetic variance. The use of dense linkage maps for locating quantitative trait loci is presented as one way to elucidate the relationship between genetic and phenotypic variation. The book ends on this characteristically optimistic note. This book is essential reading for anyone requiring a firm foundation in modern population genetics. Arabidopsis Protocols (Methods in Molecular Biology 82). José M. Martínez-Zapater and Julio Salinas (eds). Humana Press, Totowa, New Jersey. 1998. Pp. 440. Price $79.50, hardback. ISBN 0 89603391 0. Arabidopsis Protocols represents a timely publication that is aimed to fill a niche within a market full of plant molecular biology-related titles. Interest in Arabidopsis as an experimental model for higher plants has increased exponentially since the mid-1980s and is set to rise even further with the impending completion of its genome project soon after the millennium. The strongly nucleic acid-related theme within many chapters of the book reflects the array of techniques that are currently being employed by the international Arabidopsis community to map and clone their target genes. However, Arabidopsis Protocols does far more than provide solace for the loneliness of the long-distance chromosome walker. Initial chapters provide basic, yet important, information for new Arabidopsis researchers — how to grow this weed successfully! This section includes useful chapters by Mary Anderson describing the control of Arabidopsis pests and diseases, plus Randy Scholl and colleagues detail how they preserve the longevity of Arabidopsis seed. In contrast, Part 2 proved disappointing. Whilst I commend the editors for including chapters describing the purification of Arabidopsis organelles and macromolecules, I was surprised to discover that Arabidopsis cells only contain mitochondrial and chloroplast organelles and that their macromolecules are totally nucleic acid-based! The inclusion of a chapter describing the subfractionation of Arabidopsis cellular membranes would have gone a long way towards balancing this section. Part 3 proved excellent reading, particularly the chapters on seed mutagenesis and genetic analysis which included useful introductory discussions, whilst Part 4 comprehensively describes PCR and RFLP-based techniques used to map Arabidopsis genes and mutations. Transient and stable Arabidopsis transformation techniques provide the theme for Part 5, including a description of vacuum-based in planta, Agrobacterium-mediated transformation by Bechtold and Pelletier. However, I was surprised to see no reference being made to detergent-based in planta transformation methodologies which are currently gaining favour amongst many Arabidopsis laboratories. Commendably, the treatment of gene cloning within Part 6 was well balanced, contrasting chromosome walking and landing strategies with transposon and T-DNA-based gene tagging approaches. The final section addresses techniques used to study Arabidopsis gene expression including a selection of chapters describing in situ hybridisation, reporter genes and, particularly enjoyable, in vivo footprinting. However, I was disappointed that the editors did not include a chapter on the localisation of those well known DNA by-products, proteins! This omission emphasises the overall nucleic acid-based theme within Arabidopsis Protocols. Whilst it is fair to say that the majority of Arabidopsis researchers are presently engrossed with nucleic acid-related topics, there is a growing interest in post-transcriptional processes. In summary, Arabidopsis Protocols provides an excellent technical reference for molecular geneticists, old and new, and will represent a 'must-have' text for every Arabidopsis laboratory for the next decade. Principles of Genome Analysis (2nd edn). S. B. Primrose. Blackwell Science, Oxford. 1998. Pp 193. Price £19.50, paperback. ISBN 0 632 04983 9. S. B. Primrose has authored or co-authored several books like this one: heavily illustrated texts of between 100 and 200 pages on subjects related to gene manipulation. Each is aimed at a broad market — students, lecturers or researchers interested in understanding, or considering entry to, the fields in question. All provide references to both the original literature and to appropriate reviews (about 400 references are cited in this volume). The two most successful, Introduction to Modern Virology (latterly with Dimmock) and Principles of Gene Manipulation (with Old) are in their 4th and 5th editions, respectively, and have increased in size several fold (to about 400 pages). Principles of Gene Manipulation is used as a teaching text in this University. Each of these books was last updated in 1994. Principles of Genome Analysis, subtitled A Guide to Mapping and Sequencing DNA from Different Organisms was first published in 1995 and has now been republished in revised, although not much lengthened, form. The subject of this volume is, of course, extremely timely. With public interest in human genome analysis reaching heights unprecedented for a problem in fundamental biological research, and with both large-scale commercial and government funding earmarked for the sequencing and analysis of the genomes of dozens of different species, the constituency of those interested in this volume should be both large and diverse. The book is divided into seven chapters. Chapter 1 attempts both to explain the benefits of genome sequencing, from the lofty ('detailed understanding of the organism will only be achieved when every gene has been identified and its transcript and the timing of transcript synthesis known') to the practical (a more efficient way to find disease genes). It also lists sequencing projects which are underway, describes some techniques, and outlines the rest of the book. I found Chapter 2 on the organization and structure of genomes (with the lowest density of 'practical' information) the most interesting. As one more familiar with prokaryotes, the complexity and variability (even amongst related species) of eukaryotic DNA organization creates for me a bewildering picture: why does the cow have a genome which harbours 25% tandemly repeated DNA? Why is mammalian DNA heavily punctuated with functionally inactive interspersed repeats? Why do introns vary in size and frequency between species? Chapter 3 is a straightforward exposition of methods for subdividing chromosomes into minichromosomes and other vectors as a preliminary to sequencing and gene mapping. Chapter 4 offers a real challenge to the novice; it presents a veritable alphabet soup (fortunately there is an abbreviations list at the start of the book) of ingenious methods for assembling physical maps. I think, for the non-expert, it is more to be dipped into than assimilated. Chapter 5, on sequencing, began as a welcome relief but progressed to an explication of sequencing on microchips which defeated me. Chapter 6, on sequence analysis and new to this edition, is only 12 pages long and surprisingly, considering the subject matter of the rest of the book, concentrates primarily on bacterial genome sequences with heavy reference to the publications of Koonin and collaborators. Others are more circumspect than he at declaring pairs of genes homologous and the impression is perhaps given here that genes (and their functions) are more conserved between species than may eventually prove to be the case. The final chapter deals with finding genes in the DNA-haystack of large genomes. The text is well illustrated with mostly useful diagrams (some borrowed from Principles of Gene Manipulation), although some were hard to understand, and with a lesser number of tables (I would have liked more). There are a number of proof-reading errors, omissions (several references are absent from the reference lists) difficult or inadequate figure captions and the occasional misstatement, but these do not seriously detract from the usefulness of the book. Although I suspect that private purchases may be limited, this is certainly a volume with a place in every comprehensive biology library. Genetic Structure and Local Adaptation in Natural Insect Populations. Susan Mopper and Sharon Y. Strauss (eds). Chapman and Hall, New York. 1997. Pp. 449. Price £65.00, hardback. ISBN 041208031 1. Some 30 years ago, three seminal papers were published that have steered subsequent work on insect/plant interactions. First, Hairston, Smith & Slobodkin (now known as HSS, 1960) published a short note based on the observation that 'the world is green' and that the nature of population regulation depends on the trophic level examined. This work sparked research and controversy eventually leading to debates on the role of competition vs. other forces in structuring ecological communities. Following HSS, Ehrlich & Raven (1964) made the case for reciprocal coevolution between plants and insects, prompting investigations into the role of phytochemistry in insect/plant interactions, as well as phylogenetic tests of macro-evolutionary cospeciation between insects and plants. Finally, Bush (1969) stirred the pot with evidence from plant associated flies of sympatric speciation, a topic that remains controversial to this day. Now, 30 years later, we are wondering where this field stands. For the past decade, Denno & McClure's (1983) Variable Plants and Herbivores in Natural and Managed Systems and Strong, Lawton & Southwood's (1984) Insects on Plants provided the backbone for this discipline. In this volume, Mopper and Strauss have offered a much-needed update to these well-worn bibles of insect/plant interactions. It is an impressive collection of well-edited papers that focus on the genetic issues at the heart of insect/plant interactions, and in particular the importance of demic adaptation. In the past third of a century, the field has matured, not coincidentally as a result of more sophisticated genetic techniques as well as methods of analysis. So, what's new? Several themes emerge. As Boecklen & Mopper (Ch. 4) point out, the proper designs to test for local adaptation are now recognized, and the relevant data are pulled together here for the first time. Picture insects feeding on plants. First, selection for adaptation to host plants can be strong enough to overcome most scenarios of gene flow (Strauss & Karban, Ch. 8), and, as a result, local adaptation can occur at several possible scales, for example, individual plants, plant species, and sites (Mopper, Ch. 7). Local adaptation can lead to the formation of demes (Alstad, Ch. 1; Stiling & Rossi, Ch. 2), but not always (Cobb &Whitham, Ch. 3; Boecklen & Mopper, Ch. 4; Hanks & Denno, Ch. 11). Dispersal, or lack thereof, is critical in permitting local adaptation (Mopper, Ch. 7; Strauss & Karban, Ch. 8, Hanks & Denno, Ch. 11; Thomas & Singer, Ch. 14). Feeding style (Stiling & Rossi, Ch. 2), inherited environmental effects (Rossiter, Ch. 6), social behaviour and other intrademic structure (McCauley & Goff, Ch. 9; Costa, Ch. 10), life history patterns (Peterson & Denno, Ch. 12), and many sorts of habitat heterogeneity (Gandon et al., Ch. 13), all influence the magnitude of gene flow if not the likelihood of local adaptation. Finally, host fidelity can act as a reproductive barrier to gene flow either partially (Thomas & Singer, Ch. 14; Itami et al., Ch. 15) or nearly completely (Feder et al., Ch. 16). Given that the book is a solid and important contribution to the field, is anything missing? The mechanisms underlying adaptation have received little attention by the contributors and are 'conspicuous by their absence' (Berenbaum & Zangerl, Ch. 5). This criticism may be valid–reciprocal transplant studies used to detect local adaptation often make it difficult to distinguish among various selective mechanisms. However, this is a criticism of the field rather than the book. One promising area that was overlooked in the book is the use of phylogenetics or genealogies to understand the evolution of host associations, although Itami et al. (Ch. 15) come close. With regards to organization, I have four reservations about the volume. First, the collection of papers is dominated by North American schools of insect/plant studies – 15 of 16 papers; the 16th, Gandon et al., is out of France. Second, the authors missed an opportunity to comment on the chapters — the four sections would have benefited from both synthesis and perspective. Third, the index is thin at seven pages, compared to 37 pages in Denno & McClure (albeit a larger volume) and 20 pages in Strong, Lawton & Southwood (a smaller volume) which included a very useful author index, an unfortunate omission here. Fourth, and this one is for the publishers, £65.00 is beyond the grasp of most graduate students worldwide. Does it really cost this much to produce an edited volume of 16 papers when academics write these chapters for free? That said, who will want to read the book? I would recommend the book as required reading for anyone interested in insect/plant interactions. Perhaps more importantly, it is an outstanding source of ideas. The perfect target would be students in a seminar setting, or a grant-starved professor submitting another proposal on insect/plant interactions (thank you!), or a beginning PhD student looking for a project. But, for those of you whose daily encounters with entomofauna is a splat on the windshield and the millennium bug, there is still a lot here — genetic differentiation, selection, adaptation, speciali-zation, life-history variation, speciation, and biodiversity. There is no doubt this field is evolving quickly, so read the book and keep up with the Red Queen! Melanism: Evolution in Action. Michael E. N. Majerus. Oxford University Press, Oxford. 1998. Pp. 338. Price £23.95, paperback. ISBN 0 19 854982 2. The classical example of natural selection at work concerns melanism (the occurrence of dark forms). Probably all biologists have heard about 'the peppered moth story', first published by Kettlewell in Heredity. The simple version of the story is that once upon a time there were two forms of the peppered moth (Biston betularia), a common white one and a rare black one. When pollution made tree trunks black, more whites than blacks were eaten by birds, and since then most peppered moths in industrial areas are black. If you suspect that the reality is not as simple as that you are right. The moths, for example, do not normally rest on tree trunks, and intermediate forms exist. Michael Majerus dissects the story in his book on melanism and shows that it is more complex and fascinating than most biologists will have realized. The book starts with general principles. First what melanism is and its possible causes, then principles of genetics and finally those of evolutionary processes. Readers of Heredity can skip the genetic part, unless they want to have a clearly written text that explains genetics at the very basic level. Likewise the evolutionary part: it is basic, but might be suitable in a course for first year students. The remainder of the book is on melanism in specific groups of organisms. First two chapters on the peppered moth, then two on other moths, and finally a large chapter on ladybirds. The reason for the choice of organisms is not that melanism is restricted to them, but simply that the author happens to work on them. Is the book therefore only of interest to those who work on moths and ladybirds? Surely not! By ignoring the book workers on other organisms would make the same mistake that Majerus did, missing interesting and important work outside their own realm. The author states in the preface that one of his original goals was to 'update Kettlewell' but that this was not achievable in one book. Instead he has 'used only those examples known to me to illustrate the points I have tried to make'. His real aim then was to engender renewed interest in melanism. But he could have done a better job in this respect. By sticking to his 'own organisms' he missed the opportunity to highlight several important topics related to melanism. There is, for example, nice work on plasticity in melanism, and on the relationship between the darkness of wings and mate locating behaviour in butterflies. The author apparently also missed the fact that the forms of the scarlet tiger moth (Panaxia dominula) are mainly determined by temperature, thereby making it useless as an illustration of balanced polymorphism. The book is almost exclusively on cases of melanism with discrete forms determined by a few alleles. Heritabilities are hardly mentioned, although there exists work dealing with continuous variation, even in 'his own' ladybirds. I found the last section on 'the future of research into melanism' disappointing. I think that the most exciting opportunities arise by integrating different biological disciplines. What, for example, is the developmental basis of the spots in ladybirds, is it the same in the different forms, and in different species? By combining such questions with a phylogenetic approach we would really learn something new about the evolution of melanism. The book is, however, very readable — something which cannot be said of many books containing so much interesting scientific material.","List(List(null, null, null, 5b8696f3e1cd8e14a376503f, John Dallas, null, null, null, null, 1NERC Molecular Genetics in Ecology Initiative, Department of Zoology, University of Aberdeen, Tillydrone Avenue, Aberdeen AB24 2TZ, UK, null, 5f71b2921c455f439fe3cc52, null, null, null), List(53f4cfdbdabfaeedd877d78a, null, null, 5b86c96ee1cd8e14a3d31959, Malcolm J Bennett, null, null, null, null, 2Department of Biological Sciences, University of Warwick, Coventry CV4 7AL, UK, null, 5f71b2871c455f439fe3c7fd, null, null, null), List(null, null, null, 5b86c802e1cd8e14a3c8bae8, Millicent Masters, null, null, null, null, 3Institute of Cell and Molecular Biology, University of Edinburgh, Kings Buildings, Edinburgh EH9 3JR, UK, null, 5f71b4a31c455f439fe4b735, null, null, null), List(null, null, null, 5b86bc34e1cd8e14a374114c, George k Roderick, null, null, null, null, 4Center for Conservation Research and Training, 3050 Maile Way, 409 Gilmore, University of Hawaii, Honolulu, Hawaii 96822, USA, null, 5f71b2901c455f439fe3cba3, null, null, null), List(53f44ccadabfaee1c0b0690e, null, null, 5b868a2fe1cd8e14a3241e3b, Jack J Windig, null, null, null, null, 5Department of Zoology, University of Stockholm, S 106 91 Stockholm, Sweden, null, 5f71b2841c455f439fe3c682, null, null, null))",10.1093/icb/42.2.408,,,1540-7063,2.0,"List(book review, genetics, eukaryotes, ecological, population and evolutionary genetics, human population genetics, genomics, post-genomics, biometrical and statistical genetics, animal and plant breeding, cytogenetics)",en,2.0,408,408,https://static.aminer.cn/upload/pdf/program/53e9979eb7602d9701f6eea1_0.pdf,,Book reviews.,"List(http://dx.doi.org/10.1093/icb/42.2.408, http://dx.doi.org/10.3233/IFS-1994-2108, http://www.ncbi.nlm.nih.gov/pubmed/21708734?report=xml&format=text, http://dx.doi.org/doi:10.1046/j.1365-2540.1998.00470.x, http://www.nature.com/hdy/journal/v81/n4/full/6884700a.html, https://link.springer.com/10.1007/BF00287783)","List(555036d07cea80f954159906, null, null, null, null, null, null, Math. Meth. of OR, null, null, null, null, 0)",42.0,2002,25769807201
53e9979fb7602d9701f70150,"Block Jam is a musical interface controlled by the arrangement of 24 tangible blocks. By positioning the blocks (Figure 1), musical phrases and sequences are created, allowing multiple users to play and collaborate.","List(List(53f447d7dabfaee4dc7d6cae, null, null, null, Henry Newton-Dunn, null, null, null, null, Sony CSL Interaction Lab, null, 5f71b2f51c455f439fe3f795, null, null, null), List(53f466dddabfaeecd6a0eca1, null, null, 5b86b9dde1cd8e14a36333b1, Hiroaki Nakano, null, null, null, null, Sony Design Center, null, 5f71b2841c455f439fe3c67d, null, null, null), List(54354272dabfaebba58bc248, null, null, 5b86ad96e1cd8e14a30bb298, James Gibson, null, null, null, null, Sony Design Center Europe, null, 5f71b2841c455f439fe3c67d, null, null, null))",10.1145/1242073.1242102,,1-58113-525-4,,,"List(interactive music, musical phrase, multiple user, musical interface, collaborative systems, block jam, tangible block, tangible interface)",en,5.0,67,67,,,Block jam,"List(http://dx.doi.org/10.1145/1242073.1242102, http://doi.acm.org/10.1145/1242073.1242102, db/conf/siggraph/siggraph2002aa.html#DunnNG02, https://doi.org/10.1145/1242073.1242102)","List(5736ae3ad39c4f40a797601c, null, null, null, null, null, null, SIGGRAPH Abstracts and Applications, null, null, null, null, 10)",,2002,8
53e997a2b7602d9701f782e9,"As the industry moves to personalization and mobility, users expect their applications to be location savvy, and relevant to their lives in increasing detail. While we can pinpoint a user at a location within 700 meters with just their IP address, and within a meter with their GPS-enabled mobile phone, we fall short when it comes to understanding their geographic context. A person's geographic context includes their current and previous location, the things that surround them, their activity in a given place, as well as their thoughts and feelings in that place. Understanding this context allows us to personalize their experience and refine their interactions with an application, on a hyper-local level.","List(List(53f497fbdabfaee1c0bad917, null, null, null, Vanessa Murdock, null, 544bd9d745ce266baf2ae4cc, null, null, Yahoo! Research, Barcelona, Spain, null, 5f71b2901c455f439fe3cb6d, null, null, null), List(53f32e25dabfae9a8449e908, null, null, 5b868fbce1cd8e14a3477bf0, Gary Gale, null, 544bd9be45ce266baf14b877, null, null, Nokia Gate5 GmbH, Berlin, Germany, null, 5f71b2831c455f439fe3c646, null, null, null))",10.1145/2063576.2064030,,,,,"List(hyper-local level, IP address, industry move, computational geography, geographic context, GPS-enabled mobile phone, previous location, location savvy)",en,,2598,2597,,,Computational geography,List(http://doi.acm.org/10.1145/2063576.2064030),"List(53a72c0f20f7420be8c3872f, null, null, null, null, null, null, CIKM, null, null, null, null, 0)",,2011,6201


In [0]:
# save as a delta table
new_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("organizations")

In [0]:
display(spark.table("organizations").limit(10))

org_id,org_name,org_country,ID
,Maynard Evans High School,,0
5f71b3201c455f439fe40a0b,"LIAFA, Université Paris VII, France",France,1
5f71b2dd1c455f439fe3ecd0,"Research Institute for Symbolic Computation, Johannes Kepler University, A-4040 Linz, Austria",Austria,2
5f71b4961c455f439fe4b0fe,"2 Moore Road, Southboro, Massachusetts",United States,3
5f71b2811c455f439fe3c5b7,"Satellite Television Corp., New York, NY|c|",United States,4
5f71b3311c455f439fe41212,"Yandex, Moscow, Russian Fed.",Russia,5
,Mainz,,6
5f71b2a01c455f439fe3d2b7,"University of Illinois at Urbana-Champaign Department of Philosophy 105 Gregory Hall, MC 468 Urbana IL 61801 USA",United States,7
5f71b2f51c455f439fe3f795,Sony CSL Interaction Lab,,8
5f71b2ca1c455f439fe3e4a2,"Univ. of Nijmegen, The Netherlands",Netherlands,9
