In [1]:
# Standard library
import json
import re

# Third-party
import pandas as pd
import torch
from IPython.core.interactiveshell import InteractiveShell
from torch import cuda
from transformers import set_seed

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Release any cached GPU memory left over from a previous run in this kernel.
torch.cuda.empty_cache()

In [2]:
# Confirm PyTorch can see a CUDA device; the generation cell moves its inputs to the GPU with .cuda().
cuda.is_available()

True

In [3]:
# FORD (Fields of Research and Development) top-level classes: 5-digit code -> name.
# The first three digits of a code identify the class; the detailed table below uses
# the same three-digit prefix for its subcategories (e.g. 206xx = Medical engineering,
# 207xx = Environmental engineering), so the two tables must agree on those prefixes.
ford_categories = {
    # 1 - Natural sciences
    "10100": "Mathematics",
    "10200": "Computer and information sciences",
    "10300": "Physical sciences",
    "10400": "Chemical sciences",
    "10500": "Earth and related environmental sciences",
    "10600": "Biological sciences",
    "10700": "Other natural sciences",
    # 2 - Engineering and technology
    "20100": "Civil engineering",
    "20200": "Electrical engineering, Electronic engineering, Information engineering",
    "20300": "Mechanical engineering",
    "20400": "Chemical engineering",
    "20500": "Materials engineering",
    "20600": "Medical engineering",
    "20700": "Environmental engineering",
    "20800": "Environmental biotechnology",
    "20900": "Industrial biotechnology",
    "21000": "Nano-technology",
    "21100": "Other engineering and technologies",
    # 3 - Medical and health sciences
    "30100": "Basic medicine",
    "30200": "Clinical medicine",
    "30300": "Health sciences",
    "30400": "Medical biotechnology",
    "30500": "Other medical sciences",
    # 4 - Agricultural and veterinary sciences
    "40100": "Agriculture, Forestry and Fisheries",
    "40200": "Animal and Dairy science",
    "40300": "Veterinary science",
    "40400": "Agricultural biotechnology",
    "40500": "Other agricultural sciences",
    # 5 - Social sciences
    "50100": "Psychology and cognitive sciences",
    "50200": "Economics and Business",
    "50300": "Education",
    "50400": "Sociology",
    "50500": "Law",
    "50600": "Political science",
    "50700": "Social and economic geography",
    "50800": "Media and communications",
    "50900": "Other social sciences",
    # 6 - Humanities and the arts
    "60100": "History and Archaeology",
    "60200": "Languages and Literature",
    "60300": "Philosophy, Ethics and Religion",
    "60400": "Arts (arts, history of arts, performing arts, music)",
    "60500": "Other Humanities and the Arts",
}

In [4]:
# Research types mapped to a two-sentence definition each.
# Every value is written as two adjacent string literals (one per sentence) that
# Python concatenates at compile time, so the stored text is a single string.
research_dict = {
    "Quantitative research": (
        "Involves the systematic collection and analysis of numerical data to describe and understand phenomena, test hypotheses, and establish patterns and relationships. "
        "Relies on statistical methods for data analysis."
    ),
    "Experiment": (
        "A research method where researchers manipulate one or more variables under controlled conditions to observe the effect on another variable. "
        "Used to establish causality and relationships between variables."
    ),
    "Exploratory research": (
        "Conducted to gain insights, identify research questions, and understand a problem more comprehensively. "
        "Often used in the early stages of research to explore new areas or phenomena."
    ),
    "Qualitative research": (
        "Focuses on understanding human behavior, experiences, and perceptions through methods like interviews, observations, and textual analysis. "
        "Aims to uncover meanings and patterns."
    ),
    "Mixed methods": (
        "Integrates qualitative and quantitative approaches in a single study to provide a more comprehensive understanding of a research problem. "
        "Combines strengths of both methods."
    ),
    "Observation": (
        "Involves systematically watching and recording behaviors, events, or phenomena to gather data. "
        "Can be conducted in natural settings or controlled environments."
    ),
    "Applied research": (
        "Aims to address specific practical problems and produce outcomes directly applicable to real-world situations. "
        "Often involves collaboration with stakeholders."
    ),
    "Descriptive research": (
        "Seeks to describe characteristics or features of a population, phenomenon, or process without manipulating variables. "
        "Provides baseline information."
    ),
    "Focus groups": (
        "Involve facilitated discussions with a small group of participants to gather insights, opinions, or attitudes on a specific topic. "
        "Provide qualitative data through group interactions."
    ),
    "Survey methodology": (
        "Encompasses the design, implementation, and analysis of surveys to gather data from a sample of a population. "
        "Focuses on ensuring validity and reliability of survey instruments."
    ),
    "Correlational research": (
        "Examines the relationship between variables to determine whether and how they co-vary. "
        "Assesses the strength and direction of associations but does not establish causation."
    ),
    "Interviews": (
        "A method of data collection involving direct conversations with participants to gather detailed and in-depth information. "
        "Can be structured or unstructured."
    ),
    "Questionnaires": (
        "Structured instruments consisting of a set of questions designed to gather standardized data from respondents. "
        "Used in survey research to collect quantitative data."
    ),
    "Secondary research": (
        "Involves the use of existing data sources such as literature reviews, databases, or reports to address research questions. "
        "Provides a synthesis of existing knowledge."
    ),
    "Conclusive research": (
        "Aims to provide final answers or conclusive findings to specific research questions or hypotheses. "
        "Often used to validate or refute existing theories or claims."
    ),
    "Ethnographic research": (
        "Involves deep immersion in a social setting to understand the culture, behaviors, and practices of a group. "
        "Uses participant observation and qualitative methods."
    ),
}


In [5]:
# Detailed FORD subcategories: 5-digit code -> label.
# The first three digits of each code identify the parent FORD class; get_subclasses()
# groups entries by that prefix and joins the labels with ", ", so labels carry no
# leading/trailing whitespace.
detailed_ford_categories = {
    # 101 - Mathematics
    "10101": "Pure mathematics",
    "10102": "Applied mathematics",
    "10103": "Statistics and probability",
    # 102 - Computer and information sciences
    "10201": "Computer sciences, information science, bioinformatics; excluding hardware development, social aspect",
    # 103 - Physical sciences
    "10301": "Atomic, molecular and chemical physics (physics of atoms and molecules including collision, interaction with radiation,magnetic resonances Mössbauer effect)",
    "10302": "Condensed matter physics (including formerly solid state physics, supercond.)",
    "10303": "Particles and field physics",
    "10304": "Nuclear physics",
    "10305": "Fluids and plasma physics (including surface physics)",
    "10306": "Optics (including laser optics and quantum optics)",
    "10307": "Acoustics",
    "10308": "Astronomy (including astrophysics, space science)",
    # 104 - Chemical sciences
    "10401": "Organic chemistry",
    "10402": "Inorganic and nuclear chemistry",
    "10403": "Physical chemistry",
    "10404": "Polymer science",
    "10405": "Electrochemistry (dry cells, batteries, fuel cells, corrosion metals, electrolysis)",
    "10406": "Analytical chemistry",
    # 105 - Earth and related environmental sciences
    "10501": "Hydrology",
    "10502": "Oceanography",
    "10503": "Water resources",
    "10504": "Mineralogy",
    "10505": "Geology",
    "10506": "Paleontology",
    "10507": "Volcanology",
    "10508": "Physical geography",
    "10509": "Meteorology and atmospheric sciences",
    "10510": "Climatic research",
    "10511": "Environmental sciences; excluding social aspects",
    # 106 - Biological sciences
    "10601": "Cell biology",
    "10602": "Biology (theoretical, mathematical, thermal, cryobiology, biological rhythm), Evolutionary biology",
    "10603": "Genetics and heredity; excluding medical genetics",
    "10604": "Reproductive biology; excluding medical aspects",
    "10605": "Developmental biology",
    "10606": "Microbiology",
    "10607": "Virology",
    "10608": "Biochemistry and molecular biology",
    "10609": "Biochemical research methods",
    "10610": "Biophysics",
    "10611": "Plant sciences, botany",
    "10612": "Mycology",
    "10613": "Zoology",
    "10614": "Behavioral sciences biology",
    "10615": "Ornithology",
    "10616": "Entomology",
    "10617": "Marine biology, freshwater biology, limnology",
    "10618": "Ecology",
    "10619": "Biodiversity conservation, Other biological topics",
    # 201 - Civil engineering
    "20101": "Civil engineering",
    "20102": "Construction engineering, Municipal and structural engineering",
    "20103": "Architecture engineering",
    "20104": "Transport engineering",
    # 202 - Electrical, electronic and information engineering
    "20201": "Electrical and electronic engineering",
    "20202": "Communication engineering and systems",
    "20203": "Telecommunications",
    "20204": "Robotics and automatic control",
    "20205": "Automation and control systems",
    "20206": "Computer hardware and architecture",
    # 203 - Mechanical engineering
    "20301": "Mechanical engineering",
    "20302": "Applied mechanics",
    "20303": "Thermodynamics",
    "20304": "Aerospace engineering",
    "20305": "Nuclear related engineering; excluding nuclear physics",
    "20306": "Audio engineering, reliability analysis",
    # 204 - Chemical engineering
    "20401": "Chemical engineering (plants, products)",
    "20402": "Chemical process engineering",
    # 205 - Materials engineering
    "20501": "Materials engineering",
    "20502": "Paper and wood",
    "20503": "Textiles; including synthetic dyes, colours, fibres, excluding Nanoscale materials and Biomaterials",
    "20504": "Ceramics",
    "20505": "Composites (including laminates, reinforced plastics, cermets, combined natural and synthetic fibre fabrics; filled composites)",
    "20506": "Coating and films",
    # 206 - Medical engineering
    "20601": "Medical engineering",
    "20602": "Medical laboratory technology (including laboratory samples analysis; diagnostic technologies), excluding Biomaterials",
    # 207 - Environmental engineering
    "20701": "Environmental and geological engineering, geotechnics",
    "20702": "Petroleum engineering (fuel, oils)",
    "20703": "Mining and mineral processing",
    "20704": "Energy and fuels",
    "20705": "Remote sensing",
    "20706": "Marine engineering, sea vessels",
    "20707": "Ocean engineering",
    # 208 - Environmental biotechnology
    "20801": "Environmental biotechnology",
    "20802": "Bioremediation, diagnostic biotechnologies (DNA chips and biosensing devices) in environmental management",
    "20803": "Environmental biotechnology related ethics",
    # 209 - Industrial biotechnology
    "20901": "Industrial biotechnology",
    "20902": "Bioprocessing technologies (industrial processes relying on biological agents to drive the process) biocatalysis, fermentation",
    "20903": "Bioproducts (products that are manufactured using biological material as feedstock) biomaterials, bioplastics, biofuels, bioderived bulk and fine chemicals, bio-derived novel materials",
    # 210 - Nano-technology
    "21001": "Nano-materials (production and properties)",
    "21002": "Nano-processes (applications on nano-scale), excluding biomaterials",
    # 211 - Other engineering and technologies
    "21101": "Food and beverages",
    # 301 - Basic medicine
    "30101": "Human genetics",
    "30102": "Immunology",
    "30103": "Neurosciences including psychophysiology",
    "30104": "Pharmacology and pharmacy",
    "30105": "Physiology (including cytology)",
    "30106": "Anatomy and morphology, excluding plant science",
    "30107": "Medicinal chemistry",
    "30108": "Toxicology",
    "30109": "Pathology",
    # 302 - Clinical medicine
    "30201": "Cardiac and Cardiovascular systems",
    "30202": "Endocrinology and metabolism (including diabetes, hormones)",
    "30203": "Respiratory systems",
    "30204": "Oncology",
    "30205": "Hematology",
    "30206": "Otorhinolaryngology",
    "30207": "Ophthalmology",
    "30208": "Dentistry, oral surgery and medicine",
    "30209": "Paediatrics",
    "30210": "Clinical neurology",
    "30211": "Orthopaedics",
    "30212": "Surgery",
    "30213": "Transplantation",
    "30214": "Obstetrics and gynaecology",
    "30215": "Psychiatry",
    "30216": "Dermatology and venereal diseases",
    "30217": "Urology and nephrology",
    "30218": "General and internal medicine",
    "30219": "Gastroenterology and hepatology",
    "30220": "Andrology",
    "30221": "Critical care medicine and Emergency medicine",
    "30223": "Anaesthesiology",
    "30224": "Radiology, nuclear medicine and medical imaging",
    "30225": "Allergy",
    "30226": "Rheumatology",
    "30227": "Geriatrics and gerontology",
    "30229": "Integrative and complementary medicine (alternative practice systems)",
    "30230": "Other clinical medicine subjects",
    # 303 - Health sciences
    "30301": "Social biomedical sciences (includes family planning, sexual health, psycho-oncology, political and social effects of biomedical research)",
    "30302": "Epidemiology",
    "30303": "Infectious Diseases",
    "30304": "Public and environmental health",
    "30305": "Occupational health",
    "30306": "Sport and fitness sciences",
    "30307": "Nursing",
    "30308": "Nutrition, Dietetics",
    "30309": "Tropical medicine",
    "30310": "Parasitology",
    "30311": "Medical ethics",
    "30312": "Substance abuse",
    # 304 - Medical biotechnology
    "30401": "Health-related biotechnology",
    "30402": "Technologies involving the manipulation of cells, tissues, organs or the whole organism (assisted reproduction)",
    "30403": "Technologies involving identifying the functioning of DNA, proteins and enzymes and how they influence the onset of disease and maintenance of wellbeing (gene-based diagnostics and therapeutic interventions (pharmacogenomics, gene-based therapeutics)",
    "30404": "Biomaterials (as related to medical implants, devices, sensors)",
    "30405": "Medical biotechnology related ethics",
    # 305 - Other medical sciences
    "30501": "Forensic science, Other medical science",
    # 401 - Agriculture, Forestry and Fisheries
    "40101": "Agriculture",
    "40102": "Forestry",
    "40103": "Fishery",
    "40104": "Soil science",
    "40105": "Horticulture, viticulture",
    "40106": "Agronomy, plant breeding and plant protection",
    # 402 - Animal and Dairy science
    "40201": "Animal and dairy science; excluding Animal biotechnology",
    "40202": "Pets",
    "40203": "Husbandry",
    # 403 - Veterinary science
    "40301": "Veterinary science",
    # 404 - Agricultural biotechnology
    "40401": "Agricultural biotechnology and food biotechnology",
    "40402": "GM technology (crops and livestock), livestock cloning, marker assisted selection, diagnostics (DNA chips and biosensing devices for the early/accurate detection of diseases) biomass feedstock production technologies, biopharming",
    "40403": "Agricultural biotechnology related ethics",
    # 501 - Psychology and cognitive sciences
    "50101": "Psychology (including human - machine relations)",
    "50102": "Psychology, special (including therapy for learning, speech, hearing, visual and other physical and mental disabilities);",
    "50103": "Cognitive sciences",
    # 502 - Economics and Business
    "50201": "Economic Theory",
    "50202": "Applied Economics, Econometrics",
    "50203": "Industrial relations",
    "50204": "Business and management",
    "50205": "Accounting",
    "50206": "Finance",
    # 503 - Education
    "50301": "Education, general; including training, pedagogy, didactics [and education systems]",
    "50302": "Education, special (to gifted persons, those with learning disabilities)",
    # 504 - Sociology
    "50401": "Sociology",
    "50402": "Demography",
    "50403": "Social topics (Women´s and gender studies; Social issues; Family studies; Social work)",
    "50404": "Anthropology, ethnology",
    # 505 - Law
    "50501": "Law",
    "50502": "Criminology, penology",
    # 506 - Political science
    "50601": "Political science",
    "50602": "Public administration",
    "50603": "Organisation theory",
    # 507 - Social and economic geography
    "50701": "Cultural and economic geography",
    "50702": "Urban studies (planning and development)",
    "50703": "Transport planning and social aspects of transport; excluding transport engineering",
    "50704": "Environmental sciences (social aspects)",
    # 508 - Media and communications
    "50801": "Journalism",
    "50802": "Media and socio-cultural communication",
    "50803": "Information science (social aspects)",
    "50804": "Library science",
    # 509 - Other social sciences
    "50901": "Other social sciences",
    # 601 - History and Archaeology
    "60101": "History; except history of science and technology and history of specific sciences",
    "60102": "Archaeology",
    # 602 - Languages and Literature
    "60201": "General language studies",
    "60202": "Specific languages",
    "60203": "Linguistics",
    "60204": "General literature studies",
    "60205": "Literary theory",
    "60206": "Specific literatures",
    # 603 - Philosophy, Ethics and Religion
    "60301": "Philosophy, History and Philosophy of science and technology",
    "60302": "Ethics (except ethics related to specific subfields)",
    "60303": "Theology",
    "60304": "Religious studies",
    # 604 - Arts
    "60401": "Arts, Art history",
    "60402": "Architectural design",
    "60403": "Performing arts studies (Musicology, Theater science, Dramaturgy)",
    "60404": "Folklore studies",
    "60405": "Studies on Film, Radio and Television",
    # 605 - Other Humanities and the Arts
    "60500": "Other Humanities and the Arts",
}

In [6]:
def get_subclasses(ident):
    """Return the labels of all detailed FORD entries under one FORD class.

    Only the first three characters of ``ident`` are compared, against the first
    three characters of every key in ``detailed_ford_categories``. The matching
    labels are joined with ", " in the dictionary's insertion order; an empty
    string is returned when nothing matches.
    """
    prefix = ident[:3]
    labels = []
    for code, label in detailed_ford_categories.items():
        if code[:3] == prefix:
            labels.append(label)
    return ", ".join(labels)

'Agricultural biotechnology and food biotechnology, GM technology (crops and livestock), livestock cloning, marker assisted selection, diagnostics (DNA chips and biosensing devices for the early/accurate detection of diseases) biomass feedstock production technologies, biopharming, Agricultural biotechnology related ethics'

In [7]:
import os

# Take the Hugging Face token from the environment instead of pasting it into the
# notebook, so it cannot leak when the file is shared. Falls back to the previous
# default (an empty string) when the variable is not set.
HUGGING_FACE_API_KEY = os.environ.get("HUGGING_FACE_API_KEY", "")  # do not share

# Fixed seed (via transformers.set_seed) so repeated runs are comparable.
set_seed(42)

2024-04-21 19:23:25.022291: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Load the input table; the generation loops below read its "title" and "abstract"
# columns row by row and append their results to this same frame.
df = pd.read_csv("preprocessed.csv")

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face Hub repository that both the weights and the tokenizer come from.
model_id = "TheBloke/Llama-2-13B-chat-GPTQ"

# Weights are taken from the "main" revision; no custom code from the repository
# is executed (trust_remote_code=False) and device placement is left to "auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=False,
    revision="main",
)

# Fast tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

In [10]:
def parse_output(output, category):
    """Extract the value stored under ``category`` from JSON embedded in model output.

    The text is scanned for ``{...}`` candidates (shortest match, so nested objects
    are not supported). Each candidate is tried in order and the first one that is
    valid JSON and contains ``category`` wins.

    Args:
        output: Raw text produced by the model.
        category: Key to look up in the parsed JSON object.

    Returns:
        The value stored under ``category``, or ``None`` when no candidate is valid
        JSON containing that key. In the ``None`` case the raw output is printed so
        the failure can be inspected.
    """
    # Raw string: "\{" and "\s" are invalid escapes in a normal string literal.
    # No newline is required after the opening brace, so single-line answers such
    # as {"rigor": "high"} are accepted as well as the multi-line form.
    pattern = r"\{[\s\S]*?\}"
    for match in re.finditer(pattern, output):
        try:
            feature_dict = json.loads(match.group())
        except json.JSONDecodeError:
            # Not valid JSON (e.g. the model wrote prose in braces); try the next one.
            continue
        if category in feature_dict:
            return feature_dict[category]
    # Nothing usable: report the raw text instead of aborting a long generation loop.
    print(output)
    return None

In [17]:
def generate_features(title, abstract, category=None, discipline=None):
    """Ask the chat model to rate the rigor of an abstract and return its answer.

    Args:
        title: Accepted for call compatibility; not used in the prompt.
        abstract: Abstract text inserted into the prompt.
        category: JSON key to extract from the model's answer via ``parse_output``
            (used only when ``discipline`` is falsy).
        discipline: When truthy, the raw answer is returned with spaces, newlines,
            the ``</s>`` marker and periods removed instead of being JSON-parsed.
            Its value is not inserted into the prompt.

    Returns:
        The cleaned raw answer string (``discipline`` given), otherwise whatever
        ``parse_output`` returns for ``category``.
    """
    # Llama-2 chat format: the system prompt is closed with <</SYS>>.
    prompt_template=f'''[INST] <<SYS>>
    You are a categorization assistant. Your job will be to assign a certain characteristic to a research paper based on its abstract. 
    In this instance you will decide the rigor of the logic presented in the abstract.
    Your job will be to assess the methodological soundness and the quality of logic provided in the abstract.
    You will choose between three levels of rigor: low, medium and high. Low being the least rigorous and high being the most. 
    Be concise, do not provide any explanations. Your answer will consist of an answer in json format like so:
    {{
        "rigor": "value"
    }}
    <</SYS>>
    Abstract to be evaluated: 
    {abstract}[/INST]
    Response:
    '''
    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
    output = model.generate(inputs=input_ids, do_sample=False, max_new_tokens=256)
    # generate() returns the prompt tokens followed by the new tokens. Decode only the
    # new ones: splitting the decoded text on "Response:" picks the wrong segment
    # whenever the abstract itself contains that word.
    generated_ids = output[0][input_ids.shape[-1]:]
    features = tokenizer.decode(generated_ids)
    if discipline:
        features = features.replace(" ", "").replace("\n", "").replace("</s>", "")
        features = features.replace(".", "")
    else:
        features = parse_output(features, category)
    del generated_ids
    del input_ids
    del output
    torch.cuda.empty_cache()  # Clear GPU memory
    return features

In [12]:
def generate_dummies(disciplines):
    """Add one column per discipline to the global ``df`` and checkpoint to CSV.

    For every name in ``disciplines`` each row's title and abstract are sent to
    ``generate_features`` with ``discipline=<name>``; the answers become a new
    column named after the discipline. ``disciplines_binary.csv`` is rewritten
    after each discipline finishes, so partial results survive an interruption.
    """
    for discipline in disciplines:
        answers = []
        for idx, record in df.iterrows():
            answers.append(
                generate_features(record["title"], record["abstract"], discipline=discipline)
            )
            # Progress report on every 50th index label.
            if idx % 50 != 0:
                continue
            print(f"Done {idx/df.shape[0]*100}%")
        df[discipline] = answers
        print(f"Done {discipline}!")
        df.to_csv("disciplines_binary.csv")

In [13]:
def generate_category(category):
    """Fill a ``category`` column on the global ``df`` and write ``<category>.csv``.

    Each row's title and abstract are passed to ``generate_features``; the answers
    are stored in a new column named ``category`` and the whole frame is saved.

    NOTE(review): ``category`` is forwarded as ``discipline=``, so
    ``generate_features`` returns its cleaned raw text rather than the value
    extracted by ``parse_output`` - confirm this is intended.
    """
    answers = []
    for idx, record in df.iterrows():
        answers.append(
            generate_features(record["title"], record["abstract"], discipline=category)
        )
        # Progress report on every 50th index label.
        if idx % 50 != 0:
            continue
        print(f"Done {idx/df.shape[0]*100}%")
    df[category] = answers
    df.to_csv(f"{category}.csv")

In [18]:
generate_category("rigor")

Done 0.0%
Done 37.59398496240601%
Done 75.18796992481202%
