<a href="https://colab.research.google.com/github/thomouvic/CSC502/blob/main/pyspark_medline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysis of Medline Data

**To be run on Google Colab**.

**The following cell can take a long time (about 3 minutes).** Execute it only once per session.  

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop2.tgz
!tar xf spark-3.3.2-bin-hadoop2.tgz

In [2]:
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop2"

import findspark
findspark.init("spark-3.3.2-bin-hadoop2")# SPARK_HOME

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [14]:
!wget https://raw.githubusercontent.com/thomouvic/CSC502/main/data/mesh_terms.txt -O mesh_terms.txt
medline_raw = sc.textFile("mesh_terms.txt");

medline_lists = medline_raw.map(lambda line: line.split("|"))
print(medline_lists.take(5))

topics = medline_lists.flatMap(lambda topiclist: topiclist)
print(topics.take(5))

--2024-03-05 23:02:23--  https://raw.githubusercontent.com/thomouvic/CSC502/main/data/mesh_terms.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4778319 (4.6M) [text/plain]
Saving to: ‘mesh_terms.txt’


2024-03-05 23:02:23 (56.1 MB/s) - ‘mesh_terms.txt’ saved [4778319/4778319]

[['Intellectual Disability', 'Maternal-Fetal Exchange', 'Pregnancy Complications'], ['Amniocentesis', 'Research'], ['Cesarean Section', 'General Surgery', 'Hysterectomy', 'Sterilization, Reproductive'], ['Animals, Laboratory', 'Eugenics'], ['Aftercare', 'Anesthesia', 'Cesarean Section', 'Contraception', 'Follow-Up Studies', 'Mortality', 'Postpartum']]
['Intellectual Disability', 'Maternal-Fetal Exchange', 'Pregnancy Complications', 'Amniocentesis', 'Research']


In [15]:
# Count occurrences of every topic: emit (topic, 1) for each occurrence and
# add up the ones per key.
topic_cnt = (
    topics
    .map(lambda term: (term, 1))
    .reduceByKey(lambda left, right: left + right)
)
print(topic_cnt.take(100))

[('Intellectual Disability', 99), ('Maternal-Fetal Exchange', 60), ('Amniocentesis', 24), ('Research', 1646), ('Cesarean Section', 116), ('Sterilization, Reproductive', 101), ('Animals, Laboratory', 71), ('Anesthesia', 482), ('Mortality', 349), ('Postpartum', 1), ('Adrenalectomy', 62), ('Breast Neoplasms', 190), ('Endocrine Surgical Procedures', 7), ('Bone Neoplasms', 76), ('Spinal Cord Neoplasms', 25), ('Spine', 82), ('Aorta', 62), ('Aortic Coarctation', 4), ('Gangrene', 28), ('Isotonic Solutions', 10), ('Esophageal Diseases', 29), ('Breast Feeding', 176), ('Congenital Abnormalities', 309), ('Kaposi Varicelliform Eruption', 3), ('Gallbladder Diseases', 32), ('Bladder', 1), ('Arteries', 141), ('Disease', 1349), ('Embolectomy', 4), ('Femoral Artery', 41), ('Heart Diseases', 138), ('Asthma', 41), ('Dyspnea, Paroxysmal', 1), ('Hearing', 96), ('Hearing Tests', 97), ('Leeches', 18), ('Leeching', 3), ('Exercise Therapy', 74), ('Radioisotopes', 161), ('Carcinogenesis', 21), ('Kidney Diseases'

In [16]:
# Swap each (topic, count) pair to (count, topic), then gather all topics
# that share the same count under a single key.
cnt_topicList = (
    topic_cnt
    .map(lambda pair: (pair[1], pair[0]))
    .groupByKey()
)

# Turn every group into a plain list so the preview prints its contents.
print(cnt_topicList.mapValues(list).take(5))

[(60, ['Maternal-Fetal Exchange', 'Urologic Surgical Procedures', 'Accident Prevention', 'Obesity', 'Human Body', 'Ethics, Professional', 'Outcome Assessment (Health Care)', 'Mammals', 'Lymphocytes', 'Publishing', 'Artifacts', 'Education, Medical, Undergraduate', 'Protein Structure, Secondary']), (24, ['Amniocentesis', 'Veterans', 'Administration, Oral', 'Conjunctiva', 'Precancerous Conditions', 'Gastric Mucosa', 'Explosions', 'Flavonoids', 'Injections, Intravenous', 'Social Desirability', 'Fires', 'Folic Acid', 'Thigh', 'Anesthesia, Endotracheal', 'Peritoneum', 'Felis', 'Molar', 'Pseudomonas Infections', 'Poliovirus', 'Glomerulonephritis', 'Root Canal Filling Materials', 'Sunlight', 'Child, Hospitalized', 'Immunologic Memory', 'Lipoma', 'Smoke', 'Archives', 'Dangerous Behavior', 'Thermodynamics', 'Length of Stay', 'Dentures', 'Neoplasms, Radiation-Induced', 'Persistent Vegetative State', 'Antigens, Viral', 'Microspheres', 'Dominance, Cerebral', 'Dental Health Surveys', 'Mother-Child R

In [17]:
# Order the (count, topics) groups from the highest count down
# and display the first 100 with each group shown as a list.
cnt_topicList_sorted = cnt_topicList.sortByKey(ascending=False)
cnt_topicList_sorted.mapValues(list).take(100)

[(1646, ['Research']),
 (1349, ['Disease']),
 (1131, ['Neoplasms']),
 (1061, ['Tuberculosis']),
 (814, ['Public Policy']),
 (793, ['Jurisprudence']),
 (758, ['Demography']),
 (741, ['Population Dynamics']),
 (688, ['Economics']),
 (687, ['Medicine']),
 (652, ['Socioeconomic Factors']),
 (639, ['Blood']),
 (626, ['Politics']),
 (587, ['Emigration and Immigration']),
 (574, ['Social Change']),
 (556, ['Physicians']),
 (539, ['Mutation']),
 (496, ['Abortion, Induced']),
 (482, ['Anesthesia']),
 (477, ['Hospitals']),
 (472, ['Public Health']),
 (469, ['Algorithms']),
 (467, ['Models, Biological']),
 (466, ['Ethnic Groups']),
 (442, ['Electrocardiography']),
 (431, ['Wounds and Injuries']),
 (429, ['Women']),
 (425, ['Diet']),
 (423, ['Electroencephalography']),
 (417, ['Biomedical Research', 'Evaluation Studies as Topic']),
 (415, ['Employment']),
 (397, ['Delivery of Health Care']),
 (390, ["Women's Rights"]),
 (385, ['Fertility']),
 (384, ['Vaccination', 'Attitude']),
 (376, ['Models, Th

In [18]:
# Frequency of counts: an RDD of integer pairs (cnt, freq).
# For example, (5, 10) says that 10 distinct topics each occur 5 times.
# The frequency is simply the size of the topic group stored under each count.
cnt_freq = cnt_topicList.mapValues(len)
cnt_freq.collect()

[(60, 13),
 (24, 92),
 (1646, 1),
 (116, 7),
 (482, 1),
 (62, 31),
 (190, 1),
 (76, 9),
 (82, 11),
 (4, 950),
 (28, 89),
 (10, 294),
 (176, 4),
 (32, 62),
 (138, 7),
 (96, 9),
 (18, 152),
 (74, 14),
 (90, 7),
 (8, 380),
 (212, 2),
 (20, 132),
 (78, 15),
 (192, 4),
 (22, 106),
 (556, 1),
 (306, 4),
 (166, 2),
 (52, 30),
 (6, 595),
 (110, 4),
 (14, 200),
 (26, 78),
 (228, 1),
 (30, 59),
 (48, 30),
 (40, 44),
 (124, 5),
 (56, 15),
 (142, 4),
 (42, 35),
 (98, 10),
 (44, 17),
 (88, 11),
 (282, 1),
 (150, 6),
 (248, 4),
 (46, 33),
 (16, 176),
 (104, 7),
 (120, 11),
 (114, 4),
 (252, 2),
 (158, 2),
 (156, 6),
 (94, 15),
 (64, 26),
 (2, 2185),
 (68, 17),
 (472, 1),
 (122, 9),
 (184, 2),
 (36, 45),
 (164, 6),
 (206, 1),
 (136, 5),
 (50, 35),
 (66, 16),
 (58, 20),
 (70, 15),
 (12, 227),
 (304, 1),
 (72, 13),
 (84, 12),
 (246, 2),
 (38, 35),
 (174, 5),
 (34, 45),
 (182, 2),
 (222, 1),
 (204, 1),
 (80, 10),
 (160, 1),
 (334, 1),
 (238, 1),
 (140, 4),
 (196, 4),
 (102, 9),
 (132, 6),
 (54, 26),
 (2