In [4]:
import csv
import json
# from unittest import skip
import pandas as pd
import hardcoded

class Person:
    """A single member record built from the AAP and/or ASCI rosters.

    Constructor arguments are stored on attributes of the same name.  The
    list-valued arguments (``affiliation``, ``specialization_original``,
    ``specialization_modified``, ``organization``) default to ``None`` and
    are replaced by a fresh empty list per instance, so that appending to
    one person's list can never leak into another person's record.

    ``umbrella_aff``, ``related_aff``, ``umbrella_spec`` and ``related_spec``
    always start empty and are filled through the ``set_*`` methods.
    """

    def __init__(self,
                 first_name,
                 middle_name,
                 last_name,
                 year=None,
                 phone=None,
                 email=None,
                 affiliation=None,
                 specialization_original=None,
                 specialization_modified=None,
                 memoriam=False,
                 unactive=False,
                 organization=None,
                 email_affiliation=None,
                 id_num=None,
                 kumu_num=None):
        self.year = year
        self.first_name = first_name
        self.middle_name = middle_name
        self.last_name = last_name
        self.phone = phone
        self.email = email
        # A list passed in by the caller is stored as-is (not copied);
        # only a missing argument gets a new private list.
        self.affiliation = [] if affiliation is None else affiliation
        self.specialization_original = (
            [] if specialization_original is None else specialization_original)
        self.specialization_modified = (
            [] if specialization_modified is None else specialization_modified)
        self.memoriam = memoriam
        self.unactive = unactive
        self.organization = [] if organization is None else organization
        self.email_affiliation = email_affiliation
        self.umbrella_aff = []
        self.related_aff = []
        self.umbrella_spec = []
        self.related_spec = []
        self.id_num = id_num
        self.kumu_num = kumu_num

    def __str__(self):
        """Return the short one-line form: ``Person: <last>, <first>``."""
        return f'Person: {self.last_name}, {self.first_name}\n'

    @staticmethod
    def _joined_or_none(values):
        """Return ``values`` joined with ``", "``, or ``None`` if it is empty."""
        return ", ".join(values) if len(values) > 0 else None

    def description(self):
        """Return the long multi-line, human-readable form of the record.

        Empty lists are rendered as ``None`` except for the organization and
        email-affiliation lines, which render as an empty string.
        """
        email_affs = (self.email_affiliation
                      if self.email_affiliation is not None else [])
        lines = [
            f'Name: {self.last_name}, {self.first_name}',
            f'Middle name: {self.middle_name if self.middle_name != "" else None}',
            f'Phone: {self.phone}',
            f'Email: {self.email}',
            f'Affiliation(s): {self._joined_or_none(self.affiliation)}',
            f'Original specialization(s): {self._joined_or_none(self.specialization_original)}',
            # NOTE(review): "pecialization" is a typo in the emitted label; it
            # is left unchanged so existing output files keep the same format.
            f'Modified pecialization(s): {self._joined_or_none(self.specialization_modified)}',
            f'Organization(s): {", ".join(self.organization)}',
            # The trailing space after the year is part of the original format.
            f'Year joined: {self.year} ',
            f'In memoriam: {self.memoriam}',
            f'No longer an active member: {self.unactive}',
            f'Affiliation(s) from email: {", ".join(email_affs)}',
            f'ID: {self.id_num}',
            f'Umbrella institution(s): {self._joined_or_none(self.umbrella_aff)}',
            f'Related institution(s): {self._joined_or_none(self.related_aff)}',
            f'Umbrella specializations(s): {self._joined_or_none(self.umbrella_spec)}',
            f'Related specializations(s): {self._joined_or_none(self.related_spec)}',
        ]
        return "\n".join(lines)

    def kumu_tag(self):
        """Return every tag of this person joined with ``|``.

        Tags are, in order: affiliations, modified specializations,
        organizations, ``Year: <year>`` (when a year is set), email
        affiliations, umbrella affiliations and umbrella specializations.
        Returns an empty string when there are no tags.
        """
        tags = []
        tags.extend(self.affiliation or [])
        tags.extend(self.specialization_modified or [])
        tags.extend(self.organization)
        if self.year is not None:
            tags.append(f'Year: {self.year}')
        tags.extend(self.email_affiliation or [])
        tags.extend(self.umbrella_aff or [])
        tags.extend(self.umbrella_spec or [])
        return "|".join(tags)

    def make_list(self):
        """Return all fields as a flat list, in the column order of the CSV export."""
        return [self.year,
                self.first_name,
                self.middle_name,
                self.last_name,
                self.phone,
                self.email,
                self.affiliation,
                self.specialization_original,
                self.specialization_modified,
                self.memoriam,
                self.unactive,
                self.organization,
                self.email_affiliation,
                self.umbrella_aff,
                self.related_aff,
                self.umbrella_spec,
                self.related_spec,
                self.id_num,
                self.kumu_num,
                ]

    def get_name(self, no_mn=False):
        """Return ``"<last>, <first>"`` with optional middle name and kumu number.

        The middle name is appended as ``" (<middle>)"`` unless ``no_mn`` is
        true or the middle name is the empty string.  When ``kumu_num`` is
        set, ``" |<kumu_num>"`` is appended last.
        """
        name = f'{self.last_name}, {self.first_name}'
        if not no_mn and self.middle_name != '':
            name += f' ({self.middle_name})'
        if self.kumu_num is not None:
            name += f' |{self.kumu_num}'
        return name

    def set_organization(self, new_org, add=True):
        """Append ``new_org`` (default) or replace the list with ``[new_org]``."""
        if add:
            self.organization.append(new_org)
        else:
            self.organization = [new_org]

    def set_id_num(self, new_id_num):
        """Set the record identifier."""
        self.id_num = new_id_num

    def set_kumu_num(self, new_num):
        """Set the number that ``get_name`` appends after ``|``."""
        self.kumu_num = new_num

    def set_specialization_original(self, new_spec, add=True):
        """Append ``new_spec`` (default) or replace the whole value with it.

        With ``add=False`` the attribute becomes ``new_spec`` itself, so the
        caller is expected to pass a list in that case.
        """
        if add:
            self.specialization_original.append(new_spec)
        else:
            self.specialization_original = new_spec

    def remove_specialization_original(self, spec_to_remove):
        """Remove one entry; raises ``ValueError`` if it is not present."""
        self.specialization_original.remove(spec_to_remove)

    def set_specialization_modified(self, new_spec, add=True):
        """Append ``new_spec`` (default) or replace the whole value with it."""
        if add:
            self.specialization_modified.append(new_spec)
        else:
            self.specialization_modified = new_spec

    def remove_specialization_modified(self, spec_to_remove):
        """Remove one entry; raises ``ValueError`` if it is not present."""
        self.specialization_modified.remove(spec_to_remove)

    def set_affiliation(self, new_aff, add=True):
        """Append ``new_aff`` (default) or replace the whole value with it."""
        if add:
            self.affiliation.append(new_aff)
        else:
            self.affiliation = new_aff

    def set_umbrella_aff(self, aff):
        """Add ``aff`` to the umbrella affiliations unless already present."""
        if aff not in self.umbrella_aff:
            self.umbrella_aff.append(aff)

    def set_related_aff(self, aff):
        """Add ``aff`` to the related affiliations unless already present."""
        if aff not in self.related_aff:
            self.related_aff.append(aff)

    @staticmethod
    def _add_unique(target, spec, caller):
        """Append a str, or each item of a list, to ``target`` skipping duplicates.

        Raises:
            TypeError: if ``spec`` is neither a ``str`` nor a ``list``.
                ``TypeError`` derives from ``Exception``, so callers that
                catch ``Exception`` keep working.
        """
        if isinstance(spec, str):
            items = [spec]
        elif isinstance(spec, list):
            items = spec
        else:
            raise TypeError(f"pass a non-str and non-list item in {caller}")
        for item in items:
            if item not in target:
                target.append(item)

    def set_umbrella_spec(self, spec):
        """Add one specialization (str) or several (list) to ``umbrella_spec``."""
        self._add_unique(self.umbrella_spec, spec, "set_umbrella_spec")

    def set_related_spec(self, spec):
        """Add one specialization (str) or several (list) to ``related_spec``."""
        self._add_unique(self.related_spec, spec, "set_related_spec")

    def __lt__(self, other):
        """Order people by last name followed by first name.

        NOTE(review): the key is the plain concatenation ``last + first``,
        which is not always the same order as comparing
        ``(last_name, first_name)`` tuples.  It is kept unchanged because the
        sort order determines the generated id numbers.
        """
        return self.last_name + self.first_name < other.last_name + other.first_name

# Load the AAP roster and turn every row of its 'Name' column into a Person.
aap_excel = pd.read_excel(r'aap.xlsx')
aap_data = pd.DataFrame(aap_excel, columns=['Name']).values.tolist()
# WARNING: aap data have 3 pairs of duplicates
aap_list = []

for row in aap_data:
    # The name is split on single spaces: first word -> first name,
    # last word -> last name, everything in between -> middle name.
    # This will not always yield the true middle name, but it is the best
    # that can be done without fixing names by hand.
    words = row[0].split(' ')
    aap_list.append(Person(first_name=words[0].strip(),
                           middle_name=" ".join(words[1:-1]).strip(),
                           last_name=words[-1].strip(),
                           organization=["AAP"]))

# Person defines __lt__, so this orders the list by last name + first name.
aap_list.sort()
# for person in aap_list:
#     print(person)

# importing asci data + making Person instances for asci members
asci_excel = pd.read_excel(r'asci.xlsx')
asci_data = pd.DataFrame(asci_excel, columns=['year-link', 'name-link',
                                              'Phone number', 'Email', 'Institutional affiliation',
                                              'Specialties', 'In Memoriam', 'No Longer Active Member']).values.tolist()

asci_list = []

# (first name, last name, year) -> Person already stored in asci_list.
# The scraped sheet can contain several rows for the same member; this index
# finds the earlier record in O(1) instead of rescanning asci_list per row.
_asci_index = {}

# Capitalisation fixes applied to the 'Specialties' column.
_SPEC_RENAMES = {
    'Diagnostic radiology': 'Diagnostic Radiology',
    'Obstetrics and gynecology': 'Obstetrics and Gynecology',
}

for p in asci_data:
    year = p[0][:4]
    name = p[1].split(',')
    ln = name[0].strip()  # all that is before comma
    given = name[1].split()
    fn = given[0]  # first word after comma
    # remaining words after the comma, with any parentheses stripped
    mn = " ".join(given[1:]).replace('(', '').replace(')', '')

    # keep only the digits of the phone cell
    _phone = ''.join([d for d in str(p[2]) if d in '0123456789'])
    # fewer than 10 digits is treated as an accidentally scraped number.
    # NOTE(review): a rejected phone is stored as False while a missing email
    # is stored as None; left unchanged to keep the exported data stable.
    phone = _phone if len(_phone) >= 10 else False

    email = p[3] if '@' in str(p[3]) else None

    # pd.isna is also True for None, so no separate None check is needed
    affiliation = [p[4].strip()] if not pd.isna(p[4]) else []

    # normalise capitalisation in place (asci_data keeps the fixed value)
    p[5] = _SPEC_RENAMES.get(p[5], p[5])
    spec = p[5]
    # A specialty cell is ignored when it is missing, is 100+ characters long
    # or contains '19' / '20' (presumably year-like scraped text - confirm).
    spec_usable = not (pd.isna(spec) or len(spec) >= 100
                       or '19' in spec or '20' in spec)
    # original spec without hardcoded spec list
    spec_original = [spec] if spec_usable else []
    # spec restricted to the hardcoded spec list
    spec_modified = [spec] if spec_usable and spec in hardcoded.spec_list else []

    memoriam = p[6] if not pd.isna(p[6]) else False
    unactive = p[7] if not pd.isna(p[7]) else False

    # Domain labels between '@' and the last '.', e.g. 'a@wi.mit.edu' -> ['wi', 'mit']
    if email is not None:
        email_affiliation = email[email.index('@') + 1:email.rindex('.')].split('.')
    else:
        email_affiliation = None

    key = (fn, ln, year)
    person = _asci_index.get(key)
    if person is not None:
        # Same member seen before: only merge the list-valued fields of this
        # row into the existing record.
        for s in spec_original:
            person.set_specialization_original(s)
        for s in spec_modified:
            person.set_specialization_modified(s)
        for aff in affiliation:
            person.set_affiliation(aff)
    else:
        person = Person(fn, mn, ln, year, phone, email,
                        affiliation, spec_original, spec_modified, memoriam,
                        unactive, ["ASCI"], email_affiliation)
        asci_list.append(person)
        _asci_index[key] = person


# Merge the two rosters.  After sorting, people with the same last+first name
# are neighbours, and because the sort is stable and AAP members were placed
# first, an AAP record precedes the ASCI record of the same name.
_combine_list = (aap_list + asci_list)
_combine_list.sort()
combine_list = []

for p1, p2 in zip(_combine_list, _combine_list[1:]):
    if (p1.get_name(True) == p2.get_name(True)
            and p1.organization == ['AAP'] and p2.organization == ['ASCI']):
        # Same person in both rosters: keep the ASCI record (it carries more
        # fields), mark it as belonging to AAP too, and drop the AAP record.
        p2.set_organization('AAP')
        combine_list.append(p2)
    elif not combine_list or combine_list[-1] is not p1:
        # p1 can only be in the result already if it was appended as the
        # merged p2 of the previous pair, i.e. if it is the last element.
        combine_list.append(p1)

# The pairwise loop only ever appends the second element of a pair when it is
# merged, so the final person must be added explicitly unless the last pair
# merged it.  This also covers a list holding a single person.
if _combine_list and (not combine_list
                      or combine_list[-1] is not _combine_list[-1]):
    combine_list.append(_combine_list[-1])

# UMBRELLA/RELATED SPEC/AFFILIATION
# For every ASCI member, derive umbrella and related specializations and
# affiliations from the lookup tables in the `hardcoded` module.
_sup_specs = hardcoded.sup_spec_list
_spec_to_umbrella = hardcoded.reverse_spec_dict
_umbrella_to_specs = hardcoded.spec_dict
_aff_to_umbrella = hardcoded.reverse_aff_dict
_umbrella_to_affs = hardcoded.aff_dict

for member in asci_list:

    for spec in member.specialization_modified:
        # a specialization that is itself an umbrella is recorded directly
        if spec in _sup_specs:
            member.set_umbrella_spec(spec)
        # a sub-specialization contributes the umbrella it maps to
        if spec in _spec_to_umbrella:
            member.set_umbrella_spec(_spec_to_umbrella[spec])

    # every specialization listed under one of the member's umbrellas is related
    for umbrella in member.umbrella_spec:
        for related in _umbrella_to_specs[umbrella]:
            member.set_related_spec(related)

    # an affiliation found in the reverse table contributes its umbrella
    for aff in member.affiliation:
        if aff in _aff_to_umbrella:
            member.set_umbrella_aff(_aff_to_umbrella[aff])

    # every affiliation listed under one of the member's umbrellas is related
    for umbrella in member.umbrella_aff:
        for related in _umbrella_to_affs[umbrella]:
            member.set_related_aff(related)


# display name -> how many people with that name have been seen so far
unique_name_dict = {}

# =============MAKING ID NUM IN ALPHABETICAL ORDER=================
# Ids are "P1", "P2", ... in list order.  The first holder of a name gets no
# kumu number; each later holder gets the running count (2, 3, ...).
id_num_counter = 0
for id_num_counter, member in enumerate(combine_list, start=1):
    member.set_id_num(f"P{id_num_counter}")
    label = member.get_name()
    occurrences = unique_name_dict.get(label, 0) + 1
    unique_name_dict[label] = occurrences
    if occurrences > 1:
        member.set_kumu_num(occurrences)


# WRITING INTO A NEW FILE WITH CLEAN DATA:)
# newline='' is required by the csv module: without it the writer's own
# '\r\n' line endings are translated again on Windows, which leaves an empty
# line after every row.
with open('asci_aap_dataUpdated.csv', 'w', encoding="utf-8", newline='') as f:
    writer = csv.writer(f)

    # header row; the order matches Person.make_list()
    writer.writerow(['year',
                     'first_name',
                     'middle_name',
                     'last_name',
                     'phone',
                     'email',
                     'affiliation',
                     'original specialization',
                     'modified specialization',
                     'memoriam',
                     'unactive',
                     'organization',
                     'email_affiliation',
                     'umbrella_aff',
                     'related_aff',
                     'umbrella_spec',
                     'related_spec',
                     'id_num',
                     'kumu_num'])

    # one row per person; the `with` block closes the file on exit
    writer.writerows(person.make_list() for person in combine_list)
    
    
    
# counter = 0
# for p in combine_list:
#     print(p.get_name(True))

#     if p.get_email() != None and p.get_affiliation() == []:
#         counter += 1
#         print(p.get_email())
# print(counter)
# print(len(combine_list))

# Dump the long description of every person, each followed by a blank line.
with open('asci_aap_data_reader_friendly.txt', 'w', encoding="utf-8") as f:
    f.writelines(p.description() + "\n\n" for p in combine_list)


# FINDING THE DISTRIBUTION OF SPECIALITIES IF LISTED
# Counts how often each specialization from hardcoded.aamc_spec_list occurs in
# people's original specializations, and reports anyone matching more than one.
aamc_spec_dict = {}

for member in combine_list:
    matched = [spec for spec in member.specialization_original
               if spec in hardcoded.aamc_spec_list]
    for spec in matched:
        aamc_spec_dict[spec] = aamc_spec_dict.get(spec, 0) + 1
    if len(matched) > 1:
        print('SPEC COUNT > 1:', len(matched), member.specialization_original)

for spec_name, total in aamc_spec_dict.items():
    print(f'{spec_name}: {total}')


# PRIMARY, SECONDARY, ... SPECIALITIES DISTRIBUTION
# {'Internal Medicine': [num primary occurance, num secondary occurance, ...]}
# Position i of a list counts how often the specialization appeared as the
# i-th entry of someone's modified specializations.  Every list starts with 8
# slots and is lengthened on demand, so a person with more than 8
# specializations no longer raises IndexError.
spec_freq_dict = {}

for p in combine_list:
    for index, s in enumerate(p.specialization_modified):
        counts = spec_freq_dict.setdefault(s, [0] * 8)
        if index >= len(counts):
            counts.extend([0] * (index + 1 - len(counts)))
        counts[index] += 1

# spec_list1 = spec_freq_dict.keys()
# for spec in spec_list1:
#     print(f'{spec}: {spec_freq_dict[spec]}')


# LONGEST THINGS COUNT
# (long_fn, long_mn, long_ln, long_em, max_num_aff, max_num_spec) = (0,)*6
# long_spec_name = len(max(hardcoded.spec_list))
# long_aff_name = len(max(hardcoded.aff_dict.keys()))

# for p in combine_list:
#     if len(p.first_name) > long_fn:
#         long_fn = len(p.first_name)
#     if len(p.middle_name) > long_mn:
#         long_mn = len(p.middle_name)
#     if len(p.last_name) > long_ln:
#         long_ln = len(p.last_name)
#     if p.email != None:
#         if len(p.email) > long_em:
#             long_em = len(p.email)
#     if len(p.affiliation) > max_num_aff:
#         max_num_aff = len(p.affiliation)
#     if len(p.specialization_modified) > max_num_spec:
#         max_num_spec = len(p.specialization_modified)


# print(long_fn, long_mn, long_ln, long_em,
#       max_num_aff, max_num_spec, long_aff_name, long_spec_name)

# Report how many people ended up in the combined list.
print(len(combine_list))

#conda create -n name_of_my_env python
#source activate name_of_my_env
#activate name_of_my_env
#conda install pandas
#conda install pandas=0.20.3


SPEC COUNT > 1: 2 ['Genetics', 'Immunology', 'Internal Medicine', 'Pediatrics', 'Hematology']
SPEC COUNT > 1: 2 ['Physiology', 'Metabolism', 'Neurology', 'Nephrology', 'Endocrinology', 'Neurobiology', 'Critical Care Medicine', 'Internal Medicine', 'Geriatrics']
SPEC COUNT > 1: 2 ['Internal Medicine', 'Obstetrics and Gynecology', 'Infectious Disease', 'Immunology']
SPEC COUNT > 1: 2 ['Neurology', 'Neurobiology', 'Virology', 'Pathology']
SPEC COUNT > 1: 2 ['Cell Biology', 'Endocrinology', 'Molecular Biology', 'Biochemistry', 'Dermatology', 'Internal Medicine']
SPEC COUNT > 1: 2 ['Biochemistry', 'Neurobiology', 'Metabolism', 'Geriatrics', 'Psychiatry', 'Neurology', 'Genetics']
SPEC COUNT > 1: 2 ['Nutrition', 'Internal Medicine', 'Biochemistry', 'Hepatology', 'Gastroenterology', 'Pathology']
SPEC COUNT > 1: 2 ['Oncology', 'Pathology', 'Immunology', 'Hematology', 'Pediatrics']
SPEC COUNT > 1: 3 ['Pediatrics', 'Immunology', 'Internal Medicine', 'Allergy', 'Pulmonology', 'Administration', 'De

In [9]:
#Ashley's attempt at converting csv to json file
#continued list of scientists:
# Hand-entered records that are appended to the people read from the CSV.
# NOTE(review): these entries use the key "original_specialization"
# (underscore) while the records built from CSV rows use
# "original specialization" (space); confirm which one consumers expect.
addedNames = [
    {
        "year": "2023",
        "first_name": "Hojun",
        "middle_name": "",
        "last_name": "Li",
        "phone": "617-324-4404",
        "email": "hojunli@mit.edu",
        "affiliation": ["MIT Koch Institute"],
        "original_specialization": [
            "Hematology",
            "Oncology",
            "Molecular Biology",
        ],
        "unactive": "False",
        "organization": ["MIT", "Boston Childrens Hospital"],
    },
    {
        "year": "2023",
        "first_name": "Tobiloba",
        "middle_name": "",
        "last_name": "Oni",
        "phone": "617-258-5125",
        "email": "tobioni@wi.mit.edu",
        "affiliation": ["MIT Koch Institute"],
        "original_specialization": [
            "Oncology",
            "Genes and Genomes",
            "Pancreatic Cancer",
        ],
        "unactive": "False",
        "organization": ["MIT", "Whitehead Institute"],
    },
    {
        "year": "2023",
        "first_name": "Salil",
        "middle_name": "",
        "last_name": "Garg",
        "phone": "617-715-4470",
        "email": "jstraehl@mit.edu",
        "affiliation": ["MIT Koch Institute"],
        "original_specialization": [
            "Laboratory Medicine",
            "Molecular Genetic Pathology",
            "Oncology",
        ],
        "unactive": "False",
        "organization": ["MIT", "Yale University"],
    },
]

              
              
              
              
# JSON key -> column position in asci_aap_dataUpdated.csv.
# Column 9 (the 'memoriam' column of the CSV header) is not exported.
_JSON_FIELDS = (
    ("year", 0),
    ("first_name", 1),
    ("middle_name", 2),
    ("last_name", 3),
    ("phone", 4),
    ("email", 5),
    ("affiliation", 6),
    ("original specialization", 7),
    ("modified specialization", 8),
    ("unactive", 10),
    ("organization", 11),
    ("email_affiliation", 12),
    ("umbrella_aff", 13),
    ("related_aff", 14),
    ("umbrella_spec", 15),
    ("related_spec", 16),
    ("id_num", 17),
    ("kumu_num", 18),
)

with open('asci_aap_dataUpdated.csv', 'r', encoding = 'utf-8') as csvFile:
    csvReader = csv.reader(csvFile)
    next(csvReader)  # skip the header row
    # Empty rows are skipped; every value stays the string read from the CSV.
    data = {"people": [
        {key: row[column] for key, column in _JSON_FIELDS}
        for row in csvReader
        if row
    ]}
    # the hand-entered records go after the people read from the file
    data["people"].extend(addedNames)


with open('asci_aap_dataJSONUpdated.json', 'w', encoding="utf-8") as jsonFile:
    json.dump(data, jsonFile, indent=4)
