# LTI faculty

In [73]:
import pandas as pd
urls = [r'https://lti.cs.cmu.edu/directory/all/154/1?page=0', r'https://lti.cs.cmu.edu/directory/all/154/1?page=1']
datas = []
for url in urls:
    tables = pd.read_html(url) # Returns list of all tables on page
    datas.append(tables[0]) # Select table of interest


In [83]:
import numpy as np 
def parse_person_info(a):
    try:
        if np.isnan(a):
            return dict()
    except:
        pass
    
    data_dict = dict()
    names = ['Email', 'Office', 'Phone', 'Research Areas']
    idx = []
    for name in names: 
        try:
            idx.append(a.index(name))
        except:
            pass
    idx = list(sorted(idx))
    
    #parse first & second name 
    name_and_title = a[:idx[0]]
    name = ' '.join(name_and_title.split(' ')[:2])
    title = ' '.join(name_and_title.split(' ')[2:])
    data_dict['Name'] = name 
    data_dict['Title'] = title.strip()
    #parse other info
    for i in range(len(idx)):
        if i == len(idx) - 1:
            first, second = idx[i], len(a)
        else:
            first, second = idx[i], idx[i+1] 
        s = a[first:second]
        s = s.split(':')
        data_dict[s[0]] = s[1].strip()
    
    for name in names:
        if name not in data_dict:
            data_dict[name] = None
    return data_dict


In [105]:
#dict version 
lti_faculty = []
for data in datas:
    for d in data.values:
        for info in d:
            info_dict = parse_person_info(info)
            if info_dict:
                lti_faculty.append(info_dict)

In [106]:
lti_faculty = []
for data in datas:
    for d in data.values:
        for info in d:
            try:
                if np.isnan(info):
                    pass
            except:
                lti_faculty.append(info)

In [107]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/lti_faculty.json', 'w') as f:
    json.dump(lti_faculty, f)

# LTI Staff

In [108]:
import pandas as pd
urls = [r'https://lti.cs.cmu.edu/directory/all/154/2?page=0', r'https://lti.cs.cmu.edu/directory/all/154/2?page=1']
datas = []
for url in urls:
    tables = pd.read_html(url) # Returns list of all tables on page
    datas.append(tables[0]) # Select table of interest


In [109]:
lti_staff = []
for data in datas:
    for d in data.values:
        for info in d:
            try:
                if np.isnan(info):
                    pass
            except:
                lti_staff.append(info)

In [111]:
len(lti_staff)

33

In [112]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/lti_staff.json', 'w') as f:
    json.dump(lti_staff, f)

# LTI student (ignore now)

In [113]:
import pandas as pd
urls = [r'https://lti.cs.cmu.edu/directory/students/current/all']
datas = []
for url in urls:
    tables = pd.read_html(url) # Returns list of all tables on page
    datas.append(tables[0]) # Select table of interest


In [114]:
lti_student = []
for data in datas:
    for d in data.values:
        for info in d:
            try:
                if np.isnan(info):
                    pass
            except:
                lti_student.append(info)

# LTI Faculty paper

In [None]:
import requests

In [58]:
def is_cmu(faculty, threshold = 0.7):
    total = 0
    cs = 0
    for p in faculty['papers']:
        total += 1
        if p['fieldsOfStudy'] is not None and 'Computer Science' in p['fieldsOfStudy']:
            cs += 1
    if total == 0:
        return False
    return cs/total >= threshold


In [59]:
def extract_paper_info(faculty):
    papers = faculty['papers']
    paper_info = []
    for p in papers:
        paper_string = ''
        paper_string += f'Author: {faculty["name"]} '
        paper_string += f'Title: {p["title"]} '
        paper_string += f'Publication year: {p["year"]} '
        paper_info.append(paper_string)
    return paper_info

In [60]:
faculty = json.load(open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/lti_faculty.json'))

In [61]:
faculty_names = [s.split(' ')[:2] for s in faculty]

In [63]:
publications = []
for faculty_name in faculty_names:
    if faculty_name[0] != "Eric":
        name_str = faculty_name[0]+ '+' +faculty_name[1]
        name_str = name_str.lower()
    else:
        name_str = 'eric+xing' #hardcode for eric ;)
    url = f'https://api.semanticscholar.org/graph/v1/author/search?query={name_str}&fields=name,aliases,url,affiliations,papers.title,papers.year,papers.fieldsOfStudy'
    faculty_list = requests.get(url).json()['data']
    for f in faculty_list:
        if is_cmu(f):
            publications.extend(extract_paper_info(f))

In [65]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/faculty_publication.json', 'w') as f:
    json.dump(publications, f)

# CMU Schedule of class

In [74]:

import pandas as pd
urls = [r'https://enr-apps.as.cmu.edu/assets/SOC/sched_layout_spring.htm', r'https://enr-apps.as.cmu.edu/assets/SOC/sched_layout_summer_1.htm', r'https://enr-apps.as.cmu.edu/assets/SOC/sched_layout_summer_2.htm', r'https://enr-apps.as.cmu.edu/assets/SOC/sched_layout_fall.htm']
datas = []
for url in urls:
    tables = pd.read_html(url) # Returns list of all tables on page
    datas.append(tables[0]) # Select table of interest
order = ['Spring', 'Summer 1/Summer All', 'Summer 2', 'Fall']

In [75]:
courses = []
for table, semester in zip(datas, order):
    #drop first row and last column
    table = table.iloc[1:, :-1]
    table.columns = table.iloc[0]
    table = table.iloc[1:, :]

    #exclude rows where location and title is nan
    table = table[table['Location'].notna() | table['Title'].notna()]
    table.fillna(method='ffill', inplace=True)
    table.reset_index(inplace=True, drop=True)

    for i in range(len(table)):
        course_info = table.iloc[i].to_dict()
        s = semester + ' offering: '
        for k, v in course_info.items():
            s += f'{k}: {v} '
        courses.append(s)
    


  table.fillna(method='ffill', inplace=True)
  table.fillna(method='ffill', inplace=True)
  table.fillna(method='ffill', inplace=True)
  table.fillna(method='ffill', inplace=True)


In [77]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/course_schedule.json', 'w') as f:
    json.dump(courses, f)

# CMU Academic calendar

In [19]:
import pandas as pd 
semesters = ['Fall 2023', 'Spring 2024', 'Summer One_All 2024', 'Summer Two 2024']
events = []
for semester in semesters:
    df = pd.read_excel(f'/zfsauton2/home/yifuc/11711-RAG/data/cmu/raw/{semester}.xlsx')
    for i in range(len(df)):
        date_dict = df.iloc[i].to_dict()
        #convert timestamp to string and remove hour, minute, and second
        date1 = date_dict['Date'].strftime('%Y-%m-%d')
        date2 = date_dict['Date2']
        #check for consecutive date
        if not str(date2) == 'nan' and not str(date2) == 'NaT':
            date2 = date2.strftime('%Y-%m-%d')
            date = f'{date1}-{date2}'
        else:
            date = date1
        #create date string 
        date_string = f'Date: {date}'
        for k, v in date_dict.items():
            if k not in ['Date', 'Date2', '-']:
                date_string += f' {k}: {v}'
        date_string = semester + ': ' + date_string
        events.append(date_string)

In [21]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/university_calendar.json', 'w') as f:
    json.dump(events, f)