# LTI faculty

In [45]:
import pandas as pd
urls = [r'https://lti.cs.cmu.edu/directory/all/154/1?page=0', r'https://lti.cs.cmu.edu/directory/all/154/1?page=1']
datas = []
for url in urls:
    tables = pd.read_html(url) # Returns list of all tables on page
    datas.append(tables[0]) # Select table of interest


In [46]:
import numpy as np 
def parse_person_info(a):
    try:
        if np.isnan(a):
            return dict()
    except:
        pass
    
    data_dict = dict()
    names = ['Email', 'Office', 'Phone', 'Research Areas']
    idx = []
    for name in names: 
        try:
            idx.append(a.index(name))
        except:
            pass
    idx = list(sorted(idx))
    
    #parse first & second name 
    name_and_title = a[:idx[0]]
    name = ' '.join(name_and_title.split(' ')[:2])
    title = ' '.join(name_and_title.split(' ')[2:])
    data_dict['Name'] = name 
    data_dict['Title'] = title.strip()
    #parse other info
    for i in range(len(idx)):
        if i == len(idx) - 1:
            first, second = idx[i], len(a)
        else:
            first, second = idx[i], idx[i+1] 
        s = a[first:second]
        s = s.split(':')
        data_dict[s[0]] = s[1].strip()
    
    for name in names:
        if name not in data_dict:
            data_dict[name] = None
    return data_dict


In [105]:
#dict version 
lti_faculty = []
for data in datas:
    for d in data.values:
        for info in d:
            info_dict = parse_person_info(info)
            if info_dict:
                lti_faculty.append(info_dict)

In [47]:
lti_faculty = []
for data in datas:
    for d in data.values:
        for info in d:
            try:
                if np.isnan(info):
                    pass
            except:
                lti_faculty.append(info)

In [107]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/lti_faculty.json', 'w') as f:
    json.dump(lti_faculty, f)

# LTI Staff

In [108]:
import pandas as pd
urls = [r'https://lti.cs.cmu.edu/directory/all/154/2?page=0', r'https://lti.cs.cmu.edu/directory/all/154/2?page=1']
datas = []
for url in urls:
    tables = pd.read_html(url) # Returns list of all tables on page
    datas.append(tables[0]) # Select table of interest


In [109]:
lti_staff = []
for data in datas:
    for d in data.values:
        for info in d:
            try:
                if np.isnan(info):
                    pass
            except:
                lti_staff.append(info)

In [111]:
len(lti_staff)

33

In [112]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/lti_staff.json', 'w') as f:
    json.dump(lti_staff, f)

# LTI student (ignore now)

In [113]:
import pandas as pd
urls = [r'https://lti.cs.cmu.edu/directory/students/current/all']
datas = []
for url in urls:
    tables = pd.read_html(url) # Returns list of all tables on page
    datas.append(tables[0]) # Select table of interest


In [114]:
lti_student = []
for data in datas:
    for d in data.values:
        for info in d:
            try:
                if np.isnan(info):
                    pass
            except:
                lti_student.append(info)

# LTI Faculty paper

In [None]:
import requests

In [58]:
def is_cmu(faculty, threshold = 0.7):
    total = 0
    cs = 0
    for p in faculty['papers']:
        total += 1
        if p['fieldsOfStudy'] is not None and 'Computer Science' in p['fieldsOfStudy']:
            cs += 1
    if total == 0:
        return False
    return cs/total >= threshold


In [59]:
def extract_paper_info(faculty):
    papers = faculty['papers']
    paper_info = []
    for p in papers:
        paper_string = ''
        paper_string += f'Author: {faculty["name"]} '
        paper_string += f'Title: {p["title"]} '
        paper_string += f'Publication year: {p["year"]} '
        paper_info.append(paper_string)
    return paper_info

In [60]:
faculty = json.load(open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/lti_faculty.json'))

In [61]:
faculty_names = [s.split(' ')[:2] for s in faculty]

In [63]:
publications = []
for faculty_name in faculty_names:
    if faculty_name[0] != "Eric":
        name_str = faculty_name[0]+ '+' +faculty_name[1]
        name_str = name_str.lower()
    else:
        name_str = 'eric+xing' #hardcode for eric ;)
    url = f'https://api.semanticscholar.org/graph/v1/author/search?query={name_str}&fields=name,aliases,url,affiliations,papers.title,papers.year,papers.fieldsOfStudy'
    faculty_list = requests.get(url).json()['data']
    for f in faculty_list:
        if is_cmu(f):
            publications.extend(extract_paper_info(f))

In [65]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/faculty_publication.json', 'w') as f:
    json.dump(publications, f)

# CMU Schedule of class

In [74]:

import pandas as pd
urls = [r'https://enr-apps.as.cmu.edu/assets/SOC/sched_layout_spring.htm', r'https://enr-apps.as.cmu.edu/assets/SOC/sched_layout_summer_1.htm', r'https://enr-apps.as.cmu.edu/assets/SOC/sched_layout_summer_2.htm', r'https://enr-apps.as.cmu.edu/assets/SOC/sched_layout_fall.htm']
datas = []
for url in urls:
    tables = pd.read_html(url) # Returns list of all tables on page
    datas.append(tables[0]) # Select table of interest
order = ['Spring', 'Summer 1/Summer All', 'Summer 2', 'Fall']

In [75]:
courses = []
for table, semester in zip(datas, order):
    #drop first row and last column
    table = table.iloc[1:, :-1]
    table.columns = table.iloc[0]
    table = table.iloc[1:, :]

    #exclude rows where location and title is nan
    table = table[table['Location'].notna() | table['Title'].notna()]
    table.fillna(method='ffill', inplace=True)
    table.reset_index(inplace=True, drop=True)

    for i in range(len(table)):
        course_info = table.iloc[i].to_dict()
        s = semester + ' offering: '
        for k, v in course_info.items():
            s += f'{k}: {v} '
        courses.append(s)
    


  table.fillna(method='ffill', inplace=True)
  table.fillna(method='ffill', inplace=True)
  table.fillna(method='ffill', inplace=True)
  table.fillna(method='ffill', inplace=True)


In [77]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/course_schedule.json', 'w') as f:
    json.dump(courses, f)

# CMU Academic calendar

In [None]:
import pandas as pd 
semesters = ['Fall 2023', 'Spring 2024', 'Summer One_All 2024', 'Summer Two 2024']
events = []
for semester in semesters:
    df = pd.read_excel(f'/zfsauton2/home/yifuc/11711-RAG/data/cmu/raw/{semester}.xlsx')
    for i in range(len(df)):
        date_dict = df.iloc[i].to_dict()
        #convert timestamp to string and remove hour, minute, and second
        date1 = date_dict['Date'].strftime('%Y-%m-%d')
        date2 = date_dict['Date2']
        #check for consecutive date
        if not str(date2) == 'nan' and not str(date2) == 'NaT':
            date2 = date2.strftime('%Y-%m-%d')
            date = f'{date1}-{date2}'
        else:
            date = date1
        #create date string 
        date_string = f'Date: {date}'
        for k, v in date_dict.items():
            if k not in ['Date', 'Date2', '-']:
                date_string += f' {k}: {v}'
        date_string = semester + ': ' + date_string
        events.append(date_string)

In [21]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/university_calendar.json', 'w') as f:
    json.dump(events, f)

# LTI Academic

In [1]:
import pandas as pd
urls = [r'https://lti.cs.cmu.edu/learn']
datas = []
for url in urls:
    tables = pd.read_html(url) # Returns list of all tables on page
    datas.append(tables[0]) # Select table of interest


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [7]:
url = r'https://lti.cs.cmu.edu/learn'
#parse the first chunk of the html about phd program in lti 
import requests
from bs4 import BeautifulSoup
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
phd = soup.find_all('div', class_='field-item even')

In [12]:
phd[3].text

'Carnegie Mellon\'s School of Computer Science has a centralized online application process. Applications and all supporting documentation for fall admission to any of the LTI\'s graduate programs must be received by the application deadline. Incomplete applications will not be considered. The application period for Fall 2024 will open on September 6, 2023.\nFinal Application Deadline\nDecember 13, 2023 at 3 p.m. EST.\nCost\n$100 per program and $80 if the applicant applies before November 29, 2023 at 3 p.m. EST (early deadline).\nFee Waivers\nFee waivers may be available in cases of financial hardship. For more information, please refer to the\xa0School of Computer Science Fee Waiver page.\nRequirements\nThe School of Computer Science requires the following for all Ph.D. applications.\nGRE scores:\xa0GREs are now optional, but if you want to submit GRE scores:These must be less than five years old. The GRE Subject Test is not required, but is recommended. Our Institution Code is 2074;

# SCS 25 great things and history

In [25]:
url = r'https://www.cs.cmu.edu/scs25/25things'
#parse the whole webpage's text
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
text = soup.get_text()
text = "\n".join(text.strip().split('\n')[-25:])

What’s so great about computer science at Carnegie Mellon?We're glad you asked! Here are 25 great ideas from CMU computer scientists to think about as we celebrate the birthday of the School of Computer Science.1. Artificial intelligence, 1955-56 Can you write a working computer program without a computer? Herb Simon (H’90), at left, Allen Newell (IA’57), at right, and Cliff Shaw did. The team created the first artificial intelligence program, Logic Theorist, which could solve logic puzzles in the same way that a human might solve them. Newell demonstrated that it worked by writing the instructions on 3-by-5 index cards that were manipulated on the kitchen table by Newell, his wife, and a group of Carnegie Tech grad students.
2. Multi-core processors, 1971 Multi-core processors are common in today’s computers, but they were still science fiction in the early 1970s. But when CMU researchers found their existing machines too slow to keep pace with the advance of speech and graphics progr

In [26]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/scs_25_great_thing.json', 'w') as f:
    json.dump([text], f)

In [30]:
url = r'https://www.cs.cmu.edu/scs25/history'
#parse the whole webpage's text
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
text = soup.get_text()
text = text.strip().split('\n')[0] + '\n' + text.strip().split('\n')[-1]

A history of SCS | SCS25 - Carnegie Mellon University School of Computer Science
For an expanded history of the School of Computer Science and its predecessors at CMU, read "Institutional Memories" in the Summer 2014 issue of The Link magazine.In 2014, the School of Computer Science celebrated its 25th year as a stand-alone college within Carnegie Mellon University. It was arguably the first college devoted solely to computer science in the United States, and a model for others that followed. But CMU’s computer science era begins much earlier—in 1956, with the arrival of an IBM 650 computer on the campus of what was then known as Carnegie Institute of Technology. The IBM 650 had magnetic-drum memory and a processing speed of approximately 60 instructions per second. Herb Simon (H’90), associate dean of the Graduate School of Industrial Administration—now known as CMU’s Tepper School of Business—established Carnegie Tech’s first Computation Center with the help of its first director, Al

In [31]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/scs_history.json', 'w') as f:
    json.dump([text], f)

# CMU Fact Sheet

In [35]:
#use pypdf to parse an online pdf
url = r'https://www.cmu.edu/about/cmu_fact_sheet_02.pdf'
import io
import requests
from PyPDF2 import PdfReader
headers = {'User-Agent': 'Mozilla/5.0 (X11; Windows; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36'}

response = requests.get(url=url, headers=headers, timeout=120)
on_fly_mem_obj = io.BytesIO(response.content)
pdf_file = PdfReader(on_fly_mem_obj)

In [41]:
text = ''
for p in pdf_file.pages:
    text += p.extract_text()

In [43]:
import json 
with open('/zfsauton2/home/yifuc/11711-RAG/data/cmu/fact_sheet.json', 'w') as f:
    json.dump([text], f)