In [7]:
import duckdb
import pandas as pd 
from pathlib import Path
from app_config import *

In [39]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [2]:
class DBConn(object):
    def __init__(self, file_db=FILE_DB):
        """Support only DuckDB and SQLite
        """
        if not Path(file_db).exists():
            raise Exception(f"Database file not found: {file_db}")
        if file_db.endswith("duckdb"):
            self.conn = duckdb.connect(file_db)
        else:
            self.conn = sqlite3.connect(file_db)

    def __enter__(self):
        return self.conn

    def __exit__(self, type, value, traceback):
        self.conn.close()

In [5]:
select_sql = f"""
    select name,url, research_area,department from g_person 
    where person_type='faculty' and org like 'Cornell%' and job_title not like '%Emeritus%'
    and (research_area is not NULL and research_area != '')
    order by research_area
    ;
"""

In [8]:
with DBConn() as _conn:
    df = pd.read_sql(select_sql,_conn)

  df = pd.read_sql(select_sql,_conn)


In [9]:
df

Unnamed: 0,name,url,research_area,department
0,Fengqi You,https://www.cheme.cornell.edu/faculty-director...,Artificial Intelligence,Chair of Ph.D. Studies in Cornell Systems Engi...
1,Nathan Kallus,http://www.nathankallus.com/,Artificial Intelligence,Operations Research and Information Engineerin...
2,Cheng Zhang,http://www.czhang.org/,Artificial Intelligence,"Information Science, CS Field Member"
3,Bart Selman,http://www.cs.cornell.edu/selman/,Artificial Intelligence,"Computer Science, CS Field Member; Joseph C Fo..."
4,Yoav Artzi,http://yoavartzi.com/,Artificial Intelligence,"Computer Science, Cornell Tech, CS Field Member"
...,...,...,...,...
110,Mark Wilde,https://www.markwilde.com/,Theory of Computing,
111,David Williamson,http://www.davidpwilliamson.net/work,Theory of Computing,Operations Research and Information Engineerin...
112,Siddhartha Banerjee,https://people.orie.cornell.edu/sbanerjee/inde...,Theory of Computing,Operations Research and Information Engineerin...
113,Christina Lee Yu,https://people.orie.cornell.edu/cleeyu/,Theory of Computing,"ORIE, CS Field Member"


In [16]:
df.columns

Index(['name', 'url', 'research_area', 'department'], dtype='object')

## unique research_area

In [14]:
research_area = {}
for r in df["research_area"].to_list():
    for i in r.split(","):
        k = i.strip()
        count = research_area.get(k,0)
        research_area[k] = count+1

In [15]:
research_area

{'Artificial Intelligence': 51,
 'Computational Biology': 6,
 'Scientific Computing': 9,
 'Graphics': 5,
 'Human Interaction': 26,
 'Vision': 6,
 'Machine Learning': 8,
 'Robotics': 9,
 'Theory of Computing': 28,
 'Security': 16,
 'Systems and Networking': 26,
 'Computer Architecture & VLSI': 5,
 'Programming Languages': 10,
 'Database Systems': 3,
 'Software Engineering': 2}

## unique department

In [26]:
HEAD_MAP = {
    "chair":1, "dean":1, "director":1, "professor":1
}

def _contains_item(s, dic=HEAD_MAP):
    for k in dic.keys():
        if k.lower() in s.lower():
            return True
        
    return False

In [27]:
is_head = {}
department = {}
for r in df["department"].to_list():
    if _contains_item(r):
        cnt = is_head.get(r,0)
        is_head[r] = cnt+1
        continue 
        
    r = r.replace(";", ",")
    for i in r.split(","):
        k = i.replace("CS Field Member", "").replace("CS Minor Field Member", "").replace("CS field member", "")
        k = k.replace("ORIE", 'Operations Research and Information Engineering')
        k = k.replace("ECE", 'Electrical and Computer Engineering')
        k = k.replace('Cornell Tech (Jacobs Inst)', 'Cornell Tech')
        k = k.replace('Jacobs Technion-Cornell Institute', 'Cornell Tech')
        k = k.strip()
        if not k: continue
        count = department.get(k,0)
        department[k] = count+1

department, {}, is_head

({'Operations Research and Information Engineering': 5,
  'Cornell Tech': 22,
  'Information Science': 12,
  'Computer Science': 46,
  'Mechanical and Aerospace Engineering': 3,
  'Psychology': 1,
  'Weill Medical': 1,
  'Linguistics': 1,
  'Electrical and Computer Engineering': 14,
  'Engineering': 1,
  'Information Science and Science & Technology Studies': 1,
  'Statistics and Data Science': 3,
  'Mechanical Engineering': 1,
  'Mathematics': 2,
  'Biological Statistics & Computational Biology': 1},
 {},
 {'Chair of Ph.D. Studies in Cornell Systems Engineering, Co-director of the Cornell University AI for Science Institute, Associate Director of Cornell Energy Systems Institute, Co-lead of Schmidt AI in Science Program at Cornell, and Associate Director of C': 1,
  'Computer Science, CS Field Member; Joseph C Ford Professor of Engineering': 1,
  'CIS and Applied Economics and Management; Computer Science, CS Field Member; Ronald C. and Antonia V. Nielsen Professor': 1,
  'Computer Sc

## add columns

In [28]:
def lambda_tag_area(row, area):
    return 1 if _contains_item(row["research_area"], dic=research_area) else 0

In [29]:
def lambda_tag_head(row):
    return 1 if _contains_item(row["department"], dic=is_head) else 0

In [30]:
def lambda_tag_dept(row):
    return 1 if _contains_item(row["department"], dic=department) else 0

In [31]:
for area in research_area.keys():
    df[f"AREA_{area.replace(' ', '_')}"] = df.apply(lambda_tag_area, axis=1)

In [32]:
df["HEAD"] = df.apply(lambda_tag_head, axis=1)

In [33]:
for dept in department.keys():
    df[f"DEPT_{dept.replace(' ', '_')}"] = df.apply(lambda_tag_dept, axis=1)

In [41]:
df.head(3)

Unnamed: 0,name,url,research_area,department,AREA_Artificial_Intelligence,AREA_Computational_Biology,AREA_Scientific_Computing,AREA_Graphics,AREA_Human_Interaction,AREA_Vision,AREA_Machine_Learning,AREA_Robotics,AREA_Theory_of_Computing,AREA_Security,AREA_Systems_and_Networking,AREA_Computer_Architecture_&_VLSI,AREA_Programming_Languages,AREA_Database_Systems,AREA_Software_Engineering,HEAD,DEPT_Operations_Research_and_Information_Engineering,DEPT_Cornell_Tech,DEPT_Information_Science,DEPT_Computer_Science,DEPT_Mechanical_and_Aerospace_Engineering,DEPT_Psychology,DEPT_Weill_Medical,DEPT_Linguistics,DEPT_Electrical_and_Computer_Engineering,DEPT_Engineering,DEPT_Information_Science_and_Science_&_Technology_Studies,DEPT_Statistics_and_Data_Science,DEPT_Mechanical_Engineering,DEPT_Mathematics,DEPT_Biological_Statistics_&_Computational_Biology
0,Fengqi You,https://www.cheme.cornell.edu/faculty-directory/fengqi-you,Artificial Intelligence,"Chair of Ph.D. Studies in Cornell Systems Engineering, Co-director of the Cornell University AI for Science Institute, Associate Director of Cornell Energy Systems Institute, Co-lead of Schmidt AI in Science Program at Cornell, and Associate Director of C",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,Nathan Kallus,http://www.nathankallus.com/,Artificial Intelligence,"Operations Research and Information Engineering, Cornell Tech, CS Field Member",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,Cheng Zhang,http://www.czhang.org/,Artificial Intelligence,"Information Science, CS Field Member",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
