# CMSC 320 - Final Tutorial - UMD Subreddit

In [33]:
import requests
import sqlite3
from sqlite3 import Error
import os
import pandas as pd

In [34]:
def make_get_request(URL, headers=None, given_params=None):
    """Sends a GET request to the given URL.
    
    Parameters
    ----------
    URL : str
        The url to send a GET request
    given_params : dictionary, optional
        A dictionary of any additional parameters (default is None)
        
        
    Returns
    -------
    dictionary
        A dictionary containing the JSON response
    """
    
    SUCCESS = 200
    response = requests.get(URL, headers=headers, params=given_params)
    
    if (response.status_code == SUCCESS):
        return response
    else:
        return {};

# Scrape and Parse Faculty

## Attempted to use umd.io, but it appears to be rather glitchy

In [35]:
"""
umd_professor_url = "https://api.umd.io/v1/professors";
page = 1
professorNames = set() 
params = {'departments': 'CMSC', 'page': page}

json = make_get_request(umd_professor_url, params)
professorNames.update([professor['name'] for professor in json])

while json is not None:
    page = page + 1
    params['page'] = page
    
    json = make_get_request(umd_professor_url, params)
    if json:
        professorNames.update([professor['name'] for professor in json])
        
    print(professorNames)
        
# print(professorNames)
"""
print()




## Scrape faculty from UMD faculty page

In [36]:
from bs4 import BeautifulSoup

# Headers for the request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0",
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Allow-Methods": "GET"
}

faculty_url = "https://academiccatalog.umd.edu/undergraduate/administrators-officials-faculty/"
response = make_get_request(faculty_url, headers=headers)

soup = BeautifulSoup(response.content, 'html.parser')

## Parse each listing into a name, type, and college

In [37]:
def split_name(name):
    # Name structure: <last name>, <first name> (<middle name/initial>)
    split = name.split(',')
    last_name = split[0]
    
    first_name = split[1].split()[0]
    
    return (first_name, last_name)

#### Ignoring those who don't fall into the colleges listed below

In [38]:
colleges = ['AGNR', 'ARCH', 'ARHU', 'BSOS', 'BMGT', 'CMNS', 'EDUC', 'ENGR', 'JOUR', 'INFO', 'SPHL', 'PLCY']

# Description format: Type, SCHOOL-DEPARTMENT (repeated); Degree, University, Year; (repeated)
def create_faculty(name, description, faculty_dict):
    (first_name, last_name) = split_name(name.strip())
    
    job_type = description.split(',')[0]
    college_dept = description.split(',')[1]
    
    # Not currently using, but may switch to it
    faculty_college = college_dept.split('-')[0].strip()
    
    prof_colleges = []
    for college in colleges:
        if college in college_dept:
            prof_colleges.append(college)
    
    if prof_colleges:
        faculty = {'first_name': first_name, 'last_name': last_name, 'colleges': prof_colleges, 'type': job_type}

        if name not in faculty_dict:
            faculty_dict[name.strip()] = faculty

In [39]:
faculty_blocks = soup.find_all("p", class_="faculty-item")

# Keeping track of only CS and ENGR professors, but this is arbitrary
CMNS_profs = set()
ENGR_profs = set()
all_faculty = {}

for block in faculty_blocks:
    
    # Names are stored within the <strong/> tag
    name = block.strong.string 
    
    # content structure: space,  name, space, <br/>, description
    contents = block.contents
    description = contents[4]
    
    create_faculty(name, description, all_faculty)
    
    if "CMNS" in description:
        CMNS_profs.add(name)
    
    elif "ENGR" in description:
        ENGR_profs.add(name)
        

In [40]:
print(all_faculty["Childs, Andrew M"])

{'first_name': 'Andrew', 'last_name': 'Childs', 'colleges': ['CMNS'], 'type': 'Professor'}


# Create a database functionality

In [41]:
def create_connection(db_file):
    conn = None
    print(db_file)
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except Error as e:
        print(e)

    return conn

In [42]:
def create_table(conn, create_table_sql, params=()):
    try:
        c = conn.cursor()
        c.execute(create_table_sql, params)
    except Error as e:
        print(e)

## Create a database holding faculty information

In [43]:
# Using only the first mentioned college for now, could switch later? Also only considering the colleges that were listed - others are ignored

def create_names_table(conn):
    names_table = """ CREATE TABLE IF NOT EXISTS names (
                        id integer PRIMARY KEY,
                        first_name text NOT NULL,
                        last_name text NOT NULL,
                        nick_name text,
                        type text NOT NULL,
                        college text NOT NULL,
                        UNIQUE (first_name, last_name, type)
                ) """
    create_table(conn, names_table)

In [44]:
def insert_faculty_name(conn, faculty):
    sql = """ INSERT OR IGNORE INTO names(first_name,last_name,type,college)
                VALUES(?,?,?,?)"""
    
    cursor = conn.cursor()
    cursor.execute(sql, faculty)
    conn.commit()
    
    return cursor.lastrowid

In [45]:
faculty_names_path = os.path.abspath('.')
faculty_names_path = faculty_names_path + "/data/db/faculty_names.db"
faculty_names_db = create_connection(faculty_names_path)

create_names_table(faculty_names_db)
                                      

for faculty_name in all_faculty:
    faculty = all_faculty[faculty_name]
    
    if not faculty["colleges"]:
        college = None
    else:
        college = faculty["colleges"][0]
        
    params = (faculty["first_name"], faculty["last_name"], faculty["type"], college)
    
    insert_faculty_name(faculty_names_db, params)
    
faculty_names_db.close()

G:\UMD\FALL_2020\CMSC320\final-tutorial/data/db/faculty_names.db


## Create a database per professor for comments and submissions mentioning them

### Should only be done when a mention is found

In [46]:
def normalize_name(name):
    return ''.join(char for char in name if char.isalnum())

In [47]:
def create_professor_tables(conn):
    submissions_table = """ CREATE TABLE IF NOT EXISTS submissions (
                            id integer PRIMARY KEY,
                            sub_id text NOT NULL,
                            title text NOT NULL,
                            selftext text NOT NULL,
                            score integer NOT NULL,
                            full_link text NOT NULL,
                            created_time integer NOT NULL,
                            UNIQUE(sub_id)
                        )"""
    
    comments_table = """ CREATE TABLE IF NOT EXISTS comments (
                            id integer PRIMARY KEY,
                            sub_id text NOT NULL,
                            comment_id text NOT NULL,
                            body text NOT NULL,
                            score integer NOT NULL,
                            created_time integer NOT NULL,
                            UNIQUE(comment_id, sub_id)
                    )"""
    
    create_table(conn, submissions_table)
    create_table(conn, comments_table)

In [48]:
def create_professor_db(filename):
    directory = os.path.abspath('.')
    filepath = directory + "\data\db\professors\\" + filename
    
    professor_db = create_connection(filepath)
    
    create_professor_tables(professor_db)
    
    return professor_db

In [58]:
def insert_submission(conn, values):
    insert_sql = """INSERT OR IGNORE INTO submissions (
                        sub_id, title, selftext, score, full_link, created_time
                    ) VALUES (?,?,?,?,?,?)
                 """
    cursor = conn.cursor()
    cursor.execute(insert_sql, values)
    conn.commit()
    
    return cursor.lastrowid

In [50]:
def insert_all_submissions(conn, df):
    for idx, sub in df.iterrows():
        insert_submission(conn, (sub['id'], sub['title'], sub['selftext'], sub['score'], sub['full_link'], sub['created_utc']))

In [51]:
# This will create a database per professor immediately, no matter what. But we probably don't want that
"""
directory = os.path.abspath('.')
directory = directory + "\data\db\professors"

for professor in all_faculty:
    prof_name = split_name(professor)
    prof_name = (normalize_name(prof_name[0]), normalize_name(prof_name[1]))
    filename = prof_name[0] + "_" + prof_name[1] + ".db" # first_last
    
    create_professor_db(filename)
    
"""
print()




## Requesting Data from the UMD Subreddit with PushShift API

In [52]:
import praw
from psaw import PushshiftAPI

api = PushshiftAPI()

In [53]:
def form_query(professor):
    return "{last_name}|({first_name}+{last_name})".format(last_name=professor["last_name"], first_name=professor["first_name"])

In [54]:
def make_professor_filename(professor):
    return normalize_name(professor["first_name"]) + "_" + normalize_name(professor["last_name"]) + ".db"

In [59]:
umd_subreddit = 'umd'

sub_limit = 10
sub_filter = ['title', 'selftext', 'score', 'full_link', 'id', 'created_utc']

comment_limit = 10
comment_filter = ['body', 'score', 'created_utc', 'id', 'link_id']

unmentioned_names = []

for professor_key in all_faculty:
    professor = all_faculty[professor_key]

    query = form_query(professor)
    submissions = api.search_submissions(q=query, subreddit=umd_subreddit, filter=sub_filter, limit=sub_limit)
    
    df = pd.DataFrame([thing.d_ for thing in submissions])
    
    if len(df.index) == sub_limit:
        filename = make_professor_filename(professor)
        prof_db = create_professor_db(filename)
        insert_all_submissions(prof_db, df)
    else:
        unmentioned_names.append(professor_key)

G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Hossein_Abbasi.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Jeffrey_Adams.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Lowell_Adams.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\William_Adams.db




G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Ashok_Agrawala.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Wei_Ai.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Danielle_Alexander.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\James_Alexander.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Millard_Alexander.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Patricia_Alexander.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Thomas_Alexander.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Frank_Alt.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Mark_Austin.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Gregory_Ball.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Michael_Ball.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Antoine_Banks.db
G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors\Richard_Barber.db
G:\UMD

KeyboardInterrupt: 