# CMSC 320 - Final Tutorial - UMD Subreddit

In [1]:
import requests
import sqlite3
from sqlite3 import Error
import os

In [2]:
def make_get_request(URL, headers=None, given_params=None, timeout=None):
    """Sends a GET request to the given URL.

    Parameters
    ----------
    URL : str
        The url to send a GET request
    headers : dictionary, optional
        HTTP headers to send with the request (default is None)
    given_params : dictionary, optional
        A dictionary of any additional query parameters (default is None)
    timeout : float or tuple, optional
        Forwarded to ``requests.get``; the default of None keeps the
        previous behaviour of waiting without a time limit


    Returns
    -------
    requests.Response or dictionary
        The response object when the status code is 200, otherwise an
        empty dictionary (falsy, so callers can test the result directly)
    """

    SUCCESS = 200
    response = requests.get(
        URL, headers=headers, params=given_params, timeout=timeout
    )

    if response.status_code == SUCCESS:
        return response

    # Any non-200 status is reported as an empty (falsy) dictionary.
    return {}

# Scrape and Parse Faculty

## Attempted to use umd.io, but it appears to be rather glitchy

In [3]:
# The umd.io attempt below is disabled by wrapping it in a string literal;
# only the trailing print() actually runs.
# NOTE(review): if this is ever re-enabled, `params` is passed positionally and
# therefore lands in make_get_request's `headers` argument rather than
# `given_params`, and `while json is not None` never ends because
# make_get_request returns {} (not None) on failure.
"""
umd_professor_url = "https://api.umd.io/v1/professors";
page = 1
professorNames = set() 
params = {'departments': 'CMSC', 'page': page}

json = make_get_request(umd_professor_url, params)
professorNames.update([professor['name'] for professor in json])

while json is not None:
    page = page + 1
    params['page'] = page
    
    json = make_get_request(umd_professor_url, params)
    if json:
        professorNames.update([professor['name'] for professor in json])
        
    print(professorNames)
        
# print(professorNames)
"""
print()




## Scrape faculty from UMD faculty page

In [4]:
from bs4 import BeautifulSoup

# Headers for the request
# NOTE(review): the Access-Control-* entries are CORS *response* headers;
# sending them on a request presumably has no effect. Kept as-is so the
# request stays identical.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0",
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Allow-Methods": "GET"
}

faculty_url = "https://academiccatalog.umd.edu/undergraduate/administrators-officials-faculty/"
response = make_get_request(faculty_url, headers=headers)

# make_get_request returns an empty dict for any non-200 status. Fail here
# with a clear message instead of an AttributeError on `.content` below.
if not response:
    raise RuntimeError("Could not fetch the faculty page: " + faculty_url)

# Parse the raw HTML so later cells can search it for faculty listings
soup = BeautifulSoup(response.content, 'html.parser')

## Parse each listing into a name, type, and college

In [5]:
def split_name(name):
    """Split a "<last name>, <first name> (<middle name/initial>)" listing.

    Parameters
    ----------
    name : str
        The listing name, e.g. "Childs, Andrew M"

    Returns
    -------
    tuple
        (first_name, last_name). The first name is the first
        whitespace-separated word after the first comma, or an empty
        string when the listing has no comma or nothing follows it.
    """
    parts = name.split(',')
    last_name = parts[0].strip()

    # Only the first word after the comma is kept; middle names are dropped.
    given_names = parts[1].split() if len(parts) > 1 else []
    first_name = given_names[0] if given_names else ''

    return (first_name, last_name)

#### Ignoring those who don't fall into the colleges listed below

In [6]:
colleges = ['AGNR', 'ARCH', 'ARHU', 'BSOS', 'BMGT', 'CMNS', 'EDUC', 'ENGR', 'JOUR', 'INFO', 'SPHL', 'PLCY']

# Description format: Type, SCHOOL-DEPARTMENT (repeated); Degree, University, Year; (repeated)
def create_faculty(name, description, faculty_dict):
    """Parse one faculty listing and record it in ``faculty_dict``.

    Parameters
    ----------
    name : str
        Listing name in "<last name>, <first name> ..." form
    description : str
        Listing description; the text before the first comma is taken as
        the job type and the text between the first and second comma is
        searched for the college codes in ``colleges``
    faculty_dict : dictionary
        Updated in place. The key is the stripped listing name and the
        value is a dictionary with 'first_name', 'last_name', 'colleges'
        and 'type'

    Listings that match none of the known colleges, whose description has
    no comma-separated college field, or whose name is already present are
    left out / not overwritten.
    """
    # The stored key is the stripped name, so the duplicate check below must
    # use the same stripped form.
    key = name.strip()
    (first_name, last_name) = split_name(key)

    fields = description.split(',')
    if len(fields) < 2:
        # No college field to inspect, so no college can match.
        return

    job_type = fields[0]
    college_dept = fields[1]

    prof_colleges = [college for college in colleges if college in college_dept]

    if prof_colleges and key not in faculty_dict:
        faculty_dict[key] = {
            'first_name': first_name,
            'last_name': last_name,
            'colleges': prof_colleges,
            'type': job_type,
        }

In [7]:
# Select every <p class="faculty-item"> element from the parsed page
faculty_blocks = soup.find_all("p", class_="faculty-item")

# Keeping track of only CMNS and ENGR professors, but this is arbitrary
CMNS_profs = set()
ENGR_profs = set()
# Filled by create_faculty: stripped listing name -> parsed faculty record
all_faculty = {}

for block in faculty_blocks:
    
    # Names are stored within the <strong/> tag
    name = block.strong.string 
    
    # content structure: space,  name, space, <br/>, description
    contents = block.contents
    description = contents[4]
    
    create_faculty(name, description, all_faculty)
    
    # NOTE(review): because of the elif, a listing that mentions both colleges
    # is only added to CMNS_profs; names are added here without stripping.
    if "CMNS" in description:
        CMNS_profs.add(name)
    
    elif "ENGR" in description:
        ENGR_profs.add(name)
        

In [8]:
# Spot-check one parsed record, looked up by its stripped listing name
print(all_faculty["Childs, Andrew M"])

{'first_name': 'Andrew', 'last_name': 'Childs', 'colleges': ['CMNS'], 'type': 'Professor'}


# Create the databases

In [9]:
def create_connection(db_file):
    """Open a SQLite connection to ``db_file``.

    The path is echoed to stdout before connecting. If sqlite3 raises an
    error, the error is printed and None is returned instead of a
    connection.
    """
    print(db_file)
    try:
        connection = sqlite3.connect(db_file)
    except Error as err:
        print(err)
        return None
    return connection

In [10]:
def create_table(conn, create_table_sql, params=()):
    """Execute ``create_table_sql`` on ``conn`` with optional ``params``.

    Any sqlite3 error is printed rather than raised; nothing is returned.
    """
    try:
        conn.cursor().execute(create_table_sql, params)
    except Error as err:
        print(err)

## Create a database holding faculty information

In [11]:
# Using only the first mentioned college for now, could switch later? Also only considering the colleges that were listed - others are ignored

def create_names_table(conn):
    """Create the ``names`` table on ``conn`` unless it already exists.

    Columns: id (integer primary key), first_name, last_name, nick_name
    (the only nullable column), type and college. The statement is run
    through create_table.
    """
    schema_sql = """ CREATE TABLE IF NOT EXISTS names (
                        id integer PRIMARY KEY,
                        first_name text NOT NULL,
                        last_name text NOT NULL,
                        nick_name text,
                        type text NOT NULL,
                        college text NOT NULL
                        
                ) """
    create_table(conn, schema_sql)

In [12]:
def insert_faculty_name(conn, faculty):
    """Insert one row into ``names`` and return its row id.

    ``faculty`` supplies the four bound values in the order
    (first_name, last_name, type, college). The insert is committed
    before returning.
    """
    statement = """ INSERT INTO names(first_name,last_name,type,college)
                VALUES(?,?,?,?)"""

    # Connection.execute creates the cursor and runs the statement in one step
    result = conn.execute(statement, faculty)
    conn.commit()

    return result.lastrowid

In [13]:
# Build the database path relative to the current working directory
# NOTE(review): presumably the data/db directory must already exist for the
# connection to open — confirm, or create it beforehand.
faculty_names_path = os.path.abspath('.')
faculty_names_path = faculty_names_path + "/data/db/faculty_names.db"
faculty_names_db = create_connection(faculty_names_path)

create_names_table(faculty_names_db)
                                      

# Insert one row per parsed faculty member; insert_faculty_name commits each row
for faculty_name in all_faculty:
    faculty = all_faculty[faculty_name]
    
    # create_faculty only stores entries with at least one college, so the
    # None branch is not expected to run (names.college is declared NOT NULL).
    if not faculty["colleges"]:
        college = None
    else:
        college = faculty["colleges"][0]
        
    params = (faculty["first_name"], faculty["last_name"], faculty["type"], college)
    
    insert_faculty_name(faculty_names_db, params)
    
faculty_names_db.close()

G:\UMD\FALL_2020\CMSC320\final-tutorial/data/db/faculty_names.db


## Create a database with one table per professor for the comments and submissions mentioning them

In [14]:
def create_professor_table(conn, table_name):
    """Create the per-professor table ``table_name`` unless it exists.

    Parameters
    ----------
    conn : sqlite3.Connection
        Connection on which the table is created
    table_name : str
        Name of the table; it is quoted as an SQL identifier, so names
        that start with a digit or collide with a keyword are accepted

    Columns: id (integer primary key), sub_id, body, is_comment, title
    (the only nullable column) and score.
    """
    # Identifiers cannot be bound as SQL parameters, so the name is wrapped
    # in double quotes with any embedded quote doubled before interpolation.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    professor_table = """ CREATE TABLE IF NOT EXISTS {table_name} (
                            id integer PRIMARY KEY,
                            sub_id text NOT NULL,
                            body text NOT NULL,
                            is_comment integer NOT NULL,
                            title text,
                            score integer NOT NULL
                        )""".format(table_name=quoted_name)
    create_table(conn, professor_table)

In [16]:
def normalize_name(name):
    """Return ``name`` with every non-alphanumeric character removed."""
    return ''.join(filter(str.isalnum, name))

In [17]:
# Base path (without extension) of the professors database under the current
# working directory. os.path.join replaces the "\data\db\professors" literal,
# whose backslashes were invalid escape sequences.
directory = os.path.join(os.path.abspath('.'), "data", "db", "professors")

# professors_db is not defined in any earlier visible cell; the recorded output
# of this cell shows the "<directory>.db" path being opened, so connect here.
professors_db = create_connection(directory + ".db")

for professor in all_faculty:
    prof_name = split_name(professor)
    # Keep only alphanumerics in each part of the name
    prof_name = (normalize_name(prof_name[0]), normalize_name(prof_name[1]))

    table_name = prof_name[0] + "_" + prof_name[1] # first_last
    create_professor_table(professors_db, table_name)

G:\UMD\FALL_2020\CMSC320\final-tutorial\data\db\professors.db


## Requesting Data from the UMD Subreddit with PushShift API

In [None]:
import praw
from psaw import PushshiftAPI

# psaw's PushshiftAPI client, used by the search cell below
api = PushshiftAPI()

In [None]:
# Search r/umd submissions for a single professor, returning only the title
# and selftext fields of at most 10 results.
# Fixed the misspelled "mohammmad" in the second alternative.
# NOTE(review): presumably "|" means OR and "+" means AND in Pushshift's
# query syntax — confirm.
query = "teli|(mohammad+teli)"
subs_gen = api.search_submissions(q=query, subreddit='umd', filter=['title', 'selftext'], limit=10)

#for sub in subs_gen:
#    print(sub)