In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from extract import Data
import re
import seaborn as sns
from typing import Dict, List, Tuple, Any, Callable

# Path of the enrollment CSV; it is read twice below (by Data and by pandas).
FILENAME = "data/CIS_enrollment.csv"
# Text encoding handed to pandas when parsing the CSV.
ENCODING = "iso-8859-1"

# Project-local loader from extract.py; later cells read `data.students`
# and `data.courses` from this object.
data = Data(FILENAME)
# Plain DataFrame of the same file, parsed with the encoding above.
raw  = pd.read_csv(FILENAME, encoding=ENCODING)

In [None]:
#code taken from Sam
def get_students_in_major(major):
    """Return the subset of ``data.students`` whose ``major`` equals ``major``.

    The result is a new dict with the same keys and student objects as
    ``data.students``, in the same iteration order.
    """
    return {
        key: student
        for key, student in data.students.items()
        if student.major == major
    }

def get_prop(course, major=""):
    """Return the fraction of a student population enrolled in ``course``.

    Parameters
    ----------
    course : key into ``data.courses``.
    major : major name to restrict the population to. The default ``""``
        means "all students in ``data.students``".

    Returns
    -------
    float
        (students of the population enrolled in ``course``) /
        (size of the population). Returns ``0.0`` when the population is
        empty instead of raising ``ZeroDivisionError``.

    Raises
    ------
    KeyError
        If ``course`` is not present in ``data.courses``.
    """
    enrolled = data.courses[course].students

    if major == "":
        takers = len(enrolled)
        population = len(data.students)
    else:
        # Count only enrolled students who are in the major. Dividing the
        # whole course roster by the number of majors (as before) overstates
        # the share and can exceed 100%.
        takers = sum(1 for s in enrolled if s.major == major)
        population = sum(1 for _, s in data.students.items() if s.major == major)

    if population == 0:
        return 0.0
    return takers / population

# Example: report the share for one course/major pair.
# `course`, `major` and `p` are bound at notebook (module) level here.
course = "CS1110"
major = "Computer Science"
p = get_prop(course, major)
# `.2%` renders the fraction as a percentage with two decimals.
print(f"{p:.2%} of {major} majors take {course}")

In [None]:
def createCourseGraph(pattern, majorKeyword, major, core):
    """Plot and return per-course enrollment proportions for one major.

    Parameters
    ----------
    pattern : regex (string or compiled) matched with ``re.match`` against
        every key of ``data.courses`` to select the courses to include.
    majorKeyword : course-code prefix (e.g. ``"CS"``); used to build the core
        course names and in the chart title.
    major : major name forwarded to ``get_prop``.
    core : iterable of course numbers; ``majorKeyword + str(n)`` is treated as
        a core course and drawn in blue, everything else in grey.

    Returns
    -------
    pandas.DataFrame
        Columns ``Course``, ``Proportion`` and ``Core`` (the bar colour), for
        all matching courses, sorted by ``Proportion`` descending with a fresh
        0..n-1 index. Only the first 31 rows are drawn.
    """
    courses = [name for name in data.courses.keys() if re.match(pattern, name)]
    proportions = [get_prop(name, major) for name in courses]

    # Set membership instead of scanning a list once per course.
    coreNames = {majorKeyword + str(n) for n in core}

    df = pd.DataFrame({"Course": courses, "Proportion": proportions})
    df = df.sort_values("Proportion", ascending=False).reset_index(drop=True)
    df['Core'] = df.Course.map(lambda name: 'blue' if name in coreNames else '#bbbbbb')
    # .loc slicing is label-based and inclusive on both ends: rows 0..30.
    sub_df = df.loc[0:30]

    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    ax.barh(sub_df.Course, sub_df.Proportion, color=sub_df.Core)
    ax.invert_yaxis()  # largest proportion at the top
    ax.set_xlabel("Proportion")
    ax.set_ylabel("Course")
    ax.set_title("Proportion of {} Majors who take {} Classes".format(majorKeyword, majorKeyword))
    # plt.show() is the documented way to render pyplot figures; Figure.show()
    # warns on non-GUI backends such as the notebook inline backend.
    plt.show()
    return df

# Core course numbers per major; createCourseGraph prefixes them with the
# department keyword (e.g. 1110 -> "CS1110") and highlights them in the chart.
cs_core = [1110, 1112, 2110, 2800, 2802, 3110, 3410, 3420, 4410, 4820]
# The regex keeps only course ids of the form <keyword> + exactly four digits.
csDF = createCourseGraph(re.compile(r"^CS\d{4}$"), "CS", "Computer Science", cs_core)

info_core = [1200, 1300, 2040, 2450, 2950]
infoDF = createCourseGraph(re.compile(r"^INFO\d{4}$"), "INFO", "Information Science", info_core)

stsci_core = [2200,3200,3080,4030,4090,4520]
stsciDF = createCourseGraph(re.compile(r"^STSCI\d{4}$"),"STSCI","Statistical Science",stsci_core)

In [None]:
# Show the full core course ids (department keyword + number) for each major.
print(["CS" + str(core) for core in cs_core])
print(["INFO" + str(core) for core in info_core])
print(["STSCI" + str(core) for core in stsci_core])

In [None]:
#TODO: build a graph of most common nodes to build generic pathway
def makeGraphTemplate(reqCourses:List[str]):
    """Build an all-zero adjacency table over ``reqCourses``.

    Returns a dict mapping every course to its own, independent inner dict
    that maps every course (including itself) to ``0``.
    """
    return {
        source: {target: 0 for target in reqCourses}
        for source in reqCourses
    }

def makeGraph(classes:Dict[int,List[str]], reqCourseGraph:Dict[str,Dict[str,int]]):
    """Add one student's term-to-term course transitions to ``reqCourseGraph``.

    ``classes`` is walked in its iteration order. For every pair of adjacent
    entries, each course in the earlier entry gets its counter toward each
    course in the later entry incremented by one. An empty entry therefore
    breaks the chain between its neighbours.

    ``reqCourseGraph`` is mutated in place and also returned.
    """
    perTerm = list(classes.values())
    # Pair each term's course list with the one that directly follows it.
    for earlier, later in zip(perTerm, perTerm[1:]):
        for nextCourse in later:
            for prevCourse in earlier:
                reqCourseGraph[prevCourse][nextCourse] += 1
    return reqCourseGraph

def makeGraphForMajor(requiredCourses:List[str], major:str):
    """Count consecutive-term transitions between required courses for a major.

    For every student returned by ``get_students_in_major(major)``, the
    student's ``term_numbers`` (course -> term) are bucketed by term, keeping
    only courses in ``requiredCourses``, and the buckets are fed to
    ``makeGraph``.

    Parameters
    ----------
    requiredCourses : course ids forming the nodes of the graph.
    major : major name used to select students.

    Returns
    -------
    dict
        ``graph[a][b]`` is the number of students who took ``a`` in one term
        and ``b`` in the term directly after it.
    """
    reqCourseGraph = makeGraphTemplate(requiredCourses)
    required = set(requiredCourses)

    for student in get_students_in_major(major).values():
        # course -> term, restricted to the courses we are graphing
        relevant = {
            course: term
            for course, term in student.term_numbers.items()
            if course in required
        }

        # Always cover terms 1..12, and widen the window if a student has a
        # required course outside it; a fixed range(1, 13) raised KeyError in
        # that case. Empty buckets are kept so that only directly adjacent
        # terms count as a transition.
        firstTerm = min(1, min(relevant.values(), default=1))
        lastTerm = max(12, max(relevant.values(), default=12))
        termToClass = {term: [] for term in range(firstTerm, lastTerm + 1)}
        for course, term in relevant.items():
            termToClass[term].append(course)

        reqCourseGraph = makeGraph(termToClass, reqCourseGraph)
    return reqCourseGraph

In [None]:
def makeNodes(graph):
    """Turn the keys of ``graph`` into a list of node dicts.

    Each node is ``{"id": key}``. When the key contains a digit, the first
    digit character is also stored under ``"term"``; keys without any digit
    get no ``"term"`` entry.
    """
    def toNode(courseId):
        node = {"id": courseId}
        leadingDigit = next((ch for ch in courseId if ch.isdigit()), None)
        if leadingDigit is not None:
            node["term"] = leadingDigit
        return node

    return [toNode(courseId) for courseId in graph.keys()]

def makeLinks(graph, threshold):
    """Return the strongest outgoing links of every node in ``graph``.

    Parameters
    ----------
    graph : dict mapping source course -> {target course -> count}.
    threshold : fraction; a target is linked when its count divided by the
        source's largest count is ``>= threshold``.

    Returns
    -------
    list of ``{"source": ..., "target": ...}`` dicts, in graph iteration order.
    Sources with no targets, or whose counts are all zero, contribute no links
    (previously these raised ``ValueError`` / ``ZeroDivisionError``).
    """
    links = []
    for sourceClass, targetClasses in graph.items():
        maxValue = max(targetClasses.values(), default=0)
        if maxValue == 0:
            # No transitions recorded from this course: nothing to normalise.
            continue
        for targetClass, count in targetClasses.items():
            if count / maxValue >= threshold:
                links.append({"source":sourceClass, "target":targetClass})
    return links

def makeNodesLinks(graph):
    """Return ``(nodes, links)`` for ``graph`` with a link threshold of 1.

    Nodes come from ``makeNodes``; links come from ``makeLinks(graph, 1)``.
    """
    return makeNodes(graph), makeLinks(graph, 1)
    
def makeNodesLinksPerc(graph, threshold):
    """Return ``(nodes, links)`` for ``graph`` at the given link threshold.

    Nodes come from ``makeNodes``; links come from
    ``makeLinks(graph, threshold)``.
    """
    return makeNodes(graph), makeLinks(graph, threshold)
    
def makeNodesLinksPercDF(graph, threshold):
    """Return the strongest links of ``graph`` as a two-column DataFrame.

    Parameters
    ----------
    graph : dict mapping source course -> {target course -> count}.
    threshold : fraction; a target is kept when its count divided by the
        source's largest count is ``>= threshold`` (the same comparison
        ``makeLinks`` uses, so a threshold of 1 keeps each source's top
        target rather than nothing).

    Returns
    -------
    pandas.DataFrame
        Columns ``from`` and ``to``, one row per kept link. Sources with no
        targets, or whose counts are all zero, are skipped instead of raising.
    """
    source = []
    target = []
    for sourceClass, targetClasses in graph.items():
        maxValue = max(targetClasses.values(), default=0)
        if maxValue == 0:
            # No transitions recorded from this course: nothing to normalise.
            continue
        for targetClass, count in targetClasses.items():
            if count / maxValue >= threshold:
                source.append(sourceClass)
                target.append(targetClass)
    return pd.DataFrame({"from":source,"to":target})

In [None]:
# Take the first 10 courses of each frame. createCourseGraph returns them
# sorted by Proportion descending, so these are the 10 most-taken courses.
csReqCourses = list(csDF.drop("Core",axis=1)[:10]["Course"])
infoReqCourses = list(infoDF.drop("Core",axis=1)[:10]["Course"])
stsciReqCourses = list(stsciDF.drop("Core",axis=1)[:10]["Course"])
# Nested-dict transition counts between those courses, one graph per major.
csGraph = makeGraphForMajor(csReqCourses, "Computer Science")
infoGraph = makeGraphForMajor(infoReqCourses, "Information Science")
stsciGraph = makeGraphForMajor(stsciReqCourses, "Statistical Science")
# Print (nodes, links), keeping links whose count is at least 75% of the
# source course's largest count.
# print(makeNodesLinks(csGraph))
print(makeNodesLinksPerc(csGraph, 0.75))
# print(makeNodesLinks(infoGraph))
print(makeNodesLinksPerc(infoGraph, 0.75))
# print(makeNodesLinks(stsciGraph))
print(makeNodesLinksPerc(stsciGraph, 0.75))

# Using Danny's Graph

In [None]:
#using Danny's thing
import graph

In [None]:
def uniqueList(edges):
    """Keep the first edge seen for each distinct weight (``edge[2]``).

    Order of first appearance is preserved. Note that uniqueness is decided
    by the weight alone, so two different course pairs with the same weight
    collapse to whichever comes first in ``edges``.
    """
    firstWithWeight = {}
    for edge in edges:
        # setdefault leaves an existing entry untouched, so the first wins.
        firstWithWeight.setdefault(edge[2], edge)
    return list(firstWithWeight.values())

def makeGraph2(reqCourses, major, g):
    """Add the heaviest course-to-course edges for ``major`` to graph ``g``.

    For every ordered pair of distinct courses ``(src, dst)`` in
    ``reqCourses``, students enrolled in ``src`` whose major is ``major`` are
    tallied two ways: those who took ``dst`` in the term right after ``src``
    and those who took both in the same term. The larger tally (the same-term
    tally on ties) becomes the weight of the candidate edge.

    Candidates are sorted by weight descending, reduced with ``uniqueList``
    (one edge per distinct weight), and the first 10 are added through
    ``g.addEdge(src, dst, weight)``.

    Returns ``(g, maxValue)`` where ``maxValue`` is the largest candidate
    weight seen (0 if there were none).
    """
    candidates = []
    maxValue = 0
    for src in reqCourses:
        for dst in reqCourses:
            if dst == src:
                continue

            followCount = 0
            togetherCount = 0
            for student in data.courses[src].students:
                if student.major != major:
                    continue
                terms = student.term_numbers
                if dst not in terms:
                    continue
                if terms[dst] == terms[src] + 1:
                    followCount += 1
                elif terms[dst] == terms[src]:
                    togetherCount += 1

            weight = followCount if followCount > togetherCount else togetherCount
            candidates.append([src, dst, weight])
            if weight > maxValue:
                maxValue = weight

    ranked = sorted(candidates, key=lambda edge: edge[2], reverse=True)
    for src, dst, weight in uniqueList(ranked)[:10]:
        g.addEdge(src, dst, weight)

    return (g, maxValue)

def displayGraph(graph, threshold):
    """Print a banner, then every edge of ``graph`` heavier than ``threshold``.

    Edges are obtained from ``graph.filterEdges`` with a predicate that keeps
    edges whose ``weight`` is strictly greater than ``threshold``; each one is
    printed via ``str``.
    """
    def isHeavyEnough(edge):
        return edge.weight > threshold

    print("DISPLAYING . . .")
    for edge in graph.filterEdges(isHeavyEnough):
        print(str(edge))

In [None]:
# NOTE: csGraph / infoGraph / stsciGraph were nested dicts in the earlier
# cell; from here on the same names are rebound to graph.Graph objects.
csGraph = graph.Graph()
infoGraph = graph.Graph()
statsGraph = graph.Graph()
# makeGraph2 returns the graph object it was given plus the largest edge weight.
csGraph,maxCS = makeGraph2(csReqCourses, "Computer Science", csGraph)
infoGraph,maxINFO = makeGraph2(infoReqCourses, "Information Science", infoGraph)
# statsGraph is passed in and handed back, so stsciGraph is that same object.
stsciGraph,maxSTATS = makeGraph2(stsciReqCourses, "Statistical Science", statsGraph)

# Threshold 0: print only edges with a strictly positive weight.
displayGraph(csGraph, 0)
displayGraph(infoGraph, 0)
displayGraph(stsciGraph, 0)

In [None]:
# Export each major's graph via the project-local Graph.export_graph.
# NOTE(review): the second argument (0) is presumably a weight threshold and
# the first an output path without extension — confirm against graph.py.
csGraph.export_graph("data/CS_req_test", 0)
infoGraph.export_graph("data/INFO_req_test", 0)
stsciGraph.export_graph("data/stsci_req_test", 0)

In [None]:
# Export each major's graph via the project-local Graph.export_json.
# NOTE(review): the second argument (0) is presumably a weight threshold and
# the first an output path without extension — confirm against graph.py.
csGraph.export_json("data/cs_req", 0)
infoGraph.export_json("data/is_req", 0)
stsciGraph.export_json("data/stsci_req", 0)