# Make smart suggestions for followup courses (classes in the learning sense) using unsupervised KNN (K-Nearest-Neighbors)

In [1]:
import pandas as pd

courses = pd.read_csv("../../datasets/harvard_mit_online_courses.csv")
courses.dropna(inplace=True)
courses.head()

Unnamed: 0,Institution,Course Number,Launch Date,Course Title,Instructors,Course Subject,Year,Honor Code Certificates,Participants (Course Content Accessed),Audited (> 50% Course Content Accessed),...,% Certified of > 50% Course Content Accessed,% Played Video,% Posted in Forum,% Grade Higher Than Zero,Total Course Hours (Thousands),Median Hours for Certification,Median Age,% Male,% Female,% Bachelor's Degree or Higher
0,MITx,6.002x,09/05/2012,Circuits and Electronics,Khurram Afridi,"Science, Technology, Engineering, and Mathematics",1,1,36105,5431,...,54.98,83.2,8.17,28.97,418.94,64.45,26.0,88.28,11.72,60.68
1,MITx,6.00x,09/26/2012,Introduction to Computer Science and Programming,"Eric Grimson, John Guttag, Chris Terman",Computer Science,1,1,62709,8949,...,64.05,89.14,14.38,39.5,884.04,78.53,28.0,83.5,16.5,63.04
2,MITx,3.091x,10/09/2012,Introduction to Solid State Chemistry,Michael Cima,"Science, Technology, Engineering, and Mathematics",1,1,16663,2855,...,72.85,87.49,14.42,34.89,227.55,61.28,27.0,70.32,29.68,58.76
3,HarvardX,CS50x,10/15/2012,Introduction to Computer Science,"David Malan, Nate Hardison, Rob Bowden, Tommy ...",Computer Science,1,1,129400,12888,...,11.11,0.0,0.0,1.11,220.9,0.0,28.0,80.02,19.98,58.78
4,HarvardX,PH207x,10/15/2012,Health in Numbers: Quantitative Methods in Cli...,"Earl Francis Cook, Marcello Pagano","Government, Health, and Social Science",1,1,52521,10729,...,47.12,77.45,15.98,32.52,804.41,76.1,32.0,56.78,43.22,88.33


We mainly care about the subjects. Ratings for each course would have been useful but let's see how it performs without. 

To be able to compute the distance between two courses based on how similar there subjects are, need to convert the subjects categories into numerical values. Need to search for datasets with more subjects. 

In [2]:
subjects = courses["Course Subject"].unique()
print(subjects)
print(len(subjects))
subject_dict = {}
i = 0
for subject in subjects:
    subject_dict[subject] = [i]
    i+=1

['Science, Technology, Engineering, and Mathematics' 'Computer Science'
 'Government, Health, and Social Science'
 'Humanities, History, Design, Religion, and Education']
4


In [3]:
courses["Course Subject"] = courses["Course Subject"].map(subject_dict)

In [4]:
courses.head()

Unnamed: 0,Institution,Course Number,Launch Date,Course Title,Instructors,Course Subject,Year,Honor Code Certificates,Participants (Course Content Accessed),Audited (> 50% Course Content Accessed),...,% Certified of > 50% Course Content Accessed,% Played Video,% Posted in Forum,% Grade Higher Than Zero,Total Course Hours (Thousands),Median Hours for Certification,Median Age,% Male,% Female,% Bachelor's Degree or Higher
0,MITx,6.002x,09/05/2012,Circuits and Electronics,Khurram Afridi,[0],1,1,36105,5431,...,54.98,83.2,8.17,28.97,418.94,64.45,26.0,88.28,11.72,60.68
1,MITx,6.00x,09/26/2012,Introduction to Computer Science and Programming,"Eric Grimson, John Guttag, Chris Terman",[1],1,1,62709,8949,...,64.05,89.14,14.38,39.5,884.04,78.53,28.0,83.5,16.5,63.04
2,MITx,3.091x,10/09/2012,Introduction to Solid State Chemistry,Michael Cima,[0],1,1,16663,2855,...,72.85,87.49,14.42,34.89,227.55,61.28,27.0,70.32,29.68,58.76
3,HarvardX,CS50x,10/15/2012,Introduction to Computer Science,"David Malan, Nate Hardison, Rob Bowden, Tommy ...",[1],1,1,129400,12888,...,11.11,0.0,0.0,1.11,220.9,0.0,28.0,80.02,19.98,58.78
4,HarvardX,PH207x,10/15/2012,Health in Numbers: Quantitative Methods in Cli...,"Earl Francis Cook, Marcello Pagano",[2],1,1,52521,10729,...,47.12,77.45,15.98,32.52,804.41,76.1,32.0,56.78,43.22,88.33


Further improvements for later: find the unique categories by splitting on '&', for example: `['Biography & Autobiography']` would become an array of say `[1, 2]`. This way something like `[Science Fiction & Historical Fiction]` could be compared with books that are strictly `Science Fiction` and `Historical Fiction`. 
Another potential improvement would be to cast authors to numerical data too. 

In [5]:
# calculate the percentage of participants who completed the course and became certified
courses["Completion Rate"] = courses["Certified"]/courses["Participants (Course Content Accessed)"] * 100
courses.head()

Unnamed: 0,Institution,Course Number,Launch Date,Course Title,Instructors,Course Subject,Year,Honor Code Certificates,Participants (Course Content Accessed),Audited (> 50% Course Content Accessed),...,% Played Video,% Posted in Forum,% Grade Higher Than Zero,Total Course Hours (Thousands),Median Hours for Certification,Median Age,% Male,% Female,% Bachelor's Degree or Higher,Completion Rate
0,MITx,6.002x,09/05/2012,Circuits and Electronics,Khurram Afridi,[0],1,1,36105,5431,...,83.2,8.17,28.97,418.94,64.45,26.0,88.28,11.72,60.68,8.317408
1,MITx,6.00x,09/26/2012,Introduction to Computer Science and Programming,"Eric Grimson, John Guttag, Chris Terman",[1],1,1,62709,8949,...,89.14,14.38,39.5,884.04,78.53,28.0,83.5,16.5,63.04,9.221962
2,MITx,3.091x,10/09/2012,Introduction to Solid State Chemistry,Michael Cima,[0],1,1,16663,2855,...,87.49,14.42,34.89,227.55,61.28,27.0,70.32,29.68,58.76,12.494749
3,HarvardX,CS50x,10/15/2012,Introduction to Computer Science,"David Malan, Nate Hardison, Rob Bowden, Tommy ...",[1],1,1,129400,12888,...,0.0,0.0,1.11,220.9,0.0,28.0,80.02,19.98,58.78,1.112056
4,HarvardX,PH207x,10/15/2012,Health in Numbers: Quantitative Methods in Cli...,"Earl Francis Cook, Marcello Pagano",[2],1,1,52521,10729,...,77.45,15.98,32.52,804.41,76.1,32.0,56.78,43.22,88.33,9.630434


In [6]:
course_dict = {}
for index, row in courses.iterrows():
    course_dict[index] = (row['Course Number'], row['Course Title'], row['Institution'], row['Instructors'], row['Course Subject'], row['Launch Date'], row['Completion Rate'], row['Total Course Hours (Thousands)'])

In [7]:
course_dict[0]

('6.002x',
 'Circuits and Electronics',
 'MITx',
 'Khurram Afridi',
 [0],
 '09/05/2012',
 8.317407561279602,
 418.94)

Now, following the tutorial for KNN, compute the distance between two given course. With the lack of rating I'll use the Certified Percentage (percentage of participants who gained a cerfication).

In [8]:
from scipy import spatial

def ComputeDistance(a, b):
    subjectsA = a[4]
    subjectsB = b[4]
    genreDistance = spatial.distance.cosine(subjectsA, subjectsB)
    completionRatesA = a[6]
    completionRatesB = b[6]
    completionDistance = abs(completionRatesA - completionRatesB)
    return genreDistance + completionDistance
    


The higher the distance, the less similar the courses are. A course on circuits vs a course on China are subjectively very different. 

In [17]:
ComputeDistance(course_dict[0], course_dict[100])

13.135525590152866

In [18]:
print(course_dict[0])
print(course_dict[100])


('6.002x', 'Circuits and Electronics', 'MITx', 'Khurram Afridi', [0], '09/05/2012', 8.317407561279602, 418.94)
('SW12.10x', "Greater China Today: The People's Republic, Taiwan, and Hong Kong", 'HarvardX', 'Peter Bol, Bill Kirby', [3], '01/05/2015', 21.452933151432468, 48.23)


In [16]:
# Now try with two comp sci courses
ComputeDistance(course_dict[1], course_dict[3])

8.10990611845303

In [19]:
import operator

def getNeighbors(courseID, K):
    distances = []
    for book in course_dict:
        if (book != courseID):
            dist = ComputeDistance(course_dict[courseID], course_dict[book])
            distances.append((book, dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(K):
        neighbors.append(distances[x][0])
    return neighbors



In [20]:
K = 10
avgCompletion = .01
neighbors = getNeighbors(10, K)
print(neighbors)
for neighbor in neighbors:
    avgCompletion += course_dict[neighbor][6]
    print (course_dict[neighbor][0] + " " + str(course_dict[neighbor][1]))
    
avgCompletion /= K

[6, 132, 128, 268, 15, 165, 205, 125, 161, 252]
3.091x Introduction to Solid State Chemistry
AMPOx.4 Poetry in America: Emily Dickinson
16.00x Introduction to Aerospace Engineering: Astronautics and Human Spaceflight
3.054.3x Cellular Solids: Applications in Nature
8.MREV Mechanics ReView
3.032.1x Mechanical Behavior of Materials: Linear Elastic Behavior
SW12.10x Greater China Today: The People's Republic, Taiwan, and Hong Kong
PH525.3x Advanced Statistics for the Life Sciences
6.00.1x Introduction to Computer Science and Programming Using Python
18.01.3x Calculus: Coordinate Systems and Infinite Series


In [21]:
avgCompletion

6.612953442909993

Compare with actual completion rate of the course

In [22]:
course_dict[10][6]

6.584288443170965

Not bad, this is good for suggesting courses to take next if they liked this course and want to see similar courses. This approach would be better with course ratings.  

## Switching to Self Assessment

What if a user provides us with a subject they want to learn, again this would perform better with course ratings. And a completion rate they are comfortable with. Could provide this in the form of easy, medium, hard, and define what those values are, for now let's say 10% completion rate. 

In [23]:
user_input = {"Course Subject": ["Computer Science"], "Completion Rate": [10]}
user_df = pd.DataFrame(data=user_input)
user_df["Course Subject"] = user_df["Course Subject"].map(subject_dict)

user_dict = {}
for index, row in user_df.iterrows():
    user_dict[index] = (row['Course Subject'], row['Completion Rate'])

In [24]:
def ComputeDistance(a, b):
    subjectsA = a[0]
    subjectsB = b[4]
    genreDistance = spatial.distance.cosine(subjectsA, subjectsB)
    completionRatesA = a[1]
    completionRatesB = b[6]
    completionDistance = abs(completionRatesA - completionRatesB)
    return genreDistance + completionDistance

In [25]:
def getNeighbors(input, K):
    distances = []
    for course in course_dict:
        dist = ComputeDistance(input[0], course_dict[course])
        distances.append((course, dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(K):
        neighbors.append(distances[x][0])
    return neighbors

In [30]:
K = 10
avgCompletion = .01
neighbors = getNeighbors(user_dict, K)
print(neighbors)
for neighbor in neighbors:
    avgCompletion += course_dict[neighbor][6]
    print (course_dict[neighbor][0] + " " + str(course_dict[neighbor][1]))
    
avgCompletion /= K

{0: ([1], 10)}
[194, 69, 75, 61, 88, 146, 153, 137, 4, 256]
3.032.2x Mechanical Behavior of Materials: Stress Transformations, Beams, Columns, and Cellular Solids
6.00.1x Introduction to Computer Science and Programming
1368.1x Saving Schools: History, Politics, and Policy of U.S. Education – History and Politics in U.S. Education
USW30x Tangible Things
11.132x Design and Development of Educational Technology
GSE3x Introduction to Data Wise: A Collaborative Process to Improve Learning and Teaching
6.00.1x Introduction to Computer Science and Programming
JPAL 101x Evaluating Social Programs
PH207x Health in Numbers: Quantitative Methods in Clinical and Public Health Research
3.054.2x Cellular Solids: Applications in Medicine


In [31]:
avgCompletion

10.050242993857015

In [32]:
# eyeball math to make sure completion rate is similar
course_dict[194]

('3.032.2x',
 'Mechanical Behavior of Materials: Stress Transformations, Beams, Columns, and Cellular Solids',
 'MITx',
 'Lorna J. Gibson',
 [0],
 '10/14/2015',
 10.069225928256765,
 20.12)

Followup improvements, find a data set with ratings for courses and more subjects. 