# Sample ML Model of Recommendation Engine                         
Scroll to the very bottom to see the demo.

In [1]:
#importing necessary packages/modules
import warnings
warnings.simplefilter("ignore")
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from imblearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [2]:
# Load fake_student_data.csv, a mock-up of student profiles and their interests.
# A real model would first need this data collected from students; a survey
# service such as Qualtrics can export the responses as a CSV file.

# Show every column when a frame is displayed instead of truncating wide ones.
pd.options.display.max_columns = None
data = pd.read_csv('fake_student_data.csv', skipinitialspace=True)
print("Every Feature: \n", data.columns)

Every Feature: 
 Index(['id', 'age', 'gender', 'education_level', 'career_fields_of_interest',
       'interest_in_connections', 'interest_in_creations',
       'interest_in_exploration', 'interest_in_health', 'interest_in_living',
       'interest_in_justice', 'resource1', 'resource2', 'resource3'],
      dtype='object')


In [3]:
# One-hot encode the categorical survey answers so those feature columns become
# numeric indicator columns, then drop the 'id' column.
categorical_cols = ['gender', 'education_level', 'career_fields_of_interest']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=False).drop(columns=['id'])
print("Columns After One Hot Encoding", data.columns, sep="\n")


Columns After One Hot Encoding
Index(['age', 'interest_in_connections', 'interest_in_creations',
       'interest_in_exploration', 'interest_in_health', 'interest_in_living',
       'interest_in_justice', 'resource1', 'resource2', 'resource3',
       'gender_Female', 'gender_Male', 'education_level_Graduate',
       'education_level_High School', 'education_level_Undergraduate',
       'career_fields_of_interest_Architecture',
       'career_fields_of_interest_Art and Design',
       'career_fields_of_interest_Biology',
       'career_fields_of_interest_Business',
       'career_fields_of_interest_Communications',
       'career_fields_of_interest_Computer Science',
       'career_fields_of_interest_Education',
       'career_fields_of_interest_Engineering',
       'career_fields_of_interest_Environmental Science',
       'career_fields_of_interest_Finance',
       'career_fields_of_interest_History',
       'career_fields_of_interest_Journalism', 'career_fields_of_interest_Law',
     

In [4]:
# Fill missing numeric values with the mean of their column.
# numeric_only=True matters because `data` still contains the string-valued
# resource1..resource3 columns: pandas >= 2.0 raises a TypeError when asked for
# the mean of a non-numeric column, whereas older versions silently skipped it.
data = data.fillna(data.mean(numeric_only=True))

In [5]:
# List every pair of distinct features whose absolute correlation exceeds 0.6.
# Only numeric/boolean columns are correlated: `data` still holds the
# string-valued resource columns, which pandas >= 2.0 refuses to correlate
# (older versions dropped them silently, so the result is the same there).
corr_matrix = data.select_dtypes(include=['number', 'bool']).corr()
corr_list = []
print("Correlation of Features with a value greater than 0.6 or less than -0.6")
feature_names = corr_matrix.columns
for i in range(len(feature_names)):
    # j < i walks the lower triangle only: each pair appears once and the
    # diagonal (a feature correlated with itself) is skipped.
    for j in range(i):
        value = corr_matrix.iloc[i, j]
        if abs(value) > 0.6:
            corr_list.append([feature_names[i], feature_names[j], value])

corr_frame = pd.DataFrame(corr_list, columns = ['Feature 1', 'Feature 2', 'Correlation'])
# Styler.set_precision() was removed in pandas 2.0. A per-column format string
# renders the same four decimals and works on old and new pandas alike.
corr_frame = corr_frame.style.format({'Correlation': '{:.4f}'})
display(corr_frame)

Correlation of Features with a value greater than 0.6 or less than -0.6


Unnamed: 0,Feature 1,Feature 2,Correlation
0,interest_in_living,interest_in_connections,0.6007
1,gender_Male,gender_Female,-1.0
2,education_level_Graduate,age,0.7071
3,education_level_High School,age,-0.7071
4,education_level_Undergraduate,education_level_Graduate,-0.6124
5,education_level_Undergraduate,education_level_High School,-0.6124


In [6]:
# Disabled experiment, kept as a bare string so it never executes: min-max
# scaling of the numeric survey columns followed by one-hot encoding.
# NOTE(review): it would not run as written in the cells shown here --
# MinMaxScaler is never imported and clean_data is never assigned.
'''
# Normalize numerical variables
scaler = MinMaxScaler()
numerical_cols = ['age', 'interest_in_connections', 'interest_in_creations',
                  'interest_in_exploration', 'interest_in_health', 'interest_in_living',
                  'interest_in_justice']
clean_data[numerical_cols] = scaler.fit_transform(clean_data[numerical_cols])
# Create feature vectors for each student
features = pd.get_dummies(clean_data).values
'''

"\n# Normalize numerical variables\nscaler = MinMaxScaler()\nnumerical_cols = ['age', 'interest_in_connections', 'interest_in_creations',\n                  'interest_in_exploration', 'interest_in_health', 'interest_in_living',\n                  'interest_in_justice']\nclean_data[numerical_cols] = scaler.fit_transform(clean_data[numerical_cols])\n# Create feature vectors for each student\nfeatures = pd.get_dummies(clean_data).values\n"

# The Dataset
This is a mockup of what a survey to students would look like. I assume questions about age, interests, and experiences would be asked. Student answers can be used to match students with relevant resources. Here I randomly generated student profiles and randomly assigned each of them 3 resources. These 3 resources represent recommendations we would make from what we know of them.

Let's say we ask existing students to tell us what their 3 favorite resources are. When a new student comes, we can use their information and reference our existing student data (for current students) to suggest resources that other students (similar to the new student) enjoyed. 

In [7]:
# Separate the three recommendation targets from the predictor columns.
# Each target is kept as a one-column DataFrame (not a Series).
resource_cols = ['resource1', 'resource2', 'resource3']
r1, r2, r3 = (data[[col]] for col in resource_cols)
features_data = data.drop(columns=resource_cols)
features_data

Unnamed: 0,age,interest_in_connections,interest_in_creations,interest_in_exploration,interest_in_health,interest_in_living,interest_in_justice,gender_Female,gender_Male,education_level_Graduate,education_level_High School,education_level_Undergraduate,career_fields_of_interest_Architecture,career_fields_of_interest_Art and Design,career_fields_of_interest_Biology,career_fields_of_interest_Business,career_fields_of_interest_Communications,career_fields_of_interest_Computer Science,career_fields_of_interest_Education,career_fields_of_interest_Engineering,career_fields_of_interest_Environmental Science,career_fields_of_interest_Finance,career_fields_of_interest_History,career_fields_of_interest_Journalism,career_fields_of_interest_Law,career_fields_of_interest_Medicine,career_fields_of_interest_Political Science,career_fields_of_interest_Psychology,career_fields_of_interest_Public Health,career_fields_of_interest_Social Work
0,18,4,2,5,3,4,2.0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,20,3,4,4,4,3,4.0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,22,2,3,2,4,3,5.0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,19,5,5,5,5,5,5.0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,21,4,2,3,5,4,3.0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5,18,3,4,5,2,5,4.0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,20,2,3,4,2,3,5.0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
7,19,5,5,3,4,5,2.0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,22,4,4,4,3,3,4.0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
9,21,3,2,5,4,4,5.0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


# The Models
Because the data is randomly generated (there are no real correlations between the student attributes and which resources they're tagged with), most models will produce 50% correctness at most. The following models use information about the user to predict resources the user will enjoy (from information of other students).

In [8]:
# k-nearest-neighbours baseline, evaluated separately for each of the three
# resource targets. The grid search over n_neighbors runs inside every outer
# cross-validation fold, and the out-of-fold predictions are then scored.
scaler = StandardScaler()
neighbor = KNeighborsClassifier(n_neighbors = 3)
pip = Pipeline(steps = [('scaler', scaler), ('knn', neighbor)])
param_grid = {'knn__n_neighbors': [1, 2]}

for target in (r1, r2, r3):
    grid = GridSearchCV(pip, param_grid, cv = 5, scoring = 'accuracy')
    knn_pred = cross_val_predict(grid, features_data, target, cv = 5)
    print("Accuracy: \n", accuracy_score(target, knn_pred)*100)
    print("Confusion Matrix: \n", confusion_matrix(target, knn_pred))
    print()
    print("Classification Report: \n", classification_report(target, knn_pred))

Accuracy: 
 4.0
Confusion Matrix: 
 [[0 1 0 0 0 0 0 0 1 0 0 2 1 1 1 0]
 [1 0 0 0 0 0 0 0 0 0 2 0 0 1 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0]
 [0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0]
 [0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0]
 [1 2 0 1 0 0 0 0 0 0 0 1 0 0 0 0]
 [1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0]]

Classification Report: 
               precision    recall  f1-score   support

      Unit 1       0.00      0.00      0.00         7
     Unit 10       0.00      0.00      0.00         4
     Unit 11       0.00      0.00      0.00         1
     Unit 13       0.00      0.00      0.00         2
     Unit 14       0.00      0.00      0.00         3
     Unit 15       0.50      0.25      0.33         

In [9]:
import imblearn
from imblearn.over_sampling import SMOTE

# Fit an entropy-based decision tree on an 80/20 hold-out split, once per
# resource target, and predict the held-out rows.
tree_classifier = DecisionTreeClassifier(criterion = 'entropy')

for target in (r1, r2, r3):
    x_train, x_test, y_train, y_test = train_test_split(features_data, target, train_size = 0.8)
    print("\nShapes of Test/Train Sets")
    print("x_train: ", x_train.shape)
    print("x_test: ", x_test.shape)
    print("y_train: ", y_train.shape)
    print("y_test: ", y_test.shape)
    # SMOTE is imported above but never applied, so what is printed here is
    # the plain class distribution of the training labels.
    print("training label distribution is ", y_train.iloc[:, 0].value_counts())
    tree_classifier.fit(x_train, y_train)
    # Predict for every target, including the first one.
    prediction = tree_classifier.predict(x_test)


Shapes of Test/Train Sets
x_train:  (40, 30)
x_test:  (10, 30)
y_train:  (40, 1)
y_test:  (10, 1)
after smote distribution is now  Unit 1     6
Unit 10    4
Unit 18    4
Unit 3     3
Unit 9     3
Unit 8     3
Unit 6     3
Unit 15    3
Unit 5     2
Unit 16    2
Unit 4     2
Unit 13    2
Unit 11    1
Unit 2     1
Unit 14    1
Name: resource1, dtype: int64

Shapes of Test/Train Sets
x_train:  (40, 30)
x_test:  (10, 30)
y_train:  (40, 1)
y_test:  (10, 1)
after smote distribution is now  Unit 2     6
Unit 4     4
Unit 15    4
Unit 5     4
Unit 1     3
Unit 11    3
Unit 3     2
Unit 18    2
Unit 17    2
Unit 16    2
Unit 8     2
Unit 7     2
Unit 12    1
Unit 14    1
Unit 6     1
Unit 9     1
Name: resource2, dtype: int64

Shapes of Test/Train Sets
x_train:  (40, 30)
x_test:  (10, 30)
y_train:  (40, 1)
y_test:  (10, 1)
after smote distribution is now  Unit 2     7
Unit 1     5
Unit 3     5
Unit 9     4
Unit 17    4
Unit 4     3
Unit 7     3
Unit 14    2
Unit 8     2
Unit 6     1
Unit 16    

In [10]:
# Gaussian naive Bayes baseline, cross-validated separately for each of the
# three resource targets.
bayes_classifier = GaussianNB()
pip = Pipeline(steps = [('bayes', bayes_classifier)])

# Every metric for a target is computed against that same target. Pairing the
# label with its frame keeps predictions and ground truth from being mixed up
# between r1, r2 and r3.
for label, target in (("r1", r1), ("r2", r2), ("r3", r3)):
    print(label)
    naive_cross = cross_val_score(pip, features_data, target, cv = 5)
    print("Accuracy: ", naive_cross.mean() * 100)
    naive_pred = cross_val_predict(pip, features_data, target, cv = 5)
    print("Confusion Matrix: \n", confusion_matrix(target, naive_pred))
    print()
    print("Classification Report: \n", classification_report(target, naive_pred))


r1
Accuracy:  4.0
Confusion Matrix: 
 [[0 2 0 1 0 0 0 0 0 0 1 1 1 0 1 0]
 [1 0 0 0 0 0 0 0 0 0 2 0 0 0 1 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 2 1 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0]
 [1 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 0 1 1 0 1 0]
 [1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 1 0 2 0 0 0 0 0]]

Classification Report: 
               precision    recall  f1-score   support

      Unit 1       0.00      0.00      0.00         7
     Unit 10       0.00      0.00      0.00         4
     Unit 11       0.00      0.00      0.00         1
     Unit 13       0.00      0.00      0.00         2
     Unit 14       0.00      0.00      0.00         3
     Unit 15       0.25      0.50      0.33       

In [11]:
# Multi-layer perceptron baseline with a small grid over activation function
# and hidden-layer width, evaluated separately for each resource target.
# `scaler` is the StandardScaler instance created in the KNN cell.
mlp = MLPClassifier()
pip = Pipeline(steps = [('scaler', scaler), ('mlp', mlp)])
param_grid = {
    'mlp__activation': ['logistic', 'tanh', 'relu'],
    'mlp__hidden_layer_sizes' :[(5,),(10,),(15,)]
}

for target in (r1, r2, r3):
    grid = GridSearchCV(pip, param_grid, cv = 2, scoring = 'accuracy')
    mlp_pred = cross_val_predict(grid, features_data, target, cv = 2)
    # Accuracy is scored against the target that was predicted, so it agrees
    # with the confusion matrix and report printed below it.
    print("Accuracy: \n", accuracy_score(target, mlp_pred)*100)
    print("Confusion Matrix: \n", confusion_matrix(target, mlp_pred))
    print()
    print("Classification Report: \n", classification_report(target, mlp_pred))

Accuracy: 
 8e+00
Confusion Matrix: 
 [[3 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0]
 [3 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0]
 [2 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [2 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0]
 [2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [2 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [2 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0]
 [4 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0]]

Classification Report: 
               precision    recall  f1-score   support

      Unit 1       0.09      0.43      0.15         7
     Unit 10       0.00      0.00      0.00         4
     Unit 11       0.00      0.00      0.00         1
     Unit 13       0.00      0.00      0.00         2
     Unit 14       0.00      0.00      0.00         3
     Unit 15       0.50      0.25      0.33       

In [12]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over entropy-based decision trees, cross-validated separately for
# each of the three resource targets.
tree_base_classifier =  DecisionTreeClassifier(criterion = 'entropy')

# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, but it is the first positional parameter in both.
adc = AdaBoostClassifier(tree_base_classifier, n_estimators = 5)
pip = Pipeline(steps = [('ada',adc)])

for target in (r1, r2, r3):
    ada_pred = cross_val_predict(pip, features_data, target, cv = 5)
    print("Accuracy: \n", accuracy_score(target, ada_pred)*100)
    print("Confusion Matrix: \n", confusion_matrix(target, ada_pred))
    print()
    print("Classification Report: \n", classification_report(target, ada_pred))

Accuracy: 
 2.0
Confusion Matrix: 
 [[0 2 0 0 1 0 0 0 1 0 0 0 2 0 1 0]
 [1 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0]
 [0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0]
 [1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0]
 [0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 0]
 [1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0]]

Classification Report: 
               precision    recall  f1-score   support

      Unit 1       0.00      0.00      0.00         7
     Unit 10       0.00      0.00      0.00         4
     Unit 11       0.00      0.00      0.00         1
     Unit 13       0.00      0.00      0.00         2
     Unit 14       0.00      0.00      0.00         3
     Unit 15       0.25      0.25      0.25         

# Content Based Recommendations
Let's make recommendations for students based on lessons they've already completed.
https://helpseotools.com/text-tools/remove-special-characters
Use the tool above to remove special characters and format the text to be analyzed.

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the resource catalogue (one row per unit, with its overview text).
resources = pd.read_csv('resourcesText.csv', skipinitialspace = True)
# Fill missing numeric values from the resources frame's own column means.
# Using the student frame's means here would match none of these columns, and
# on pandas >= 2.0 it would raise because that frame holds string columns.
resources = resources.fillna(resources.mean(numeric_only=True))
resources

Unnamed: 0,ID,Unit,Category,Subject,Level,Overview
0,1,Unit 1,Exploration,Travel,Beginner,Week 1 Individual Session Dream Trips Trip Pla...
1,2,Unit 2,Living,Education,Beginner-Intermediate,Week 1 Individual Session Introduction to Educ...
2,3,Unit 3,Creations,Film And TV,Intermediate-Advanced,Week 1 Individual Session Introduction to Film...
3,4,Unit 4,Health,Health And Fitness,Beginner-Intermediate,Week 1 Individual Session Healthy Lifestyle Se...
4,5,Unit 5,Living,Work And Life,Beginner,Week 1 Individual Session Success in Life Sess...
5,6,Unit 6,Exploration,Nature And The Environment,Advanced,Week 1 Individual Session Animal Rights Sessio...
6,7,Unit 7,Connections,Relationships,Intermediate-Advanced,Week 1 Individual Session Making Friends Sessi...
7,8,Unit 8,Connections,Social Impact,Advanced,Week 1 Individual Session Eating Cultures Sess...
8,9,Unit 9,Connections,Comparing Cultures,Intermediate-Advanced,Week 1 Individual Session Eating Cultures Sess...
9,10,Unit 10,Creations,Writing,Intermediate,1 Week 1 Individual Session Journaling Session...


In [14]:
# Build a TF-IDF vector for every resource overview and compare all pairs.
tfidf = TfidfVectorizer(stop_words='english')
# Blank out missing overviews in the 'Overview' column itself, because that is
# the column vectorised below and the vectoriser rejects NaN documents.
resources['Overview'] = resources['Overview'].fillna('')
overview_matrix = tfidf.fit_transform(resources['Overview'])
# Pairwise dot products of the TF-IDF rows; TfidfVectorizer L2-normalises rows
# by default, in which case this equals cosine similarity.
similarity_matrix = linear_kernel(overview_matrix,overview_matrix)
# Lookup from unit name to that unit's row label in `resources`.
mapping = pd.Series(resources.index,index = resources['Unit'])

In [15]:
def recommend_resources(input, top_n=3):
    """Return the units whose overview text is most similar to a given unit.

    Reads the module-level `mapping` (unit name -> row), `similarity_matrix`
    (pairwise similarity scores) and `resources` (the catalogue frame).

    Parameters
    ----------
    input : str
        Unit name to look up in `mapping`, e.g. 'Unit 1'.
    top_n : int, default 3
        How many similar units to return.

    Returns
    -------
    pandas.Series
        The 'Unit' values of the `top_n` most similar rows, most similar first.

    Raises
    ------
    KeyError
        If `input` is not a unit name present in `mapping`.
    """
    index = mapping[input]
    # Drop the queried unit by position instead of assuming it sorts first.
    # If another unit ties with it (e.g. an identical overview), the query
    # could otherwise land at a later rank and be recommended to itself.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_matrix[index])
        if position != index
    ]
    # list.sort is stable, so tied scores keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    indices = [position for position, _ in candidates[:top_n]]
    return resources['Unit'].iloc[indices]

In [16]:
def predict_for_student (input):
    """Predict a student's three resources from their feature row(s).

    One decision tree is fitted per target (module-level `r1`, `r2`, `r3`) on
    all rows of the module-level `features_data`, then applied to `input`.

    Parameters
    ----------
    input : array-like of shape (n_students, n_features)
        Feature rows with the same columns, in the same order, as
        `features_data`.

    Returns
    -------
    tuple
        (prediction1, prediction2, prediction3): one array of predicted
        labels per resource target.
    """
    predictions = []
    for target in (r1, r2, r3):
        # Train on every labelled row: a hold-out split serves no purpose here
        # because nothing is evaluated, and an unseeded random split would make
        # the recommendation change from call to call.
        tree_classifier = DecisionTreeClassifier(random_state=0)
        # Targets are one-column frames; flatten them to the 1-D label vector.
        tree_classifier.fit(features_data, target.values.ravel())
        predictions.append(tree_classifier.predict(input))

    prediction1, prediction2, prediction3 = predictions
    return prediction1, prediction2, prediction3


# DEMO

In [17]:
# Content-based demo: look up the units most similar to 'Unit 1'.
prediction = recommend_resources('Unit 1')
print('Based on what lessons you like we recommend ', prediction, sep='\n')

Based on what lessons you like we recommend 
1      Unit 2
13    Unit 14
17    Unit 18
Name: Unit, dtype: object


In [18]:
# Profile-based demo: take the first student, set aside the resources they
# actually chose, and predict resources from the remaining feature columns.
resource_cols = ['resource1', 'resource2', 'resource3']
test_student = data.iloc[0]
actual1, actual2, actual3 = (test_student[col] for col in resource_cols)
# Drop the answers, then turn the Series back into a one-row frame.
test_student = test_student.drop(resource_cols, axis=0).to_frame().T
prediction1, prediction2, prediction3 = predict_for_student(test_student)
print('Based on your information we recommend ', prediction1, prediction2, 'and', prediction3)
print('You actually said you would enjoy ', actual1, actual2, 'and', actual3)

Based on your information we recommend  ['Unit 1'] ['Unit 2'] and ['Unit 3']
You actually said you would enjoy  Unit 1 Unit 2 and Unit 3
