# Users recommendation system for StudyLair.

## Algorithm for employing the Content-Based recommender system of study buddies.

#### Procedure recToUser(currUserID, wholeUsersFrame);
#### Input: ID of the user to give a rec to; dataset with all relevant info of the users
#### Output: the dataset/array of recommended users to the current user (sorted in the descending order of similarity)



### Stages:
#### 1) Clean the usersFrame and fetch the current user info (Data preprocessing stage)
#### 2) Create vectors for all users depending on the information about them and the criteria for treating this info and combining it (User-Vector representation stage)
#### 3) Find the users most similar to the current one and return them in descending order of similarity (the closest vectors in terms of Euclidean distance / cosine similarity)

Parameters encoding for each user:

ID: keep for the reference

name: lowercase all of it

gender: -1 - male; 0 - undefined; 1 - female

major: label encode

graduation year: subtract 2020 from it (relative year, ex: 2023 => 3)

time preference: label encode for each part (ex: 1 - monday, 3 - evening => 13 after encoding)

courses: same as above - partial encoding, then combine (ex: 300 - comp sci, 40 - 400 => 340 after encoding)

In [3]:
# Importing the needed libraries
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from scipy.spatial import distance

In [10]:
# The main function that is used to recommend other users to the given one,
# together with the small private helpers it relies on.


def _labelCodes(labels):
    """Return {label: code}, numbering the distinct labels 0..n-1 in sorted order."""
    return {label: code for code, label in enumerate(sorted(set(labels)))}


def _splitCourse(course):
    """Split a course title into its '/'-separated subject names and its encoded number.

    Example: 'COMP SCI/E C E 540' -> (['comp sci', 'e c e'], 54)

    Raises IndexError when the title contains no digits.
    """
    parts = re.split(r'([0-9]+)', course.lower())
    subjects = parts[0].strip().split('/')
    # the course number is encoded by dropping its last digit (540 -> 54)
    return subjects, int(parts[1]) // 10


def _groupTimeSlots(rawTimes):
    """Give overlapping time slots a shared integer code and return the codes in input order.

    Each raw value is the start and end of a slot glued together ('12,13' -> 1213).
    Values of 24 or less are treated as already encoded and are left untouched.
    """
    codes = list(rawTimes)
    nextCode = 0
    for i, value in enumerate(codes):
        if value <= 24:
            # already absorbed into the group of an earlier slot
            continue
        start, end = divmod(value, 100)
        codes[i] = nextCode
        for j in range(i + 1, len(codes)):
            if codes[j] <= 24:
                continue
            otherStart, otherEnd = divmod(codes[j], 100)
            # the later slot joins the group when it ends or starts inside this slot
            if (start < otherEnd <= end) or (start <= otherStart < end):
                codes[j] = nextCode
        nextCode += 1
    return codes


def recToUser(currUserID, wholeUsersFrame):
    """Recommend study buddies to one user, most similar first.

    Parameters
    ----------
    currUserID : ID of the user to whom to recommend (compared as a string,
        exact match against the 'ID' column).
    wholeUsersFrame : pandas DataFrame with the columns 'ID', 'gender', 'major'
        (list), 'graduationYear', 'courseList' (list), 'dayPref' (list) and
        'timeSlot' (list of lists of 'start,end' strings, one inner list per
        day of 'dayPref'). The frame and the lists stored in it are not modified.

    Returns
    -------
    list of [ID, score] pairs sorted by ascending Euclidean distance. IDs are
    returned as strings. The score is the closest distance over this user's
    distance, floored to a whole percent plus one, so the best match gets 100.0.
    An empty list is returned when there is nobody to compare with.

    Raises
    ------
    KeyError : when currUserID has no usable row (unknown ID or empty major list).
    """
    # ------------------------------------------- STAGE 1 ------------------------------------------------------
    genderCodes = {'female': 1, 'male': -1, 'none': 0}
    neededCols = ['ID', 'gender', 'major', 'graduationYear', 'courseList', 'dayPref', 'timeSlot']

    # one profile per (user, major) pair, in frame order, built from fresh
    # objects so the caller's nested lists are never touched
    profiles = []
    for userID, gender, majors, gradYear, courses, days, slotGroups in zip(
            *(wholeUsersFrame[col].tolist() for col in neededCols)):
        courseParts = []
        for course in courses:
            subjects, level = _splitCourse(course)
            courseParts.extend((subject, level) for subject in subjects)
        # only the 2 first letters of each day are needed
        dayKeys = [day[:2] for day in days]
        rawSlots = [[int(slot.replace(',', '')) for slot in group] for group in slotGroups]
        for major in majors:
            profiles.append({
                'id': str(userID),
                # values outside the ternary system are passed through unchanged
                'gender': genderCodes.get(gender, gender),
                'major': major,
                # relatively encoding the grad year
                'year': int(gradYear) - 2020,
                'courses': courseParts,
                'days': dayKeys,
                'slots': rawSlots,
            })

    # label encodings shared by all users
    majorCode = _labelCodes(profile['major'] for profile in profiles)
    subjectCode = _labelCodes(subject for profile in profiles for subject, _ in profile['courses'])
    dayCode = _labelCodes(day for profile in profiles for day in profile['days'])
    rawTimes = [slot for profile in profiles for group in profile['slots'] for slot in group]
    # when one raw value received several codes the last one wins
    timeCode = dict(zip(rawTimes, _groupTimeSlots(rawTimes)))

    # ------------------------------------------- STAGE 2 ------------------------------------------------------
    for profile in profiles:
        # course rating by a linear transformation (100*NAME_ENC + NUM_ENC)
        courseRatings = [subjectCode[subject] * 100 + level for subject, level in profile['courses']]
        # time rating by a linear transformation (10*DAY_ENC + TIME_ENC)
        timeRatings = [dayCode[day] * 10 + timeCode[slot]
                       for day, group in zip(profile['days'], profile['slots'])
                       for slot in group]
        base = [profile['gender'], majorCode[profile['major']], profile['year']]
        # one vector per (course, time) combination of the user
        profile['vectors'] = [base + [courseRating, timeRating]
                              for courseRating in courseRatings
                              for timeRating in timeRatings]

    # ------------------------------------------- STAGE 3 ------------------------------------------------------
    target = str(currUserID)
    if not any(profile['id'] == target for profile in profiles):
        raise KeyError('no usable row for user ' + target)
    currVectors = [vector for profile in profiles if profile['id'] == target
                   for vector in profile['vectors']]
    if not currVectors:
        # the user has no courses or no time slots: nothing to compare
        return []

    # smallest distance between any vector of the current user and any vector of each other user
    closest = {}
    for profile in profiles:
        if profile['id'] == target or not profile['vectors']:
            continue
        nearest = min(distance.euclidean(currVector, otherVector)
                      for currVector in currVectors
                      for otherVector in profile['vectors'])
        if profile['id'] not in closest or nearest < closest[profile['id']]:
            closest[profile['id']] = nearest

    ranked = sorted(closest.items(), key=lambda pair: pair[1])
    if not ranked:
        return []

    absMin = ranked[0][1]
    recList = []
    for userID, dist in ranked:
        # identical vectors are a perfect match; this also avoids 0/0
        ratio = 1.0 if dist == 0 else absMin / dist
        recList.append([userID, (ratio // 0.01 / 100 + 0.01) * 100])

    return recList

In [11]:
# frame to test the functionality
# (the rows are handed to pandas directly: wrapping them in np.array fails on
# NumPy >= 1.24 because the nested lists have different lengths)
testFrame = pd.DataFrame(
    [
        ['5ffd50e477fb743c7e959247', 'Laurette', 'male', ['biocore'], 2023,
         ['BIOCORE 181', 'BIOCORE 383', 'PSYCH/COUN PSY/RP & SE  729', 'PHMCOL-M 781'],
         ['Tuesday'], [['12,13', '17,18']]],
        ['5ffd50e477fb743c7e959248', 'Lanny', 'male', ['classics'], 2020,
         ['CLASSICS 206', 'CLASSICS 370', 'GREEK 801', 'COMP SCI 220'],
         ['Wednesday', 'Tuesday'], [['18,19', '13,14'], ['9,10', '14,15']]],
        ['5ffd50e477fb743c7e95924a', 'Franky', 'female', ['classics', 'biocore'], 2021,
         ['STAT 301', 'AGROECOL/AGRONOMY/DY SCI  371', 'I SY E 100', 'COMP SCI/E C E 540'],
         ['Monday', 'Thursday'], [['17,20'], ['16,19', '21,23']]],
    ],
    columns=['ID', 'name', 'gender', 'major', 'graduationYear', 'courseList', 'dayPref', 'timeSlot'],
)
testFrame

Unnamed: 0,ID,name,gender,major,graduationYear,courseList,dayPref,timeSlot
0,5ffd50e477fb743c7e959247,Laurette,male,[biocore],2023,"[BIOCORE 181, BIOCORE 383, PSYCH/COUN PSY/RP &...",[Tuesday],"[[12,13, 17,18]]"
1,5ffd50e477fb743c7e959248,Lanny,male,[classics],2020,"[CLASSICS 206, CLASSICS 370, GREEK 801, COMP S...","[Wednesday, Tuesday]","[[18,19, 13,14], [9,10, 14,15]]"
2,5ffd50e477fb743c7e95924a,Franky,female,"[classics, biocore]",2021,"[STAT 301, AGROECOL/AGRONOMY/DY SCI 371, I SY...","[Monday, Thursday]","[[17,20], [16,19, 21,23]]"


In [12]:
# recommendations for the user with the given ID (Lanny in testFrame), closest match first
out = recToUser(currUserID='5ffd50e477fb743c7e959248', wholeUsersFrame=testFrame)
out

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return f(*args, **kwargs)


[['5ffd50e477fb743c7e95924a', 100.0], ['5ffd50e477fb743c7e959247', 38.0]]