### Course Recommendation System

#### Algorithm Used
+ Cosine Similarity

#### Workflow
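+ Load the course data
+ Vectorize the (cleaned) course titles
+ Build the cosine similarity matrix
+ Pair each course index with its similarity score
+ Recommend the highest-scoring courses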
    

In [3]:
#loading packages
import pandas as pd
import neattext.functions as nfx
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel
import seaborn as sns

In [4]:
 # Read the Udemy course catalogue (CSV, path relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer="data/udemy_courses.csv")

In [5]:
# Preview the first rows to confirm the CSV loaded with the expected columns
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance


In [7]:
# Exploratory peek: first ten names exposed by neattext.functions
dir(nfx)[0:10]

['BTC_ADDRESS_REGEX',
 'CURRENCY_REGEX',
 'CURRENCY_SYMB_REGEX',
 'Counter',
 'DATE_REGEX',
 'EMAIL_REGEX',
 'EMOJI_REGEX',
 'HASTAG_REGEX',
 'MASTERCard_REGEX',
 'MD5_SHA_REGEX']

In [8]:
# Inspect the raw course titles before any cleaning
df['course_title']

0                      Ultimate Investment Banking Course
1       Complete GST Course & Certification - Grow You...
2       Financial Modeling for Business Analysts and C...
3       Beginner to Pro - Financial Analysis in Excel ...
4            How To Maximize Your Profits Trading Options
                              ...                        
3678    Learn jQuery from Scratch - Master of JavaScri...
3679    How To Design A WordPress Website With No Codi...
3680                        Learn and Build using Polymer
3681    CSS Animations: Create Amazing Effects on Your...
3682    Using MODX CMS to Build Websites: A Beginner's...
Name: course_title, Length: 3683, dtype: object

In [9]:
# Text cleaning, step 1: drop stopwords from every course title
df['clean_course_title'] = df['course_title'].apply(func=nfx.remove_stopwords)

In [10]:
# Text cleaning, step 2: strip special characters from the stopword-free titles
df['clean_course_title'] = df['clean_course_title'].apply(func=nfx.remove_special_characters)

In [11]:
# Compare raw and cleaned titles side by side.
# Select with a list, not a set: a set has no defined order (so the column
# order would be arbitrary) and pandas 2.x rejects a set as an indexer.
df[['course_title','clean_course_title']]

Unnamed: 0,course_title,clean_course_title
0,Ultimate Investment Banking Course,Ultimate Investment Banking Course
1,Complete GST Course & Certification - Grow You...,Complete GST Course Certification Grow Practice
2,Financial Modeling for Business Analysts and C...,Financial Modeling Business Analysts Consultants
3,Beginner to Pro - Financial Analysis in Excel ...,Beginner Pro Financial Analysis Excel 2017
4,How To Maximize Your Profits Trading Options,Maximize Profits Trading Options
...,...,...
3678,Learn jQuery from Scratch - Master of JavaScri...,Learn jQuery Scratch Master JavaScript library
3679,How To Design A WordPress Website With No Codi...,Design WordPress Website Coding
3680,Learn and Build using Polymer,Learn Build Polymer
3681,CSS Animations: Create Amazing Effects on Your...,CSS Animations Create Amazing Effects Website


In [12]:
# Vectorize our text: learn the vocabulary and build the title-by-word count matrix in one call
count_vect = CountVectorizer()
cv_mat = count_vect.fit_transform(raw_documents=df['clean_course_title'])

In [13]:
# Show the sparse matrix summary (shape, dtype, number of stored elements)
cv_mat

<3683x3564 sparse matrix of type '<class 'numpy.int64'>'
	with 18364 stored elements in Compressed Sparse Row format>

In [14]:
# Dense view of the count matrix, for inspection only
cv_mat.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
# Dense bag-of-words table: one row per course, one column per vocabulary term.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on older installations.
# `toarray()` yields a plain ndarray rather than the discouraged `np.matrix`.
df_cv_words = pd.DataFrame(
    cv_mat.toarray(),
    columns=(
        count_vect.get_feature_names_out()
        if hasattr(count_vect, "get_feature_names_out")
        else count_vect.get_feature_names()
    ),
)

In [16]:
# Preview the first rows of the word-count table
df_cv_words.head()

Unnamed: 0,000005,001,01,02,10,100,101,101master,102,10k,...,zend,zero,zerotohero,zf2,zinsen,zoho,zombie,zu,zuhause,zur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Pairwise cosine similarity between all course-title count vectors
cosine_sim_mat = cosine_similarity(X=cv_mat)

In [18]:
 # Show the similarity rows of the first ten courses
 cosine_sim_mat[0:10]

array([[1.        , 0.20412415, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.20412415, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25      , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [19]:
#Get Course Id/Index
# Map each course title to its row number in `df`.
# De-duplicate on the *titles* (the index): `Series.drop_duplicates()` compares
# the values, which here are the already-unique row numbers, so it removed
# nothing and a repeated title would make a lookup return a Series instead of
# a single row number. Keep the first occurrence of each title.
course_indices = pd.Series(df.index,index=df['course_title'])
course_indices = course_indices[~course_indices.index.duplicated(keep='first')]

In [20]:
# Preview the first ten title -> row-number entries
course_indices[0:10]

course_title
Ultimate Investment Banking Course                              0
Complete GST Course & Certification - Grow Your CA Practice     1
Financial Modeling for Business Analysts and Consultants        2
Beginner to Pro - Financial Analysis in Excel 2017              3
How To Maximize Your Profits Trading Options                    4
Trading Penny Stocks: A Guide for All Levels In 2017            5
Investing And Trading For Beginners: Mastering Price Charts     6
Trading Stock Chart Patterns For Immediate, Explosive Gains     7
Options Trading 3 : Advanced Stock Profit and Success Method    8
The Only Investment Strategy You Need For Your Retirement       9
dtype: int64

In [21]:
# Sanity check: look up the row number of one known title
course_indices['How To Maximize Your Profits Trading Options']

4

In [27]:
# Ask for the title of a course the user likes; it is used below as a key into course_indices.
# NOTE(review): input() needs an interactive kernel, so an unattended "Run All" will presumably stop here — confirm.
course_user_likes = input("Search course of Your choice: ")

Search course of Your choice: How To Maximize Your Profits Trading Options


In [28]:
# Resolve the entered title to its row number via a label lookup
idx = course_indices.loc[course_user_likes]

In [29]:
# Show the resolved row number
idx 

4

In [30]:
# Pair every course's row number with its similarity to the chosen course
scores = [(position, score) for position, score in enumerate(cosine_sim_mat[idx])]

In [33]:
# Preview the first fifteen (row number, similarity) pairs
scores[0:15]

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 1.0),
 (5, 0.20412414523193154),
 (6, 0.20412414523193154),
 (7, 0.1889822365046136),
 (8, 0.3779644730092272),
 (9, 0.0),
 (10, 0.20412414523193154),
 (11, 0.5),
 (12, 0.0),
 (13, 0.17677669529663687),
 (14, 0.35355339059327373)]

In [34]:
# Rank the (row number, score) pairs from most to least similar
sorted_scores = list(scores)
sorted_scores.sort(key=lambda pair: pair[1], reverse=True)

In [36]:
# Preview the ten highest-scoring pairs
sorted_scores[0:10]

[(4, 1.0),
 (410, 0.5773502691896258),
 (43, 0.5669467095138407),
 (96, 0.5303300858899106),
 (138, 0.5303300858899106),
 (195, 0.5303300858899106),
 (444, 0.5303300858899106),
 (803, 0.5303300858899106),
 (11, 0.5),
 (59, 0.5)]

In [37]:
# Row numbers of the ranked courses, best match first
selected_course_indices = [position for position, _score in sorted_scores]

In [41]:
# Preview the row numbers of the twenty best matches
selected_course_indices[0:20]

[4,
 410,
 43,
 96,
 138,
 195,
 444,
 803,
 11,
 59,
 68,
 71,
 97,
 330,
 378,
 514,
 647,
 738,
 947,
 991]

In [42]:
# Similarity scores of the ranked courses, in ranking order
selected_course_scores = [score for _position, score in sorted_scores]

In [43]:
# Pull the titles of the ranked courses by row position
recommended_result = df.loc[:, 'course_title'].iloc[selected_course_indices]

In [44]:
# Wrap the ranked titles in a DataFrame so further columns can be attached
rec_df = pd.DataFrame(data=recommended_result)

In [45]:
# Preview the top-ranked titles
rec_df.head()

Unnamed: 0,course_title
4,How To Maximize Your Profits Trading Options
410,Trading Options Basics
43,Options Trading - How to Win with Weekly Options
96,Intermediate Options trading concepts for Stoc...
138,Forex Trading with Fixed 'Risk through Options...


In [46]:
# Attach the scores; both lists were built from sorted_scores, so they line up row for row
rec_df['similarity_scores'] = selected_course_scores

In [47]:
# Preview the top-ranked titles together with their similarity scores
rec_df.head()

Unnamed: 0,course_title,similarity_scores
4,How To Maximize Your Profits Trading Options,1.0
410,Trading Options Basics,0.57735
43,Options Trading - How to Win with Weekly Options,0.566947
96,Intermediate Options trading concepts for Stoc...,0.53033
138,Forex Trading with Fixed 'Risk through Options...,0.53033


 def recommend_course(title,num_of_rec=10):
     """Recommend the courses whose titles are most similar to ``title``.

     Reads the notebook-level objects ``course_indices`` (title -> row number),
     ``cosine_sim_mat`` (pairwise similarity matrix) and ``df`` (course table).

     Parameters
     ----------
     title : str
         Exact course title, as it appears in ``df['course_title']``.
     num_of_rec : int, default 10
         Number of recommendations to return.

     Returns
     -------
     pandas.DataFrame
         Columns ``course_title`` and ``similarity_scores``, ordered from most
         to least similar and indexed by the row numbers of ``df``. The queried
         course itself is never included.

     Raises
     ------
     KeyError
         If ``title`` is not a known course title.
     """
     if title not in course_indices.index:
         raise KeyError(f"Unknown course title: {title!r}")

     # ID for title
     idx = course_indices[title]
     # A repeated title makes the lookup return several rows; use the first one.
     if isinstance(idx, pd.Series):
         idx = idx.iloc[0]
     idx = int(idx)

     # Pair every row number with its similarity to the queried course
     # (the original subscripted `list[...]` instead of calling `list(...)`).
     scores = list(enumerate(cosine_sim_mat[idx]))

     # Exclude the queried course by row number rather than by dropping the
     # first sorted entry: another course with a score of 1.0 and a lower row
     # number would sort ahead of it, and the query itself would be recommended.
     sorted_scores = sorted(
         (pair for pair in scores if pair[0] != idx),
         key=lambda pair: pair[1],
         reverse=True,
     )

     # Recommend: keep only the requested number of matches
     top_scores = sorted_scores[:num_of_rec]
     selected_course_indices = [position for position, _score in top_scores]
     selected_course_scores = [score for _position, score in top_scores]

     rec_df = pd.DataFrame(df['course_title'].iloc[selected_course_indices])
     rec_df['similarity_scores'] = selected_course_scores
     return rec_df

# Demo: recommendations for the same title that was looked up step by step above
recommend_course('How To Maximize Your Profits Trading Options')