# Course Recommendation Engine Using Udemy Dataset

In [39]:
# Importing Data preprocessing Libraries
import numpy as np
import pandas as pd

In [40]:
# Importing text preprocessing libraries
import neattext.functions as nfx
import nltk
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel


In [41]:
# Load the Udemy course dataset; the CSV path is relative to the notebook's working directory
df = pd.read_csv("udemy_courses.csv")

In [42]:
# Preview the first rows of the freshly loaded frame
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance


In [43]:
# Column dtypes and non-null counts before any cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3678 entries, 0 to 3677
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3678 non-null   int64  
 1   course_title         3678 non-null   object 
 2   url                  3678 non-null   object 
 3   is_paid              3678 non-null   bool   
 4   price                3678 non-null   int64  
 5   num_subscribers      3678 non-null   int64  
 6   num_reviews          3678 non-null   int64  
 7   num_lectures         3678 non-null   int64  
 8   level                3678 non-null   object 
 9   content_duration     3678 non-null   float64
 10  published_timestamp  3678 non-null   object 
 11  subject              3678 non-null   object 
dtypes: bool(1), float64(1), int64(5), object(5)
memory usage: 319.8+ KB


In [44]:
# Count missing values per column (this cell only inspects; nothing is removed here)
df.isna().sum()

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

In [45]:
# Number of rows that are exact copies of an earlier row
duplicate_rows = df.duplicated().sum()
print("Duplicate Values: ", duplicate_rows)

Duplicate Values:  6


In [46]:
# Drop exact duplicate rows and renumber the index 0..n-1
df = df.drop_duplicates(ignore_index=True)

In [47]:
# Re-check row count and dtypes after dropping duplicates
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3672 non-null   int64  
 1   course_title         3672 non-null   object 
 2   url                  3672 non-null   object 
 3   is_paid              3672 non-null   bool   
 4   price                3672 non-null   int64  
 5   num_subscribers      3672 non-null   int64  
 6   num_reviews          3672 non-null   int64  
 7   num_lectures         3672 non-null   int64  
 8   level                3672 non-null   object 
 9   content_duration     3672 non-null   float64
 10  published_timestamp  3672 non-null   object 
 11  subject              3672 non-null   object 
dtypes: bool(1), float64(1), int64(5), object(5)
memory usage: 319.3+ KB


In [48]:
# Clean text: stopwords, special characters and punctuation are removed over
# the next cells. First step: strip stopwords from the raw title into a new column.
df["clean_course_title"] = df["course_title"].map(nfx.remove_stopwords)

In [49]:
# Next cleaning step: run the special-character filter over every cleaned title
df["clean_course_title"] = [
    nfx.remove_special_characters(title) for title in df["clean_course_title"]
]

In [50]:
# Next cleaning step: run the punctuation filter over every cleaned title
df["clean_course_title"] = df["clean_course_title"].map(nfx.remove_punctuations)

In [51]:
# Normalise case so that identical words compare equal
df["clean_course_title"] = df["clean_course_title"].str.lower()

In [54]:
# Locating the word "course", which is common to many course titles.
# It would skew the similarity calculation, so it is removed afterwards.
def word_count(col):
    """Print where the substring "course" first occurs in each entry of ``col``.

    Despite the name, nothing is counted: for every entry that contains
    "course", the character offset of its first occurrence is collected, and
    the resulting list is printed. Entries without the substring are skipped.

    Parameters
    ----------
    col : iterable of str
        Texts to scan.

    Returns
    -------
    None
        The list of offsets is written to stdout only.
    """
    first_hits = (doc.find("course") for doc in col)
    offsets = [pos for pos in first_hits if pos != -1]
    print(offsets)

In [55]:
# Print the offsets at which "course" appears in the cleaned titles
word_count(df["clean_course_title"])

[28, 13, 14, 31, 18, 40, 17, 15, 28, 27, 33, 12, 24, 25, 6, 10, 29, 22, 42, 30, 41, 42, 26, 15, 45, 38, 44, 6, 25, 49, 27, 39, 31, 22, 39, 51, 33, 9, 44, 44, 9, 46, 33, 42, 18, 17, 40, 45, 21, 26, 25, 36, 39, 15, 39, 36, 41, 17, 47, 43, 13, 41, 37, 38, 45, 44, 31, 17, 23, 44, 24, 31, 32, 24, 15, 35, 7, 44, 36, 18, 21, 10, 24, 18, 0, 24, 8, 15, 23, 27, 22, 13, 26, 48, 41, 23, 38, 0, 7, 32, 35, 13, 45, 37, 16, 39, 19, 44, 21, 15, 28, 150, 26, 15, 21, 22, 39, 21, 16, 19, 40, 36, 47, 22, 29, 20, 36, 27, 41, 44, 31, 31, 48, 17, 9, 9, 38, 20, 22, 50, 31, 23, 18, 36, 18, 19, 22, 40, 52, 26, 37, 35, 25, 19, 14, 20, 16, 15, 32, 13, 25, 33, 16, 29, 19, 7, 21, 26, 41, 21, 20, 25, 16, 25, 16, 31, 32, 25, 48, 27, 16, 20, 14, 26, 23, 20, 32, 42, 8, 14, 46, 34, 19, 27, 15, 45, 28, 15, 6, 15, 14, 24, 6, 15, 38, 11, 5, 14, 21, 21, 18, 25, 21, 20, 25, 13, 16, 23, 29, 30, 20, 23, 33, 16, 19, 30, 22, 14, 28, 20, 18, 5, 17, 21, 33, 32, 21, 16, 37, 15, 13, 16, 27, 31, 20, 15, 24, 15, 18, 24, 23, 15]


In [56]:
# Remove the filler word "course"/"courses". Matching on word boundaries keeps
# longer words that merely contain "course" intact; a plain substring replace
# would cut such words apart and leave meaningless fragments in the vocabulary.
df['clean_course_title'] = df['clean_course_title'].str.replace(r"\bcourses?\b", "", regex=True)

In [57]:
# Run the special-character filter once more over the cleaned titles
df["clean_course_title"] = df["clean_course_title"].map(nfx.remove_special_characters)

In [60]:
# Compare the raw `course_title` column with the new `clean_course_title` column
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,clean_course_title
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance,ultimate investment banking
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance,complete gst certification grow practice
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance,financial modeling business analysts consultants
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance,beginner pro financial analysis excel 2017
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance,maximize profits trading options


In [61]:
# Converting the cleaned titles into word-count vectors:
# one row per course, one column per vocabulary term
vectorizer = CountVectorizer()
vec_mat = vectorizer.fit_transform(df["clean_course_title"])

In [62]:
# Sparse matrix (only the non-zero counts are stored)
vec_mat

<3672x3551 sparse matrix of type '<class 'numpy.int64'>'
	with 18061 stored elements in Compressed Sparse Row format>

In [64]:
# Dense matrix view of the same counts (for inspection only)
vec_mat.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [65]:
# Converting the count matrix into a DataFrame with one column per vocabulary term.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back to the old name only on releases
# that predate it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# `toarray()` yields a plain ndarray rather than the discouraged np.matrix type.
df_vec_words = pd.DataFrame(vec_mat.toarray(), columns=feature_names)



In [66]:
# Preview the term-count table (columns are vocabulary terms)
df_vec_words.head()

Unnamed: 0,000005,001,01,02,10,100,101,101master,102,10k,...,zend,zero,zerotohero,zf2,zinsen,zoho,zombie,zu,zuhause,zur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
# Pairwise cosine similarity between every course vector and every other
# course vector, giving a square course-by-course matrix
cos_mat = cosine_similarity(vec_mat)

In [70]:
# Inspect the similarity matrix
cos_mat

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.23570226],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.23570226, 0.        ,
        1.        ]])

In [75]:
def recommand(user_input,num_of_rec=10):
    """Recommend courses whose cleaned titles best match a free-text query.

    The query is vectorised with the already-fitted module-level ``vectorizer``
    and compared with every row of ``vec_mat`` by cosine similarity. The
    closest matches are then ordered by ``num_subscribers``, highest first.

    Parameters
    ----------
    user_input : str
        Free-text query, e.g. ``"Banking"``.
    num_of_rec : int, default 10
        Maximum number of courses to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``course_title``, ``url``, ``price`` and ``num_subscribers``.
        Holds fewer than ``num_of_rec`` rows (possibly none) when fewer titles
        share a term with the query.
    """
    # Converting user input text into a vector over the fitted vocabulary
    user_input_vec_mat = vectorizer.transform([user_input])

    # Similarity of the query vector with every course vector
    sim_user_input = cosine_similarity(vec_mat, user_input_vec_mat).flatten()

    # Candidate pool of most-similar courses. A fixed pool of 10 would cap
    # any request for more than 10 recommendations, so the pool grows with
    # num_of_rec; it never drops below 10 so smaller requests still pick the
    # most-subscribed courses among the ten closest matches.
    pool_size = max(num_of_rec, 10)
    course_indices = sim_user_input.argsort()[::-1][:pool_size]

    # A similarity of zero means the title shares no term with the query;
    # such rows are not recommendations, so drop them.
    course_indices = course_indices[sim_user_input[course_indices] > 0]

    # Sort candidates by number of subscribers. mergesort is stable, so
    # courses with equal subscriber counts keep their similarity order.
    result_df = df.iloc[course_indices].sort_values(
        "num_subscribers", ascending=False, kind="mergesort"
    )

    final_recommended_courses = result_df[['course_title','url','price','num_subscribers']]
    return final_recommended_courses.head(num_of_rec)


In [81]:
# Example query using the default number of recommendations
recommand("Banking")

Unnamed: 0,course_title,url,price,num_subscribers
39,The Complete Investment Banking Course 2017,https://www.udemy.com/the-complete-investment-...,195,8575
0,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,200,2147
240,Advanced Accounting for Investment Banking,https://www.udemy.com/advanced-accounting-for-...,50,1260
227,Investment Banking: How to Land a Job on Wall ...,https://www.udemy.com/how-to-land-a-job-on-wal...,75,1218
120,Quantitative Aptitude for Banking & Competitiv...,https://www.udemy.com/quantitative-aptitude-fo...,50,1056
1144,Banking Credit Analysis Process (for Bankers),https://www.udemy.com/credit-analysis-process/,180,894
528,"Accounting, Finance and Banking - A Comprehens...",https://www.udemy.com/accounting-finance-and-b...,180,507
418,Business Banking 101,https://www.udemy.com/business-banking-101/,25,132
417,The Investment Banking Recruitment Series,https://www.udemy.com/investmentbanking/,40,17
887,Workshop on Banking Credit Analysis Process,https://www.udemy.com/workshop-on-banking-cred...,100,0
