# Course Recommendation Engine Using the Udemy Dataset

In [1]:
# Importing Data preprocessing Libraries
import numpy as np
import pandas as pd

In [2]:
# Importing text preprocessing libraries
import neattext.functions as nfx
import nltk
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel


In [3]:
# Load the Udemy course dataset (path is relative to the notebook's working directory)
df = pd.read_csv("udemy_courses.csv")

In [4]:
# Preview the first rows to check the columns loaded as expected
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance


In [5]:
# Summarize column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3678 entries, 0 to 3677
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3678 non-null   int64  
 1   course_title         3678 non-null   object 
 2   url                  3678 non-null   object 
 3   is_paid              3678 non-null   bool   
 4   price                3678 non-null   int64  
 5   num_subscribers      3678 non-null   int64  
 6   num_reviews          3678 non-null   int64  
 7   num_lectures         3678 non-null   int64  
 8   level                3678 non-null   object 
 9   content_duration     3678 non-null   float64
 10  published_timestamp  3678 non-null   object 
 11  subject              3678 non-null   object 
dtypes: bool(1), float64(1), int64(5), object(5)
memory usage: 319.8+ KB


In [6]:
# Count missing values per column (this cell only inspects; nothing is removed here)
df.isnull().sum()

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

In [7]:
# Count fully duplicated rows before dropping them
print("Duplicate Values: ",df.duplicated().sum())

Duplicate Values:  6


In [8]:
# Drop exact duplicate rows and renumber the index from 0
df = df.drop_duplicates(ignore_index=True)

In [9]:
# Re-check the frame after de-duplication
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3672 non-null   int64  
 1   course_title         3672 non-null   object 
 2   url                  3672 non-null   object 
 3   is_paid              3672 non-null   bool   
 4   price                3672 non-null   int64  
 5   num_subscribers      3672 non-null   int64  
 6   num_reviews          3672 non-null   int64  
 7   num_lectures         3672 non-null   int64  
 8   level                3672 non-null   object 
 9   content_duration     3672 non-null   float64
 10  published_timestamp  3672 non-null   object 
 11  subject              3672 non-null   object 
dtypes: bool(1), float64(1), int64(5), object(5)
memory usage: 319.3+ KB


In [10]:
# Text cleaning, step 1: strip stopwords from the raw titles into a new column
df["clean_course_title"] = df["course_title"].map(nfx.remove_stopwords)

In [11]:
# Text cleaning, step 2: strip special characters
df["clean_course_title"] = df["clean_course_title"].map(nfx.remove_special_characters)

In [12]:
# Text cleaning, step 3: strip punctuation
df["clean_course_title"] = df["clean_course_title"].map(nfx.remove_punctuations)

In [13]:
# Text cleaning, step 4: lowercase every title
df["clean_course_title"] = df["clean_course_title"].str.lower()

In [38]:
def word_count(col):
    """Print the offset of the first "course" match in each entry of ``col``.

    Entries that do not contain the substring "course" are skipped, so the
    printed list can be shorter than ``col``. Nothing is returned.
    """
    offsets = (text.find("course") for text in col)
    hits = [pos for pos in offsets if pos != -1]
    print(hits)

In [15]:
# Print where the substring "course" first appears in each cleaned title that contains it
word_count(df["clean_course_title"])

[28, 13, 14, 31, 18, 40, 17, 15, 28, 27, 33, 12, 24, 25, 6, 10, 29, 22, 42, 30, 41, 42, 26, 15, 45, 38, 44, 6, 25, 49, 27, 39, 31, 22, 39, 51, 33, 9, 44, 44, 9, 46, 33, 42, 18, 17, 40, 45, 21, 26, 25, 36, 39, 15, 39, 36, 41, 17, 47, 43, 13, 41, 37, 38, 45, 44, 31, 17, 23, 44, 24, 31, 32, 24, 15, 35, 7, 44, 36, 18, 21, 10, 24, 18, 0, 24, 8, 15, 23, 27, 22, 13, 26, 48, 41, 23, 38, 0, 7, 32, 35, 13, 45, 37, 16, 39, 19, 44, 21, 15, 28, 150, 26, 15, 21, 22, 39, 21, 16, 19, 40, 36, 47, 22, 29, 20, 36, 27, 41, 44, 31, 31, 48, 17, 9, 9, 38, 20, 22, 50, 31, 23, 18, 36, 18, 19, 22, 40, 52, 26, 37, 35, 25, 19, 14, 20, 16, 15, 32, 13, 25, 33, 16, 29, 19, 7, 21, 26, 41, 21, 20, 25, 16, 25, 16, 31, 32, 25, 48, 27, 16, 20, 14, 26, 23, 20, 32, 42, 8, 14, 46, 34, 19, 27, 15, 45, 28, 15, 6, 15, 14, 24, 6, 15, 38, 11, 5, 14, 21, 21, 18, 25, 21, 20, 25, 13, 16, 23, 29, 30, 20, 23, 33, 16, 19, 30, 22, 14, 28, 20, 18, 5, 17, 21, 33, 32, 21, 16, 37, 15, 13, 16, 27, 31, 20, 15, 24, 15, 18, 24, 23, 15]


In [16]:
# Swap every literal "course" substring for an "@" placeholder
df['clean_course_title'] = df['clean_course_title'].str.replace("course", "@", regex=False)

In [17]:
# Second special-character pass: strips the "@" placeholder left by the previous cell
df["clean_course_title"] = df["clean_course_title"].map(nfx.remove_special_characters)

In [18]:
# Spot-check the first cleaned title
df.loc[0, "clean_course_title"]

'ultimate investment banking '

In [19]:
# Display the frame, now including the clean_course_title column
df

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,clean_course_title
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance,ultimate investment banking
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance,complete gst certification grow practice
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance,financial modeling business analysts consultants
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance,beginner pro financial analysis excel 2017
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance,maximize profits trading options
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3667,775618,Learn jQuery from Scratch - Master of JavaScri...,https://www.udemy.com/easy-jquery-for-beginner...,True,100,1040,14,21,All Levels,2.0,2016-06-14T17:36:46Z,Web Development,learn jquery scratch master javascript library
3668,1088178,How To Design A WordPress Website With No Codi...,https://www.udemy.com/how-to-make-a-wordpress-...,True,25,306,3,42,Beginner Level,3.5,2017-03-10T22:24:30Z,Web Development,design wordpress website coding
3669,635248,Learn and Build using Polymer,https://www.udemy.com/learn-and-build-using-po...,True,40,513,169,48,All Levels,3.5,2015-12-30T16:41:42Z,Web Development,learn build polymer
3670,905096,CSS Animations: Create Amazing Effects on Your...,https://www.udemy.com/css-animations-create-am...,True,50,300,31,38,All Levels,3.0,2016-08-11T19:06:15Z,Web Development,css animations create amazing effects website


In [20]:
# Encode the cleaned titles as a bag-of-words count matrix (one row per course, one column per token)
vectorizer = CountVectorizer()
vec_mat = vectorizer.fit_transform(df["clean_course_title"])

In [21]:
# Sparse matrix representation of the token counts
vec_mat

<3672x3551 sparse matrix of type '<class 'numpy.int64'>'
	with 18061 stored elements in Compressed Sparse Row format>

In [22]:
# Dense matrix view of the same counts
vec_mat.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
# Convert the count matrix into a DataFrame with one column per vocabulary token.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on older releases that lack it.
# toarray() yields a plain ndarray rather than the np.matrix returned by todense().
df_vec_words = pd.DataFrame(
    vec_mat.toarray(),
    columns=(
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, "get_feature_names_out")
        else vectorizer.get_feature_names()
    ),
)



In [24]:
# Preview the token-count table
df_vec_words.head()

Unnamed: 0,000005,001,01,02,10,100,101,101master,102,10k,...,zend,zero,zerotohero,zf2,zinsen,zoho,zombie,zu,zuhause,zur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Pairwise cosine similarity between all course title vectors
cos_mat = cosine_similarity(vec_mat)

In [26]:
# Inspect the similarity matrix
cos_mat

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.23570226],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.23570226, 0.        ,
        1.        ]])

In [34]:
def recommand(user_input,num_of_rec=10):
    """Recommend courses whose titles are most similar to a free-text query.

    The query is vectorized with the fitted ``vectorizer`` and compared to
    every row of ``vec_mat`` by cosine similarity. The most similar courses
    form a candidate pool, which is then ordered by ``num_subscribers``
    (highest first).

    Parameters
    ----------
    user_input : str
        Query text, e.g. ``"javascript"``.
    num_of_rec : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``course_title``, ``url``, ``price`` and ``num_subscribers``
        for at most ``num_of_rec`` courses. Empty when no course title shares
        a token with the query.
    """
    user_input_cv = vectorizer.transform([user_input])
    sim_user_input = cosine_similarity(vec_mat, user_input_cv).flatten()

    # The pool used to be fixed at 10, which silently capped num_of_rec > 10.
    # Keep 10 as the minimum so results for smaller requests are unchanged.
    pool_size = max(num_of_rec, 10)
    course_indices = sim_user_input.argsort()[::-1][:pool_size]

    # Drop candidates with zero similarity: they share no token with the
    # query and were only padding the pool with unrelated courses.
    course_indices = course_indices[sim_user_input[course_indices] > 0]

    # Rank the remaining candidates by popularity (most subscribers first)
    subscribers = df["num_subscribers"]
    recommended_course_indices = sorted(
        course_indices,
        key=lambda idx: subscribers.iloc[idx],
        reverse=True,
    )

    result_df = df.iloc[recommended_course_indices]

    final_recommended_courses = result_df[['course_title','url','price','num_subscribers']]
    return final_recommended_courses.head(num_of_rec)


In [37]:
# Example: request 8 recommendations for the query "javascript"
recommand("javascript",8)

Unnamed: 0,course_title,url,price,num_subscribers
3012,JavaScript Intro to learning JavaScript web pr...,https://www.udemy.com/javascript-intro-to-lear...,20,17554
2620,JavaScript For Beginners : Learn JavaScript Fr...,https://www.udemy.com/javascript-course-for-be...,195,11285
2661,JavaScript for Beginners Welcome to learning J...,https://www.udemy.com/javascript-for-beginners...,50,10864
3400,JavaScript Introduction to Object Oriented Jav...,https://www.udemy.com/javascript-introduction-...,90,7789
3063,JavaScript in Action JavaScript Projects,https://www.udemy.com/javascript-in-action-lea...,150,7720
2818,Explore JavaScript Beginners Guide to Coding J...,https://www.udemy.com/javascript-beginners-cou...,200,5739
2552,JavaScript the Basics - JavaScript for Beginners,https://www.udemy.com/javascript-the-basics-fo...,120,4028
3204,JavaScript programming: JavaScript for beginners,https://www.udemy.com/learn-javascript-online/,55,2215


In [29]:
# TODO: plot the number of subscribers by publication year (not implemented yet)

In [30]:
# Look up the row(s) whose original title matches exactly
df.loc[df["course_title"].eq("Ultimate Investment Banking Course")]

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,clean_course_title
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance,ultimate investment banking
