In [1]:
import pandas as pd
import numpy as np

In [2]:
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords

In [3]:
import re
import os
import warnings; warnings.simplefilter('ignore')

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yashu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yashu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yashu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
df = pd.read_csv('data/Udemy_Clean.csv')

In [6]:
rating_counts = df[df['No_of_Ratings'].notnull()]['No_of_Ratings'].astype(int)
rating_averages = df[df['Overall_Rating'].notnull()]['Overall_Rating'].astype(int)
c = rating_averages.mean()

In [7]:
m = rating_counts.quantile(0.95)

In [8]:
col_list = ['Title', 'Overall_Rating', 'No_of_Ratings', 'Category']
qualified = df[(df['No_of_Ratings'] >= m)
              & (df['No_of_Ratings'].notnull())
              & (df['Overall_Rating'].notnull())][col_list]

qualified['No_of_Ratings'] = qualified['No_of_Ratings'].astype(int)
qualified['Overall_Rating'] = qualified['Overall_Rating'].astype(int)

In [9]:
def weighted_rating(x):
    v = x['No_of_Ratings']
    r = x['Overall_Rating']
    return (v/(v+m) * r) + (m/(m+v) * c)

In [10]:
qualified['weighted_rating'] = qualified.apply(weighted_rating, axis=1)

In [11]:
qualified = qualified.sort_values('weighted_rating', ascending=False).head(250)

In [12]:
popularity_based = qualified.head(10)
popularity_based

Unnamed: 0,Title,Overall_Rating,No_of_Ratings,Category,weighted_rating
1249,2021 Complete Python Bootcamp From Zero to Her...,4,374285,Development,3.995098
6460,Microsoft Excel - Excel from Beginner to Advanced,4,240810,Office Productivity,3.992426
6575,The Web Developer Bootcamp 2021,4,210136,Development,3.991341
5645,Angular - The Complete Guide (2021 Edition),4,148888,Development,3.987873
5640,Machine Learning A-Z™: Hands-On Python & R In ...,4,148491,Development,3.987841
8148,Java Programming Masterclass covering Java 11 ...,4,144174,Development,3.987487
5290,The Complete 2021 Web Development Bootcamp,4,141689,Development,3.987274
10790,The Complete Digital Marketing Course - 12 Cou...,4,136608,Marketing,3.986814
2082,"React - The Complete Guide (incl Hooks, React ...",4,119526,Development,3.984991
13154,The Complete JavaScript Course 2021: From Zero...,4,110537,Development,3.983813


In [13]:
rating_counts_stars = df[df['No_of_Ratings'].notnull()]['No_of_Ratings'].astype(float)
rating_averages_stars = df[df['Overall_Rating'].notnull()]['Overall_Rating'].astype(float)
c_stars = rating_averages_stars.mean()

In [14]:
m_stars = rating_counts_stars.quantile(0.95)

In [15]:
col_list_stars = ['Title', 'Overall_Rating', 'No_of_Ratings', 'Category']
qualified_stars = df[(df['No_of_Ratings'] >= m)
              & (df['No_of_Ratings'].notnull())
              & (df['Overall_Rating'].notnull())][col_list]

qualified_stars['No_of_Ratings'] = qualified_stars['No_of_Ratings'].astype(float)
qualified_stars['Overall_Rating'] = qualified_stars['Overall_Rating'].astype(float)

In [16]:
def weighted_rating_stars(x_stars):
    v_stars = x_stars['No_of_Ratings']
    r_stars = x_stars['Overall_Rating']
    return (v_stars/(v_stars+m_stars) * r_stars) + (m_stars/(m_stars+v_stars) * c_stars)

In [17]:
qualified_stars['weighted_rating_stars'] = qualified_stars.apply(weighted_rating_stars, axis=1)

In [18]:
qualified_stars = qualified_stars.sort_values('weighted_rating_stars', ascending=False).head(250)

In [19]:
trending_courses = qualified_stars.head(10)
trending_courses

Unnamed: 0,Title,Overall_Rating,No_of_Ratings,Category,weighted_rating_stars
6149,iOS & Swift - The Complete iOS App Development...,4.8,63873.0,Development,4.743572
4746,"Advanced CSS and Sass: Flexbox, Grid, Animatio...",4.8,31413.0,Development,4.691904
6575,The Web Developer Bootcamp 2021,4.7,210136.0,Development,4.683993
5290,The Complete 2021 Web Development Bootcamp,4.7,141689.0,Development,4.676474
2082,"React - The Complete Guide (incl Hooks, React ...",4.7,119526.0,Development,4.672255
13154,The Complete JavaScript Course 2021: From Zero...,4.7,110537.0,Development,4.670078
6249,The Complete SQL Bootcamp 2021: Go from Zero t...,4.7,104784.0,Business,4.668496
9117,Complete C# Unity Game Developer 2D,4.7,86930.0,Development,4.662313
252,Ultimate AWS Certified Solutions Architect Ass...,4.7,84312.0,IT & Software,4.661196
5412,Modern React with Redux,4.7,69881.0,Development,4.65362


In [20]:
temp = df.apply(lambda x: pd.Series(x['Category']),axis=1).stack().reset_index(level=1, drop=True)
temp.name = 'category'
df_cat = df.drop('Category', axis=1).join(temp)

In [21]:
def make_toplist(genre, percentile=0.85):
    dataframe = df_cat[df_cat['category'] == genre]
    rating_counts = dataframe[dataframe['No_of_Ratings'].notnull()]['No_of_Ratings'].astype(float)
    rating_averages = dataframe[dataframe['Overall_Rating'].notnull()]['Overall_Rating'].astype(float)
    cg = rating_averages.mean()
    mg = rating_counts.quantile(percentile)
    col_list_g = ['Title', 'Overall_Rating', 'No_of_Ratings', 'category']
    qualified_g = dataframe[(dataframe['No_of_Ratings'] >= mg)
                           & (dataframe['No_of_Ratings'].notnull())
                           & (dataframe['Overall_Rating'].notnull())][col_list_g]
    qualified_g['No_of_Ratings'] = qualified_g['No_of_Ratings'].astype(float)
    qualified_g['Overall_Rating'] = qualified_g['Overall_Rating'].astype(float)
    
    qualified_g['weighted_rating_g'] = qualified.apply(lambda xg: (xg['No_of_Ratings']/(xg['No_of_Ratings']+mg) * xg['Overall_Rating']) + (mg/(mg+xg['No_of_Ratings']) * cg), axis=1)
    qualified_g = qualified_g.sort_values('weighted_rating_g', ascending=False).head(250)
    
    return qualified_g

In [22]:
make_toplist('Business').head(5)

Unnamed: 0,Title,Overall_Rating,No_of_Ratings,category,weighted_rating_g
6249,The Complete SQL Bootcamp 2021: Go from Zero t...,4.7,104784.0,Business,3.997825
1569,PMP Exam Prep Seminar - 2021 Exam Content with...,4.6,70014.0,Business,3.996772
944,Tableau 2020 A-Z: Hands-On Tableau Training fo...,4.6,68484.0,Business,3.996702
8001,Microsoft Power BI - A Complete Introduction [...,4.5,48610.0,Business,3.995402
2722,An Entire MBA in 1 Course:Award Winning Busine...,4.5,47851.0,Business,3.995332


In [23]:
make_toplist('Personal Development').head(5)

Unnamed: 0,Title,Overall_Rating,No_of_Ratings,category,weighted_rating_g
11877,Become a SuperLearner® 2: Learn Speed Reading ...,4.5,32436.0,Personal Development,3.997672
15097,Productivity and Time Management for the Overw...,4.5,32269.0,Personal Development,3.99766
2105,Life Coaching Certification Course (Beginner t...,4.6,31535.0,Personal Development,3.997607
16127,NLP Practitioner Certification Course (Beginne...,4.5,21086.0,Personal Development,3.996455
5509,Storytelling to Influence,4.3,20727.0,Personal Development,3.996395


In [24]:
make_toplist('Marketing').head(5)

Unnamed: 0,Title,Overall_Rating,No_of_Ratings,category,weighted_rating_g
10790,The Complete Digital Marketing Course - 12 Cou...,4.5,136608.0,Marketing,3.999669
6981,Ultimate Google Ads Training 2020: Profit with...,4.6,44432.0,Marketing,3.998989
2016,Instagram Marketing 2021: Complete Guide To In...,4.5,29785.0,Marketing,3.998498
6265,Facebook Ads & Facebook Marketing MASTERY 2021...,4.4,28049.0,Marketing,3.998407
2172,Digital Marketing Masterclass - 23 Courses in 1,4.4,16850.0,Marketing,3.997374


In [25]:
make_toplist('Health & Fitness').head(5)

Unnamed: 0,Title,Overall_Rating,No_of_Ratings,category,weighted_rating_g
12084,Cognitive Behavioural Therapy (CBT) Practition...,4.5,29153.0,Health & Fitness,3.998018
48,Introduction to Herbalism,4.4,382.0,Health & Fitness,
184,The Flow State Transformational Training Video...,4.2,350.0,Health & Fitness,
284,Deep Tissue Massage Certificate Course (5 CEU's),4.7,3146.0,Health & Fitness,
322,Diploma in Gut Health,4.6,666.0,Health & Fitness,


In [26]:
make_toplist('Teaching & Academics').head(5)

Unnamed: 0,Title,Overall_Rating,No_of_Ratings,category,weighted_rating_g
1564,IELTS Band 7+ Complete Prep Course,4.5,26800.0,Teaching & Academics,3.994181
8485,Spanish for Beginners. The Complete Method. Le...,4.7,18629.0,Teaching & Academics,3.991706
2852,English Grammar Launch: Upgrade your speaking ...,4.6,15336.0,Teaching & Academics,3.98999
195,Arabic language | The comprehensive course - L...,4.6,2338.0,Teaching & Academics,
354,Udemy Course Creation Blueprint Online - Unoff...,4.9,762.0,Teaching & Academics,


In [27]:
make_toplist('Development').head(5)

Unnamed: 0,Title,Overall_Rating,No_of_Ratings,category,weighted_rating_g
4886,iOS 10 & Swift 3: From Beginner to Paid Profes...,4.6,13652.0,Development,4.025367
1468,Complete C# Masterclass,4.5,13674.0,Development,4.025338
15270,Complete Guide to Elasticsearch,4.6,13748.0,Development,4.025241
8528,Apache Spark with Scala - Hands On with Big Data!,4.6,13778.0,Development,4.025202
11812,Learn Python & Ethical Hacking From Scratch,4.6,13790.0,Development,4.025186


In [28]:
make_toplist('Photography & Video').head(5)

Unnamed: 0,Title,Overall_Rating,No_of_Ratings,category,weighted_rating_g
8303,Photography Masterclass: A Complete Guide to P...,4.7,52119.0,Photography & Video,3.998564
7409,Premiere Pro CC for Beginners: Video Editing i...,4.7,27231.0,Photography & Video,3.997277
10772,iPhone Photography | Take Professional Photos ...,4.6,16425.0,Photography & Video,3.995541
425,Photography Beginners: DSLR Photography Camera...,4.6,1432.0,Photography & Video,
511,Basic Food Photography,4.5,1184.0,Photography & Video,


## Content Based

In [29]:
udemy = pd.read_csv('data/udemy.csv')

In [30]:
def extract_best_indices(m, topk, mask=None):
    if len(m.shape) > 1:
        cos_sim = np.mean(m, axis=0)
    else:
        cos_sim = m
    index = np.argsort(cos_sim)[::-1]
    if mask is not None:
        assert mask.shape == m.shape
        mask = mask[index]
    else:
        mask = np.ones(len(cos_sim))
    mask = np.logical_or(cos_sim[index] != 0, mask)
    best_index = index[mask][:topk]
    return best_index

In [31]:
STOPWORDS = set(stopwords.words('english'))
def tokenizer(nltk, min_words = 4, max_words = 200, stopwords = 'english', lemmatize=True):
    if lemmatize:
        stemmer = WordNetLemmatizer()
        tokens = [stemmer.lemmatize(w) for w in word_tokenize(nltk)]
    else:
        tokens = [w for w in word_tokenize(nltk)]
    token = [w for w in tokens if (len(w) > min_words and len(w) < max_words and w not in stopwords)]
    return tokens

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
token_stop = tokenizer(''.join(STOPWORDS), lemmatize=False)

vectorizer = TfidfVectorizer(stop_words=token_stop, tokenizer=tokenizer)
tfidf_mat = vectorizer.fit_transform(udemy['description'].values.astype('U'))

In [33]:
search_string = 'python'

In [34]:
def recommender(search_string, tfidf_mat):
    def get_recommendations_tfidf(nltk, tfidf_mat):
        tokens = [str(tok) for tok in tokenizer(nltk)]
        vec = vectorizer.transform(tokens)
        mat = cosine_similarity(vec, tfidf_mat)
        best_index = extract_best_indices(mat, topk=10)
        return best_index
    best = get_recommendations_tfidf(search_string, tfidf_mat)
    return udemy[['Course Name','Categories','Short Description','Difficulty','Numberofenroll','Original rating']].iloc[best]

In [35]:
content_based = recommender(search_string, tfidf_mat)
content_based

Unnamed: 0,Course Name,Categories,Short Description,Difficulty,Numberofenroll,Original rating
68594,Core Python for Everyone,"Programming Languages,Python",Python,All Level,1714,4.5
30138,"Python,Python for Beginners Python Real time e...","IT & Software,IT Certifications,Python","python 3, python programming, python , python ...",All Level,1351,4.0
69418,Build Hotel Management System With TKinter And...,"Development,Programming Languages,Hotel Manage...",Learn Python! Master python advance programmin...,All Level,10142,3.3
63228,Python for Beginners - Learn Python in 100 Steps,"Programming Languages,Python",Python for Beginners. Learn Python with 200+ P...,All Level,41825,4.4
63469,Complete Python Programming Python Basics to A...,"Programming Languages,Python",Learn Python Programming Python Basics to Pyth...,All Level,48250,4.3
67483,Python Automation for Everyone,"Development,Programming Languages,Python",Python for beginners | Python Fundamentals | L...,All Level,3741,4.2
63029,Python Coding MADE EASY : A Python Coding Cour...,"Development,Programming Languages,Python",Python coding for beginners. Python coding for...,All Level,17250,4.6
31257,Python Programming from Basics to Advanced,"IT & Software,Other IT & Software,Python",Python: Learn to code with Python programming ...,All Level,838,4.2
49316,Python for Beginners: Complete Python Programming,"Teaching & Academics,Other Teaching & Academic...",Python: Learn to code with Python programming ...,All Level,21425,3.8
27964,Python for Beginners: Learn Python Programming...,"Other IT & Software,Python",Python 101: This Python for Beginners course t...,All Level,25563,4.4


In [36]:
topics_maybe_interested = content_based['Categories']
topics_maybe_interested = pd.DataFrame(topics_maybe_interested)

In [37]:
topics_maybe_interested['newcol'] = topics_maybe_interested['Categories']
topics_maybe_interested.dropna()
tmi = topics_maybe_interested['newcol'].str.split(',', expand=True).stack().value_counts()
tmi = tmi.to_dict()
topics_mi = [*tmi]
topics_mi

['Python',
 'Programming Languages',
 'Development',
 'IT & Software',
 'Other IT & Software',
 'IT Certifications',
 'Hotel Management',
 'Teaching & Academics',
 'Other Teaching & Academics']