In [None]:
from google.colab import drive
drive.mount('/content/drive')

#### EDA

In [None]:
# Load EDA Pkgs
import pandas as pd
import numpy as np

In [None]:
# Load Data Viz Pkgs
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Hide All Warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load Dataset
df = pd.read_csv("/content/udemy_courses.csv")

In [None]:
df.head()

### What parameters we analyzed
+ Course Title
	- What are the most frequent words in course titles
	- Longest/Shortest course title

+ Subjects/Category
	- What is the distribution of subjects
	- How many courses per subject
	- Distribution of subjects per year
	- How many people purchase a particular subject
	- Which subject is the most popular

+ Published Year
	- Number of courses per year
	- Which year has the highest number of courses
	- What is the trend of courses per year

+ Levels
	- How many levels do we have
	- What is the distribution of courses per levels
	- Which subject have the highest levels
	- How many subscribers per levels
	- How many courses per levels

+ Duration of Course
	- Which courses have the highest duration (paid or not)
	- Which courses have higher duration
	- Duration vs number of subscribers

+ Subscribers
	- Which course have the highest number of subscribers
	- Average number of subscribers
	- Number of subscribers per Subject
	- Number of subscribers per year

+ Price
	- What is the average price of a course
	- What is the min/max price
	- How much does Udemy earn
	- The most profitable courses

+ Correlation Questions
	- Does the number of subscribers depend on:
		- number of reviews
		- price
		- number of lectures
		- content duration



#### EDA on Subject/Category
+ Subjects/Category
	- What is the distribution of subjects
	- How many courses per subject
	- Distribution of subjects per year
	- How many people purchase a particular subject
	- Which subject is the most popular

In [None]:
df.columns

In [None]:
# How many subject/category are there in the dataset
df['subject'].unique()
print("No. of unique domains: ", len(df['subject'].unique()))

In [None]:
# How many subject/category are there in the dataset
len(df['subject'].unique())

In [None]:
# Distribution of Subject/Value Counts
df['subject'].value_counts()

In [None]:
# Plot Distribution of Subject/Value Counts
df['subject'].value_counts().plot(kind='bar')

In [None]:
# Plot Distribution of Subject/Value Counts with Pie Chart
plt.figure(figsize=(10,5))
df['subject'].value_counts().plot(kind='pie')
plt.show()

In [None]:
# Method 2: using seaborn
# Name the column via `x=` and pass the frame as `data=`: newer seaborn
# releases interpret the first positional argument as `data`, not as the
# variable to count, so the positional form no longer plots the subjects.
plt.figure(figsize=(10,7))
sns.countplot(x='subject', data=df)
plt.title("Plot of Value Count of Subject")
plt.xticks(rotation=45)
plt.show()

### EDA on Subscribers
+ Subscribers
	- Which course have the highest number of subscribers
	- Average number of subscribers
	- Number of subscribers per Subject
	- Number of subscribers per year

In [None]:
df.head()

In [None]:
# Number of Courses per Subject
df['subject'].value_counts()

In [None]:
# Number of subscribers per Subject
df.groupby('subject')['num_subscribers'].sum()

In [None]:
# Plot of the Number of subscribers per Subject
df.groupby('subject')['num_subscribers'].sum().plot(kind='bar')

In [None]:
# Plot of the Number of subscribers per Subject
df.groupby('subject')['num_subscribers'].sum().plot(kind='pie')

#### Narrative
+ Web Development has more subscribers than Business Finance
+ The top subject categories by subscribers are Web Development, followed by Business Finance

In [None]:
# Total Number of Subscribers
df['num_subscribers'].sum()


In [None]:
##Average number of subscribers (avg subs per course)
df['num_subscribers'].mean()


In [None]:
# Min number of subscriber
df['num_subscribers'].min()

In [None]:
# Max number of subscriber
df['num_subscribers'].max()

In [None]:
# Which course has the highest number of sub
df['num_subscribers'].idxmax()

In [None]:
# Get the row of the course with the most subscribers.
# idxmax() returns an index label, so look the row up with .loc instead of
# hard-coding a row number copied from an earlier output.
df.loc[df['num_subscribers'].idxmax()]

#### Narrative
+ Udemy has over 11 million subscribers
+ Highest num of sub: 268923
+ Average num of sub: 3193
+ Course with the highest no. of subscribers = "Learn HTML5 Programming from Scratch"

#### EDA on Levels
+ Levels
	- How many levels do we have
	- What is the distribution of courses per levels
	- Which subject have the highest levels
	- How many subscribers per levels
	- How many courses per levels

In [None]:
# How many levels do we have
df['level'].unique()

In [None]:
# What is the distribution of courses per levels
df['level'].value_counts()

In [None]:
# What is the distribution of courses per levels
df['level'].value_counts().plot(kind='bar')

In [None]:
# What is the distribution of courses per levels
df['level'].value_counts().plot(kind='pie')

In [None]:
#  How many subscribers per levels
df.groupby('level')['num_subscribers'].sum()

In [None]:
df[['num_subscribers','level']]

In [None]:
# Plot of How many subscribers per levels
df.groupby('level')['num_subscribers'].sum().plot(kind='bar')

In [None]:
#  How many subscribers per levels
df.groupby('level')['num_subscribers'].sum().plot(kind='pie')

In [None]:
# How many levels per subject category
df.groupby('subject')['level'].value_counts()

In [None]:
# Plot of How many levels per subject category
df.groupby('subject')['level'].value_counts().plot(kind='bar')

In [None]:
# Using Seaborn
plt.figure(figsize=(20,10))
sns.barplot(x='level',y='num_subscribers', hue='subject',data=df,ci=None)
plt.show()

In [None]:
# Using Seaborn
plt.figure(figsize=(20,10))
sns.barplot(x='level',y='num_lectures', hue='subject',data=df,ci=None)
plt.show()

In [None]:
# EDA on Lectures
df.head()

In [None]:
def plot_num_of_countable_feature(feature):
    """Draw two figures for one numeric column of the notebook-level ``df``.

    The first figure is a seaborn bar plot of ``feature`` by ``level`` with one
    hue per ``subject``; the second is a pie chart of ``feature`` summed per
    ``level``.

    Parameters
    ----------
    feature : str
        Name of the column in ``df`` to plot (e.g. ``'num_reviews'``).
    """
    fig_size = (10,7)

    # Figure 1: bar plot per level, split by subject
    plt.figure(figsize=fig_size)
    sns.barplot(x='level',y=feature, hue='subject',data=df,ci=None)
    plt.title("Plot of {} per level per subject".format(feature))
    plt.show()

    # Figure 2: share of the per-level totals
    plt.figure(figsize=fig_size)
    totals_per_level = df.groupby('level')[feature].sum()
    totals_per_level.plot(kind='pie')
    plt.title("Plot of {} per level".format(feature))
    plt.show()

In [None]:
# Plots for the number of reviews (per level per subject, and per level)
plot_num_of_countable_feature('num_reviews')

In [None]:
# Plot of lectures
plot_num_of_countable_feature('num_lectures')

### EDA on Price
+ Price
	- What is the average price of a course
	- What is the min/max price
	- How much does Udemy earn
	- The most profitable courses


In [None]:
# Datatype
df['price'].dtype

In [None]:
# unique price
df.price.unique()

In [None]:
df['price'] = df['price'].replace(['Free', 'TRUE'], '0')


In [None]:
df['price'] = df['price'].astype(float)


In [None]:
# What is the average price
df['price'].mean()

In [None]:
# What is the max price
df['price'].max()

In [None]:
# What is the max price location
df['price'].idxmax()

In [None]:
# Row of the most expensive course, looked up by the label idxmax() returns
# rather than by a hard-coded row number.
df.loc[df['price'].idxmax()]

In [None]:
# What is the most profitable course
# price x num_subscriber
df['profit'] = df['price'] * df['num_subscribers']

In [None]:
# What is the most profitable
df['profit'].max()

#### Narrative
+ The most profitable course made about $24 million (price × number of subscribers)

In [None]:
# The location of course
df['profit'].idxmax()

In [None]:
# Row of the most profitable course, looked up by the label idxmax() returns
# rather than by a hard-coded row number.
df.loc[df['profit'].idxmax()]

In [None]:
### How Many Courses on Udemy Are paid or free
df['is_paid'].unique()

In [None]:
# Normalise the mixed spellings found in `is_paid` to real booleans.
# Actual bool keys are included so that values which are already boolean
# (for example when the mapping cell is re-run) map to themselves instead of
# silently turning into NaN.
# The URL key covers a row whose `is_paid` cell holds a course link
# (presumably a misaligned record in the CSV); it is treated as paid.
paid_dict = {
    'True': True, 'TRUE': True, True: True,
    'False': False, 'FALSE': False, False: False,
    'https://www.udemy.com/learnguitartoworship/': True,
}

In [None]:
df['is_paid'] = df['is_paid'].map(paid_dict)

In [None]:
### How Many Courses on Udemy Are paid or free
df['is_paid'].value_counts()

In [None]:
### How Many Courses on Udemy Are paid or free
df['is_paid'].value_counts().plot(kind='bar')

In [None]:
# How many paid/free course in each subject category
df.groupby('is_paid')['subject'].value_counts()

In [None]:
# How many paid/free course in each subject category
df.groupby('is_paid')['subject'].value_counts().plot(kind='bar')

In [None]:

plt.figure(figsize=(10,7))
sns.barplot(x='level',y='is_paid', hue='subject',data=df,ci=None)
plt.show()


### Correlation Question
+ Correlation Questions
	- Does the number of subscribers depend on:
		- number of reviews
		- price
		- number of lectures
		- content duration

In [None]:
sns.scatterplot(data=df,x='price',y='num_subscribers')

In [None]:
sns.scatterplot(data=df,x='price',y='num_reviews')

In [None]:
plt.figure(figsize=(20,10))
plt.title("Does Price Influence Subscription Per Subject Category")
sns.lineplot(data=df,x='price',y='num_subscribers',hue='subject')
plt.show()

In [None]:
plt.figure(figsize=(20,10))
plt.title("Does Reviews Influence Subscription Per Subject Category")
sns.lineplot(data=df,x='num_reviews',y='num_subscribers',hue='subject')
plt.show()

### Question on Time
+ Published Year
	- Number of courses per year
	- Distribution of subjects per year
	- Which year has the highest number of courses
	- What is the trend of courses per year

In [None]:
df.head()

In [None]:
df['published_timestamp'].dtype

In [None]:
df['published_timestamp'].head()

In [None]:
df['published_timestamp'].str.split('T')

In [None]:
df['published_date'] = df['published_timestamp'].str.split('T').str.get(0)

In [None]:
df['published_date']

In [None]:
from datetime import datetime
def convert_to_dt(x):
    """Normalise a ``YYYY-MM-DD`` date string.

    The text is parsed with ``datetime.strptime`` and written back out with the
    same ``%Y-%m-%d`` pattern, so the result is always zero-padded. Text that
    does not match the pattern makes ``strptime`` raise ``ValueError``.

    Parameters
    ----------
    x : str
        Date text such as ``"2017-01-18"``.

    Returns
    -------
    str
        The same date formatted as ``YYYY-MM-DD``.
    """
    date_pattern = '%Y-%m-%d'
    return datetime.strptime(x, date_pattern).strftime(date_pattern)

In [None]:
df['published_date'] = df['published_date'].str.replace('3 hours','2017-01-18')

In [None]:
convert_to_dt("2017-01-18")

In [None]:
df['published_date'].apply(lambda x: convert_to_dt(x))

In [None]:
df['published_date'] = df['published_date'].apply(lambda x: convert_to_dt(x))

In [None]:
df.dtypes

In [None]:
df['published_date'] = pd.to_datetime(df['published_date'],format="%Y-%m-%d")

In [None]:
df.dtypes

In [None]:
df['year'] = df['published_date'].dt.year
df['month'] = df['published_date'].dt.month
df['day'] = df['published_date'].dt.day

In [None]:
# Per year, count how many courses share each exact `num_subscribers` value.
# NOTE(review): if the goal is the total number of subscribers per year,
# `.sum()` is presumably what was intended here — confirm.
df.groupby('year')['num_subscribers'].value_counts()

In [None]:
plt.figure(figsize=(20,10))
plt.title("Plot of Subscription Per Year Per Subject Category")
sns.lineplot(data=df,x='year',y='num_subscribers',hue='subject')
plt.show()

#### Questions on Course Title
+ Course Title
	- What are the most frequent words in course titles
	- Longest/Shortest course title
	- How can we build recommendation systems via title using similarity
	- Most famous courses by number of subscribers

In [None]:
df.head()

In [None]:
# Get the Longest Title for a course
df['course_title'].str.len()

In [None]:
# Get the Longest Title for a course
df['course_title_len'] = df['course_title'].str.len()


In [None]:
# Get Longest title
df['course_title_len'].max()

In [None]:
# Get the row of the course with the longest title.
# idxmax() gives the index label of the maximum length, so the lookup no
# longer depends on a hard-coded row number.
df.loc[df['course_title_len'].idxmax()]

### Keyword Extraction
+ Rake
+ YAKE
+ `collections.Counter` (word frequencies)


In [None]:
df['course_title']

In [None]:
!pip install neattext


In [None]:
# Text Cleaning to remove stopwords
import neattext.functions as nfx

In [None]:
dir(nfx)

In [None]:
df['clean_title'] = df['course_title'].apply(nfx.remove_stopwords)

In [None]:
df[['clean_title','course_title']]

In [None]:
df['clean_title'] = df['clean_title'].apply(nfx.remove_special_characters)

In [None]:
df['clean_title']

In [None]:
# Tokenization
all_title_list = df['clean_title'].to_list()

In [None]:
all_title_list[1:100]

In [None]:
for line in all_title_list[1:5]:
    print(line.split())
    for i in line.split():
        print(i)

In [None]:
# Convert list to tokens
all_tokens = [i for line in all_title_list for i in line.split()]

In [None]:
all_tokens

In [None]:
from collections import Counter

In [None]:
word_freq = Counter(all_tokens)

In [None]:
# Top 30 most frequent words across the cleaned titles
dict(word_freq.most_common(30))

In [None]:
top_words = dict(word_freq.most_common(50))

In [None]:
plt.figure(figsize=(20,10))
plt.bar(*zip(*top_words.items()))
plt.xticks(rotation=45)
plt.show()

In [None]:
!pip install rake-nltk


In [None]:
from rake_nltk import Rake

In [None]:
import nltk
nltk.download('stopwords')


In [None]:
r = Rake()

In [None]:
join_words = ' '.join(all_tokens)

In [None]:
nltk.download('punkt')

In [None]:
r.extract_keywords_from_text(join_words)

In [None]:
print(r.get_ranked_phrases_with_scores())

# **Recommendation System**

In [None]:
# Load ML/Rc Pkgs
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel

In [None]:
df.head()

In [None]:
df['course_title']

In [None]:
dir(nfx)

In [None]:
# Clean Text:stopwords,special charac
df['clean_course_title'] = df['course_title'].apply(nfx.remove_stopwords)

In [None]:
df[['course_title','clean_course_title']]

In [None]:
# Vectorize our Text
count_vect = CountVectorizer()
cv_mat = count_vect.fit_transform(df['clean_course_title'])

In [None]:
# Sparse
cv_mat

In [None]:
# Dense
cv_mat.todense()

In [None]:
df_cv_words = pd.DataFrame(cv_mat.todense(), columns=count_vect.get_feature_names_out())


In [None]:
df_cv_words.head()

In [None]:
# Cosine Similarity Matrix
cosine_sim_mat = cosine_similarity(cv_mat)

In [None]:
cosine_sim_mat

In [None]:
df.head()

In [None]:
# Get Course ID/Index: map each course title to its row label in df.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed repeated titles and a lookup of such a
# title returned several rows. Filter on the index instead so that every
# title maps to exactly one row (the first occurrence).
course_indices = pd.Series(df.index,index=df['course_title'])
course_indices = course_indices[~course_indices.index.duplicated(keep='first')]

In [None]:
course_indices

In [None]:
course_indices['How To Maximize Your Profits Trading Options']

In [None]:
idx = course_indices['How To Maximize Your Profits Trading Options']

In [None]:
idx

In [None]:
scores = list(enumerate(cosine_sim_mat[idx]))

In [None]:
scores

In [None]:
# Sort our scores per cosine score
sorted_scores = sorted(scores,key=lambda x:x[1],reverse=True)

In [None]:
# Omit the First Value/itself
sorted_scores[1:]

In [None]:
# Selected Courses Indices
selected_course_indices = [i[0] for i in sorted_scores[1:]]

In [None]:
selected_course_indices

In [None]:
# Selected Courses Scores
selected_course_scores = [i[1] for i in sorted_scores[1:]]

In [None]:
recommended_result = df['course_title'].iloc[selected_course_indices]

In [None]:
rec_df = pd.DataFrame(recommended_result)

In [None]:
rec_df.head()

In [None]:
rec_df['similarity_scores'] = selected_course_scores

In [None]:
rec_df

In [None]:
def recommend_course(title,num_of_rec=10):
    """Recommend the courses whose titles are most similar to ``title``.

    Uses three notebook-level objects: ``course_indices`` (title -> row),
    ``cosine_sim_mat`` (pairwise title similarity) and ``df`` (course table).

    Parameters
    ----------
    title : str
        Exact course title to look up in ``course_indices``.
    num_of_rec : int, default 10
        Number of rows to return (passed to ``DataFrame.head``).

    Returns
    -------
    pandas.DataFrame
        Columns ``course_title`` and ``similarity_scores``, ordered from most
        to least similar. The queried course itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``course_indices``.
    """
    # ID for title
    idx = course_indices[title]
    # A title that occurs more than once yields several rows; use the first
    # one so the similarity lookup below always selects a single matrix row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # Drop the queried course by position. Skipping "the first sorted entry"
    # is wrong when another course ties with it at the top score, because the
    # stable sort may then place that other course first.
    scores = [(i, score) for i, score in enumerate(cosine_sim_mat[idx]) if i != idx]
    # Highest similarity first
    sorted_scores = sorted(scores,key=lambda x:x[1],reverse=True)
    selected_course_indices = [i for i, _ in sorted_scores]
    selected_course_scores = [score for _, score in sorted_scores]
    rec_df = pd.DataFrame(df['course_title'].iloc[selected_course_indices])
    rec_df['similarity_scores'] = selected_course_scores
    return rec_df.head(num_of_rec)


In [None]:
recommend_course('Trading Options Basics',20)

NameError: name 'recommend_course' is not defined

In [None]:
df.to_csv("/content/drive/MyDrive/new_udemy_courses_clean.csv")

In [None]:
recommend_course('How To Maximize Your Profits Trading Options',5)

In [None]:
course_indices