In [1]:
import pandas as pd
import re
import numpy as np
import nltk
import string
import html
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from nltk.corpus import wordnet
from typing import List

import spacy
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix, f1_score

import warnings
warnings.filterwarnings("ignore")

# Part I.  Topic Modelling and Analysis (5pts)

Pick from **one** of the dataset options below:
* **Negative McDonalds Yelp reviews**: `datasets/mcdonalds-yelp-negative-reviews.csv`
* **[Top 5000 Udemy courses](https://www.kaggle.com/datasets/90eededa5561eee7f62c0e68ecdad14c2bdb58bc923834067025dee655a6083e?resource=download)** - a Kaggle dataset of the course descriptions of the top 5000 Udemy courses in 2022: `datasets/top5000_udemy.csv`

In your notebook, explore the data and perform topic modelling. You may use any vectorization or text preprocessing techniques we have discussed.

In order to earn full credit, you must:

* Show the **# of topics you tried, and explain why you ultimately decided on the final #**.
* Demonstrate **adequate text preprocessing (there are likely obvious stopwords / fuzzy matching / regex groupings that can be done to improve the final results)** - show what you tried.
* In 2-3 sentences: A **business analysis of these topics - what do they reveal as actionable next steps or insights for McDonalds or Udemy?** Please be specific in your recommendations/insights.
    - **Not specific**: *We recommend Amazon look into the quality of their toys, since the reviews show disatisfaction with the value of their product.*
    - **Specific**: *Amazon should explore more durable batteries/hardwares. For example, X% of reviews mention that the toys' batteries were broken or immediately died. This is part of a larger theme of components not being ready to use out the box, which often leads to disappointment on holiday occasions when children open up their gifts. See the following document snippets as examples:...*

In [2]:
df = pd.read_csv('../datasets/Udemy.csv')
df = df[['course_name', 'course description', 'reviews_avg', 'reviews_count', 'course_duration', 'lectures_count',
             'price_after_discount', 'main_price', 'students_count', 'instructor']]
df.shape

(5027, 10)

In [3]:
df.head(2)

Unnamed: 0,course_name,course description,reviews_avg,reviews_count,course_duration,lectures_count,price_after_discount,main_price,students_count,instructor
0,2022 Complete Python Bootcamp From Zero to Her...,Learn Python like a Professional Start from t...,Rating: 4.6 out of 5,440383 reviews,22 total hours,155 lectures,Current price: E£319.99,"Original price: E£1,399.99","1,629,692 students",Jose Portilla
1,The Web Developer Bootcamp 2022,COMPLETELY REDONE - The only course you need t...,Rating: 4.7 out of 5,248508 reviews,64 total hours,615 lectures,Current price: E£269.99,"Original price: E£1,399.99","830,559 students",Colt Steele


## Preprocessing

### Numerical: transfer string to float

In [4]:
def find_number(line):
    num = re.findall(r'[\d+\.]',line)
    return "".join(num)

In [5]:
for i in ['reviews_avg', 'reviews_count', 'course_duration', 'lectures_count',
          'price_after_discount', 'main_price', 'students_count']:
    df[i] = df[i].astype(str)
    df[i] = df[i].apply(lambda x: find_number(x))
    df[i] = pd.to_numeric(df[i], errors='coerce')

df.head()

Unnamed: 0,course_name,course description,reviews_avg,reviews_count,course_duration,lectures_count,price_after_discount,main_price,students_count,instructor
0,2022 Complete Python Bootcamp From Zero to Her...,Learn Python like a Professional Start from t...,4.65,440383.0,22.0,155.0,319.99,1399.99,1629692.0,Jose Portilla
1,The Web Developer Bootcamp 2022,COMPLETELY REDONE - The only course you need t...,4.75,248508.0,64.0,615.0,269.99,1399.99,830559.0,Colt Steele
2,The Complete 2022 Web Development Bootcamp,Become a Full-Stack Web Developer with just ON...,4.75,234837.0,65.5,490.0,349.99,1699.99,794897.0,Dr. Angela Yu
3,Angular - The Complete Guide (2023 Edition),"Master Angular 14 (formerly ""Angular 2"") and b...",4.65,174576.0,34.5,472.0,319.99,1599.99,634196.0,Maximilian Schwarzmüller
4,Java Programming Masterclass covering Java 11 ...,Learn Java In This Course And Become a Compute...,4.55,171838.0,80.5,401.0,349.99,849.99,727934.0,"Tim Buchalka, Tim Buchalka's Learn Programming..."


### Categorical: only keep top 11 instructors

In [6]:
cats = df['instructor'].value_counts()[lambda x: x > 25].index
cats

Index(['Packt Publishing', 'Bluelime Learning Solutions', 'Laurence Svekis',
       'Eduonix Learning Solutions, Eduonix-Tech .', 'Infinite Skills',
       'YouAccel Training', 'Oak Academy', 'Loony Corn', 'Stephen Grider',
       'John Elder', 'Stone River eLearning'],
      dtype='object')

In [7]:
df_cat = pd.get_dummies(df.instructor)
df_cat = df_cat[cats]
df = pd.concat([df, df_cat], axis=1)
df.head(2)

Unnamed: 0,course_name,course description,reviews_avg,reviews_count,course_duration,lectures_count,price_after_discount,main_price,students_count,instructor,...,Bluelime Learning Solutions,Laurence Svekis,"Eduonix Learning Solutions, Eduonix-Tech .",Infinite Skills,YouAccel Training,Oak Academy,Loony Corn,Stephen Grider,John Elder,Stone River eLearning
0,2022 Complete Python Bootcamp From Zero to Her...,Learn Python like a Professional Start from t...,4.65,440383.0,22.0,155.0,319.99,1399.99,1629692.0,Jose Portilla,...,0,0,0,0,0,0,0,0,0,0
1,The Web Developer Bootcamp 2022,COMPLETELY REDONE - The only course you need t...,4.75,248508.0,64.0,615.0,269.99,1399.99,830559.0,Colt Steele,...,0,0,0,0,0,0,0,0,0,0


### Text         

I first do the lemmitization. The reason why I choose lemmitization is that I think results in lemma are easier to interpret when doing word counts in comparison with stem. Stem is more difficult to interpret since you have to get back to the original text so that you can figure out what exact words are used. In contrast, lemma returns the base or dictionary form of a word and it's more intuitive.

In [8]:
## Reference: https://gist.github.com/gaurav5430/9fce93759eb2f6b1697883c3782f30de#file-nltk-lemmatize-sentences-py
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

def lem(line):
    word_tokens = nltk.word_tokenize(line)
    word_tokens = [lemmatize_sentence(t) for t in word_tokens]
    cleaned_review = " ".join(word_tokens)
        
    return cleaned_review

In [9]:
df['title_description'] = df['course_name'] + ' ' + df['course description']
# df.drop(columns=['course_name', 'course description'], inplace = True)
df['title_description'] = df['title_description'].astype(str)
df['title_description'] = df['title_description'].apply(lambda x: x.lower())
df['title_description'] = df['title_description'].apply(lambda x: lem(x))
df.head()

Unnamed: 0,course_name,course description,reviews_avg,reviews_count,course_duration,lectures_count,price_after_discount,main_price,students_count,instructor,...,Laurence Svekis,"Eduonix Learning Solutions, Eduonix-Tech .",Infinite Skills,YouAccel Training,Oak Academy,Loony Corn,Stephen Grider,John Elder,Stone River eLearning,title_description
0,2022 Complete Python Bootcamp From Zero to Her...,Learn Python like a Professional Start from t...,4.65,440383.0,22.0,155.0,319.99,1399.99,1629692.0,Jose Portilla,...,0,0,0,0,0,0,0,0,0,2022 complete python bootcamp from zero to her...
1,The Web Developer Bootcamp 2022,COMPLETELY REDONE - The only course you need t...,4.75,248508.0,64.0,615.0,269.99,1399.99,830559.0,Colt Steele,...,0,0,0,0,0,0,0,0,0,the web developer bootcamp 2022 completely red...
2,The Complete 2022 Web Development Bootcamp,Become a Full-Stack Web Developer with just ON...,4.75,234837.0,65.5,490.0,349.99,1699.99,794897.0,Dr. Angela Yu,...,0,0,0,0,0,0,0,0,0,the complete 2022 web development bootcamp bec...
3,Angular - The Complete Guide (2023 Edition),"Master Angular 14 (formerly ""Angular 2"") and b...",4.65,174576.0,34.5,472.0,319.99,1599.99,634196.0,Maximilian Schwarzmüller,...,0,0,0,0,0,0,0,0,0,angular - the complete guide ( 2023 edition ) ...
4,Java Programming Masterclass covering Java 11 ...,Learn Java In This Course And Become a Compute...,4.55,171838.0,80.5,401.0,349.99,849.99,727934.0,"Tim Buchalka, Tim Buchalka's Learn Programming...",...,0,0,0,0,0,0,0,0,0,java program masterclass cover java 11 & java ...


observe potential topics and decide regex cleaning target

In [10]:
def Countvec(words, Ngram=(1,1), token_pattern=None, min_df=1, max_df=1.0):
    '''Create count vectorizer'''
    vectorizer = CountVectorizer(stop_words="english", ngram_range=Ngram, lowercase=True, 
                                 token_pattern=token_pattern, min_df=min_df, max_df=max_df)
    X = vectorizer.fit_transform(words) 
    X = X.toarray()
    print(X.shape)    
    feature = vectorizer.get_feature_names()
    corpus_df = pd.DataFrame(X, columns=feature)
    return corpus_df

In [11]:
description = df['title_description'].tolist()

In [12]:
description_vec = Countvec(words=description, Ngram=(1,1), token_pattern=r'[a-zA-Z]{3,}')
description_vec.sum().sort_values(ascending=False)[:50]

(5027, 4395)


learn          3603
python         1541
use            1378
course         1205
build          1177
program        1147
beginner       1136
web            1028
complete       1002
data            997
development     852
java            733
create          718
javascript      694
master          681
project         652
game            586
step            586
application     577
test            565
guide           562
scratch         560
sql             473
app             472
apps            470
developer       457
react           455
html            441
real            423
basic           418
advanced        416
framework       413
api             406
android         406
design          405
code            373
machine         369
website         363
unity           333
php             332
net             319
make            316
building        314
science         313
language        285
cod             281
core            275
start           272
database        272
practical       271


In [13]:
description_vec2 = Countvec(description, (2,2), r'[a-zA-Z]{3,}')
description_vec2.sum().sort_values(ascending=False)[:50]

(5027, 33537)


machine learn              348
data science               286
step step                  263
web development            257
real world                 207
learn python               198
learn build                194
course learn               193
beginner learn             179
python program             174
asp net                    164
complete guide             157
learn create               151
deep learn                 140
web application            140
sql server                 137
data structure             129
net core                   129
program language           127
learn use                  127
game development           122
spring boot                113
rest api                   111
learn basic                104
html css                    98
course beginner             98
beginner advanced           97
guide learn                 93
learn program               88
crash course                87
app development             87
learn code                  87
java pro

By observing results of count vectorizer, I find out that there are some potential topics:

-  data science (machine learning, data science, artificial intelligence)        
- web development (web development, web application, web apps, web developer, html css, javascript)       
- database (sql server, mysql, data structure, structure algorithm)      
- Application (real world, application use)  

For regex, we can see that "beginner", "basic" have the similar meaning and they can be interprted together as "_beginner_". "sql", "mysql", "database" can be interprted together as "_database_". In addition, I  remove some common keywords that are too general like "learn", "course", "use".

In [14]:
def word_replace(line):
    line = re.sub(r'\b(beginner|basic|step)\b', '_beginner_', line)
    line = re.sub(r'\b(sql|mysql|database)\b', '_database_', line)
    line = re.sub(r'\b(learn|courses?|use)\b', '', line)
    return line

In [15]:
df['new_title_desc'] = df['title_description'].apply(lambda x: word_replace(x))

In [16]:
df.columns

Index(['course_name', 'course description', 'reviews_avg', 'reviews_count',
       'course_duration', 'lectures_count', 'price_after_discount',
       'main_price', 'students_count', 'instructor', 'Packt Publishing',
       'Bluelime Learning Solutions', 'Laurence Svekis',
       'Eduonix Learning Solutions, Eduonix-Tech .', 'Infinite Skills',
       'YouAccel Training', 'Oak Academy', 'Loony Corn', 'Stephen Grider',
       'John Elder', 'Stone River eLearning', 'title_description',
       'new_title_desc'],
      dtype='object')

In [17]:
df_num = df[['reviews_avg', 'reviews_count', 'course_duration', 'lectures_count', 'price_after_discount',
             'main_price', 'students_count']]
df_cat = df[['Packt Publishing','Bluelime Learning Solutions', 'Laurence Svekis', 
             'Eduonix Learning Solutions, Eduonix-Tech .', 'Infinite Skills', 'YouAccel Training', 
             'Oak Academy', 'Loony Corn', 'Stephen Grider', 'John Elder', 'Stone River eLearning']]

## Perform Topic Modelling

Step 1: Vectorize The Corpus

In [18]:
vectorizer = TfidfVectorizer(ngram_range=(2,2),
                             min_df=0.008, max_df=0.5, stop_words="english")

X = vectorizer.fit_transform(df['new_title_desc'])
terms = vectorizer.get_feature_names()
tf_idf = pd.DataFrame(X.toarray(), columns=terms)

concat_list = [tf_idf, df_num, df_cat]
tf_idf = pd.concat(concat_list, axis=1)

print(f"TF-IDF: {tf_idf.shape}")
print(tf_idf.head(5))
tf_idf.head(5)

TF-IDF: (5027, 81)
   _beginner_ _beginner_  _beginner_ advanced  _beginner_ expert  \
0                    0.0                  0.0                0.0   
1                    0.0                  0.0                0.0   
2                    0.0                  0.0                0.0   
3                    0.0                  0.0                0.0   
4                    0.0                  0.0                0.0   

   _beginner_ guide  _beginner_ python  _database_ _database_  \
0               0.0                0.0                    0.0   
1               0.0                0.0                    0.0   
2               0.0                0.0                    0.0   
3               0.0                0.0                    0.0   
4               0.0                0.0                    0.0   

   _database_ server  absolute _beginner_  android app  app development  ...  \
0                0.0                  0.0          0.0              0.0  ...   
1                0.0 

Unnamed: 0,_beginner_ _beginner_,_beginner_ advanced,_beginner_ expert,_beginner_ guide,_beginner_ python,_database_ _database_,_database_ server,absolute _beginner_,android app,app development,...,Bluelime Learning Solutions,Laurence Svekis,"Eduonix Learning Solutions, Eduonix-Tech .",Infinite Skills,YouAccel Training,Oak Academy,Loony Corn,Stephen Grider,John Elder,Stone River eLearning
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


Step 2: Fit NMF Model

In [19]:
nmf = NMF(n_components=4)
W = nmf.fit_transform(X)
H = nmf.components_

Step 3: Report Results For Each Topic

In [20]:
def get_top_tf_idf_tokens_for_topic(H: np.array, feature_names: List[str], num_top_tokens: int = 5):
  """
  Uses the H matrix (K components x M original features) to identify for each
  topic the most frequent tokens.
  """
  for topic, vector in enumerate(H):
    print(f"TOPIC {topic}\n")
    total = vector.sum()
    top_scores = vector.argsort()[::-1][:num_top_tokens]
    token_names = list(map(lambda idx: feature_names[idx], top_scores))
    strengths = list(map(lambda idx: vector[idx] / total, top_scores))
    
    for strength, token_name in zip(strengths, token_names):
        print(f"\b{token_name} ({round(strength * 100, 1)}%)\n")
    print(f"=" * 50)

get_top_tf_idf_tokens_for_topic(H, tf_idf.columns.tolist(), 5)

TOPIC 0

_beginner_ _beginner_ (51.7%)

_beginner_ guide (9.7%)

program _beginner_ (4.5%)

_beginner_ advanced (3.6%)

_database_ server (2.4%)

TOPIC 1

data science (51.7%)

science machine (8.0%)

python data (6.7%)

data analysis (6.7%)

python program (4.3%)

TOPIC 2

web development (42.5%)

html cs (15.3%)

stack web (6.1%)

cs javascript (5.6%)

web developer (4.7%)

TOPIC 3

real world (30.1%)

complete guide (18.3%)

web application (6.8%)

build real (6.5%)

asp net (3.7%)



Get the Top Documents For Each Topic            
We can also use the W matrix to grab top documents per topic (ie. the document that had the greatest percentage of of each topic).

In [21]:
def get_top_documents_for_each_topic(W: np.array, documents: List[str], num_docs: int = 5):
    sorted_docs = W.argsort(axis=0)[::-1]
    top_docs = sorted_docs[:num_docs].T
    per_document_totals = W.sum(axis=1)
    for topic, top_documents_for_topic in enumerate(top_docs):
        print(f"Topic {topic}")
        for doc in top_documents_for_topic:
            score = W[doc][topic]
            percent_about_topic = round(score / per_document_totals[doc] * 100, 1)
            print(f"{percent_about_topic}%", documents[doc])
            print("=" * 50)

In [22]:
get_top_documents_for_each_topic(W, df['title_description'].tolist(), num_docs=10)

Topic 0
100.0% gitting start : step-by-step git and github crash course git start now with a hands-on guide to learn both git and github from an expert .
100.0% sap smart form for beginner learn smartforms step by step ( hands-on approach ) | 17 practice example | all node in smartforms be discuss
100.0% 17 beginner c # walkthrough project step by step learn how to code in c # by building 17 project
100.0% python exercise for beginner : solve 100+ cod challenge practice your python skill with 100+ python exercise and check your solution with step-by-step video explanation .
100.0% build an api from scratch with python , django , sqlite3 create an api step by step
100.0% unity 3d 2017 - build , program & publish crossy road game step by step video lesson to create and publish a 3d 8 bit art game like crossy road to the app store . use unity3d
100.0% look to learn bdd - cucumber .... ? get expertise in 2 hr step by step bdd concept | cucumber framework | gherkin language |end to end inte

In [23]:
def get_top_documents_into_dataframe(W: np.array, documents: List[str], num_docs: int = 10):
    NMF_df = pd.DataFrame()
    topics = []
    docs = []
    sorted_docs = W.argsort(axis=0)[::-1]
    top_docs = sorted_docs[:num_docs].T
    for topic, top_documents_for_topic in enumerate(top_docs):
        for doc in top_documents_for_topic:
            topics.append(topic)
            docs.append(description[doc])
    NMF_df['topic'] = topics
    NMF_df['title_description'] = docs
    
    return NMF_df

Choose top 20 course in each topics

In [24]:
documents = df['title_description'].tolist()
NMF_df = get_top_documents_into_dataframe(W, documents, 20)
NMF_df

Unnamed: 0,topic,title_description
0,0,gitting start : step-by-step git and github cr...
1,0,sap smart form for beginner learn smartforms s...
2,0,17 beginner c # walkthrough project step by st...
3,0,python exercise for beginner : solve 100+ cod ...
4,0,"build an api from scratch with python , django..."
...,...,...
75,3,apply machine learn for healthcare learn to im...
76,3,azure data factory real world project end to e...
77,3,materialize cs from scratch with 5 project mas...
78,3,angular 14 - beginner practical guide [ 2022 ]...


In [25]:
inner_merged_total = pd.merge(NMF_df, df, how="inner", on=["title_description"])
inner_merged_total.head(2)

Unnamed: 0,topic,title_description,course_name,course description,reviews_avg,reviews_count,course_duration,lectures_count,price_after_discount,main_price,...,Laurence Svekis,"Eduonix Learning Solutions, Eduonix-Tech .",Infinite Skills,YouAccel Training,Oak Academy,Loony Corn,Stephen Grider,John Elder,Stone River eLearning,new_title_desc
0,0,gitting start : step-by-step git and github cr...,Gitting Started: Step-by-Step Git and Github C...,Git started now with a hands-on guide to learn...,4.85,384.0,2.5,38.0,269.99,449.99,...,0,0,0,0,0,0,0,0,0,gitting start : _beginner_-by-_beginner_ git a...
1,0,sap smart form for beginner learn smartforms s...,SAP Smart Forms for beginners,Learn SmartForms step by step (Hands-on approa...,4.45,628.0,4.5,34.0,199.99,229.99,...,0,0,0,0,0,0,0,0,0,sap smart form for _beginner_ smartforms _beg...


In [26]:
inner_merged_total.columns

Index(['topic', 'title_description', 'course_name', 'course description',
       'reviews_avg', 'reviews_count', 'course_duration', 'lectures_count',
       'price_after_discount', 'main_price', 'students_count', 'instructor',
       'Packt Publishing', 'Bluelime Learning Solutions', 'Laurence Svekis',
       'Eduonix Learning Solutions, Eduonix-Tech .', 'Infinite Skills',
       'YouAccel Training', 'Oak Academy', 'Loony Corn', 'Stephen Grider',
       'John Elder', 'Stone River eLearning', 'new_title_desc'],
      dtype='object')

In [27]:
# Average review score in each topic
inner_merged_total.groupby('topic')['reviews_avg'].mean()

topic
0    4.435000
1    4.500000
2    4.280000
3    4.372727
Name: reviews_avg, dtype: float64

In [28]:
# Total number of review in each group
inner_merged_total.groupby('topic')['reviews_count'].sum()

topic
0    14575.0
1    82829.0
2    21647.0
3    39397.0
Name: reviews_count, dtype: float64

In [29]:
# Total number of student in each group
inner_merged_total.groupby('topic')['students_count'].sum()

topic
0    229856.0
1    752889.0
2    911616.0
3    341346.0
Name: students_count, dtype: float64

From topic modelling result, we can see that Udemy provides 4 types of courses for different customer segments: 
- The first one is beginning level courses. In this type, there are around 60% of courses put emphasis on step-by-step teaching style. Therefore, these courses are suitable for those customers who don't have background about any topics and would like to learn from basic level.   
- The second type is about data science. More than 60% of course descriptions in this topic mention about data science, machine learning and data analysis. Therefore, these courses are suitable for those customers who would like to gain skills such as data manipulation, data visualization, statistical analysis and model building. They can also learn about how to utilize data science such as time series analysis and natural language processing.
- The third type is about web development. Beside only mentioning about web development in the description, there are around 25% of courses descriptions mention that they will teach about  coding skills such as html, javascript, and css. These are all important skills for customers who want to learn how to create their own websites.
- The last type of course emphasizes on real world application. Some customers would care more about how to apply what they learn to solve real-world problem rather than just learning the skills. Therefore, they would be more willing to take courses whose description contains keyword such as "real world", "application", "practical".


In addition, after I dive into details of top 20 courses in each type, I found out that the total number of students in the web development cources are highest compared to other types. However, these courses got the least percentage of reviews. That is to say, only 2% of students gave reviews. And they also got the lowest review score among four types. Therefore, I suggest that Udemy can put more effort on gathering feedbacks from students who take web development courses to understand the biggest pain points. For example, Udemy can send each of them an email to ask about feedback or embed forms inside online course lessons. Replying for all reviews may help as well since it shows how much you care about your customers if you provide a reasonable and polite explanation to negative reviews. Thus, negative reviews will not only help uncover ways to improve courses, but potential customers will also not be discouraged by reading the negative reviews.

# Part II. Emotion Classification (5 pts)

Use the `datasets/emotions_dataset.zip` (see the original Dataset source on [Kaggle](https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp)) to build a classification model that predicts the emotion of sentence. If you would like, you may classify only the top 4 emotions, and group all other classes as `Other`. 

In order to earn full credit, you must:

* Show the performance of your model with `CountVectorizer`, `TfIdfVectorizer`, `word2vec`, and `glove` embeddings.
    - for `word2vec`, make sure not to use the `en_core_web_sm` dataset (these are not real embeddings)
* Perform text preprocessing (or explain why it was not necessary):
    - stopword removal
    - ngram tokenization
    - stemming/lemmatization
    - fuzzy matching / regex cleaning / etc. (as you deem necessary, but show that you analyzed the text to make your decision)
* Show **AUROC / F1 scores** for on the holdout (test + validation) datasets.
* A brief discussion (2-3 sentences) of what could improve your model and why.

In [30]:
df_train = pd.read_csv("../datasets/emotions/train.txt", delimiter=';', header=None, names=['sentence','label'])
df_test = pd.read_csv("../datasets/emotions/test.txt", delimiter=';', header=None, names=['sentence','label'])
df_val = pd.read_csv("../datasets/emotions/val.txt", delimiter=';', header=None, names=['sentence','label'])

In [31]:
df_train["dataset_type"] = "train"
df_test["dataset_type"] = "train"
df_val["dataset_type"] = "test"
df = pd.concat([df_train, df_test, df_val])
df.head()

Unnamed: 0,sentence,label,dataset_type
0,i didnt feel humiliated,sadness,train
1,i can go from feeling so hopeless to so damned...,sadness,train
2,im grabbing a minute to post i feel greedy wrong,anger,train
3,i am ever feeling nostalgic about the fireplac...,love,train
4,i am feeling grouchy,anger,train


In [32]:
df['label'].unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

## Decide Rexget target

In [33]:
sen = df['sentence'].tolist()
sen_vec = Countvec(words=sen, Ngram=(2,2), token_pattern=r'[a-zA-Z]{3,}')
sen_vec.sum().sort_values(ascending=False)[:50]

(20000, 102528)


feel like           2169
feeling little       269
just feel            239
feel little          226
feeling like         209
make feel            207
feeling bit          203
href http            195
feel really          185
feeling pretty       184
makes feel           180
don feel             163
feel bit             162
ive feeling          141
didnt feel           138
dont know            128
feel pretty          123
feeling really       123
help feel            122
want feel            122
really feel          115
left feeling         115
feels like           112
know feel            111
feeling quite         98
just feeling          96
feel need             95
remember feeling      93
did feel              90
feel passionate       83
starting feel         81
feel quite            80
people feel           78
little bit            77
feel blessed          75
feel way              74
dont want             72
don know              71
feel accepted         70
didn feel             70


In [34]:
def word_replace_part2(line):
    line = line.lower()
    line = re.sub(r'\b(feel((ing)?|s?)|like)\b', '', line)
    line = re.sub(r'\b(href|https?|https:|www|just|im|ive)\b', '', line)
    return line

In [35]:
df['sentence'] = df['sentence'].apply(lambda x: word_replace_part2(x))

## CountVectorizer & TF-ITF

In [36]:
def Vectorize(Vectorization, Ngram, text, min_df=0.005, max_df=0.5):
    if Vectorization == 'COUNT':
        vectorizer = CountVectorizer(stop_words = 'english', ngram_range=Ngram, min_df=min_df, max_df=max_df)
        X = vectorizer.fit_transform(text)
        Count = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
        
        return Count
        
    elif Vectorization == 'TFIDF':
        vectorizer = TfidfVectorizer(stop_words = 'english', ngram_range=Ngram, min_df=min_df, max_df=max_df)
        corpus = list(text.values)
        X = vectorizer.fit_transform(corpus)
        terms = vectorizer.get_feature_names()
        tf_idf = pd.DataFrame(X.toarray(), columns=terms)
        
        return tf_idf

In [37]:
def train_test_vec(df, Vectorization, Ngram, min_df=0.005, max_df=0.5):
    vec_df = Vectorize(Vectorization, Ngram, df['sentence'])
    train = df[df["dataset_type"] == 'train']
    test = df[df["dataset_type"] == 'test']
    X_train = vec_df.loc[:17999, :]
    y_train = train["label"]
    X_test = vec_df.loc[18000:, :]
    y_test = test["label"]
    
    ohe = OneHotEncoder()
    ohe.fit_transform(np.array(y_train).reshape(1, -1))
    ohe.fit_transform(np.array(y_test).reshape(1, -1))

    return X_train, X_test, y_train, y_test

### Stemming

In [38]:
def stemming(text):
    stemmer = PorterStemmer()
    word_tokens = nltk.word_tokenize(text)
    word_tokens = [stemmer.stem(t) for t in word_tokens]  
    cleaned_review = " ".join(word_tokens)
        
    return cleaned_review

In [39]:
df_stem = df.copy()

In [40]:
df_stem['sentence'] = df_stem['sentence'].apply(lambda x: stemming(x))

CountVectorizer, Ngram = (1, 1)

In [41]:
X_train, X_test, y_train, y_test = train_test_vec(df_stem, Vectorization = 'COUNT', Ngram = (1, 1))

In [42]:
# Logistic Regression
log = LogisticRegression(max_iter=1000)
log.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [43]:
def get_accuracy(model, X_train, X_test, y_train, y_test):
    training_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    print('training accuracy:', accuracy_score(y_train, training_predictions))
    print('test accuracy:', accuracy_score(y_test, test_predictions))     

In [44]:
def get_ROCAUC_f1(model, X_train, X_test, y_train, y_test):
    training_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    print('test classification report:')
    print(classification_report(y_test, test_predictions))
    
    test_predictions = OneHotEncoder().fit_transform(test_predictions.reshape(-1, 1)).toarray()
    print('')
    print('test ROCAUC score:', roc_auc_score(y_test, test_predictions, multi_class='ovo'))

In [45]:
get_accuracy(log, X_train, X_test, y_train, y_test)

training accuracy: 0.5277222222222222
test accuracy: 0.5005


In [46]:
get_ROCAUC_f1(log, X_train, X_test, y_train, y_test)

test classification report:
              precision    recall  f1-score   support

       anger       0.59      0.25      0.35       275
        fear       0.48      0.25      0.33       212
         joy       0.53      0.71      0.61       704
        love       0.48      0.24      0.32       178
     sadness       0.45      0.57      0.50       550
    surprise       0.47      0.26      0.33        81

    accuracy                           0.50      2000
   macro avg       0.50      0.38      0.41      2000
weighted avg       0.51      0.50      0.48      2000


test ROCAUC score: 0.6287594728644657


In [47]:
# SVM
svm = SVC(kernel = 'linear', C = 1)
svm.fit(X_train, y_train)

SVC(C=1, kernel='linear')

In [48]:
get_accuracy(svm, X_train, X_test, y_train, y_test)

training accuracy: 0.5306111111111111
test accuracy: 0.501


In [49]:
get_ROCAUC_f1(svm, X_train, X_test, y_train, y_test)

test classification report:
              precision    recall  f1-score   support

       anger       0.64      0.26      0.37       275
        fear       0.51      0.25      0.34       212
         joy       0.60      0.58      0.59       704
        love       0.48      0.31      0.38       178
     sadness       0.41      0.70      0.52       550
    surprise       0.62      0.30      0.40        81

    accuracy                           0.50      2000
   macro avg       0.54      0.40      0.43      2000
weighted avg       0.53      0.50      0.49      2000


test ROCAUC score: 0.6407842950564223


CountVectorizer, Ngram = (2, 2)

In [50]:
X_train, X_test, y_train, y_test = train_test_vec(df_stem, Vectorization = 'COUNT', Ngram = (2, 2))

In [51]:
log = LogisticRegression(max_iter=1000)
log.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [52]:
get_accuracy(log, X_train, X_test, y_train, y_test)

training accuracy: 0.3368888888888889
test accuracy: 0.349


If we set ngram=(2,2), the performance is worse than (1,1). Therefore, I will only set ngram=(1,1) afterwards.

TfIdfVectorizer, Ngram = (1, 1)

In [53]:
X_train, X_test, y_train, y_test = train_test_vec(df_stem, Vectorization = 'TFIDF', Ngram = (1, 1))

In [54]:
log = LogisticRegression(max_iter=1000)
log.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [55]:
get_accuracy(log, X_train, X_test, y_train, y_test)

training accuracy: 0.5346666666666666
test accuracy: 0.4995


In [56]:
get_ROCAUC_f1(log, X_train, X_test, y_train, y_test)

test classification report:
              precision    recall  f1-score   support

       anger       0.60      0.23      0.34       275
        fear       0.50      0.25      0.33       212
         joy       0.57      0.64      0.60       704
        love       0.48      0.22      0.31       178
     sadness       0.43      0.68      0.53       550
    surprise       0.49      0.22      0.31        81

    accuracy                           0.50      2000
   macro avg       0.51      0.37      0.40      2000
weighted avg       0.51      0.50      0.48      2000


test ROCAUC score: 0.6245178887668288


### Lemmatization

In [57]:
df_lem = df.copy()

In [58]:
df_lem['sentence'] = df_lem['sentence'].apply(lambda x: lem(x))

CountVectorizer, Ngram = (1, 1)

In [59]:
X_train, X_test, y_train, y_test = train_test_vec(df_stem, Vectorization = 'COUNT', Ngram = (1, 1))

In [60]:
log = LogisticRegression(max_iter=1000)
log.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [61]:
get_accuracy(log, X_train, X_test, y_train, y_test)

training accuracy: 0.5277222222222222
test accuracy: 0.5005


In [62]:
get_ROCAUC_f1(log, X_train, X_test, y_train, y_test)

test classification report:
              precision    recall  f1-score   support

       anger       0.59      0.25      0.35       275
        fear       0.48      0.25      0.33       212
         joy       0.53      0.71      0.61       704
        love       0.48      0.24      0.32       178
     sadness       0.45      0.57      0.50       550
    surprise       0.47      0.26      0.33        81

    accuracy                           0.50      2000
   macro avg       0.50      0.38      0.41      2000
weighted avg       0.51      0.50      0.48      2000


test ROCAUC score: 0.6287594728644657


TfIdfVectorizer, Ngram = (1, 1)

In [63]:
X_train, X_test, y_train, y_test = train_test_vec(df_stem, Vectorization = 'TFIDF', Ngram = (1, 1))

In [64]:
log = LogisticRegression(max_iter=1000)
log.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [65]:
get_accuracy(log, X_train, X_test, y_train, y_test)

training accuracy: 0.5346666666666666
test accuracy: 0.4995


In [66]:
get_ROCAUC_f1(log, X_train, X_test, y_train, y_test)

test classification report:
              precision    recall  f1-score   support

       anger       0.60      0.23      0.34       275
        fear       0.50      0.25      0.33       212
         joy       0.57      0.64      0.60       704
        love       0.48      0.22      0.31       178
     sadness       0.43      0.68      0.53       550
    surprise       0.49      0.22      0.31        81

    accuracy                           0.50      2000
   macro avg       0.51      0.37      0.40      2000
weighted avg       0.51      0.50      0.48      2000


test ROCAUC score: 0.6245178887668288


## 2wordvec

In [67]:
df_2word = df.copy()

In [68]:
# load the language model, but we disable the ner (named entity recognition) and parser (dependency parser)
# since we don't need them for our use case to speed things up
nlp = spacy.load('en_core_web_md', disable = ['ner', 'parser'])

In [69]:
def process_text(text):
    """
    This function will use Spacy to perform stopword removal and lemmatization.
    """
    doc = nlp(text)
    processed_text = " ".join([token.lemma_ for token in doc if not token.is_stop])
    # this will get the word2vec embeddings for the processed text 
    # (the average of each token in the doc's word2vec embeddings)
    return np.array(nlp(processed_text).vector)

In [70]:
df_2word["vectors"] = df_2word.sentence.apply(process_text)

In [71]:
# It's word2vec embeddings
df_2word["vectors"]

0       [-2.4662, 3.17951, -2.4262333, 0.4203433, 0.96...
1       [0.68306357, 2.1139503, -0.7887706, -1.3781472...
2       [0.1790057, 0.23212998, -0.13869858, -0.215511...
3       [-1.293602, -0.37806198, -1.14107, -2.05672, 0...
4       [-2.3366, -3.68105, 0.130635, 1.26235, 2.42735...
                              ...                        
1995    [0.2973494, 1.2516923, 0.22463411, 0.84789705,...
1996    [-0.058069196, 1.2074139, -2.9435894, 1.047504...
1997    [1.5450816, -0.47757176, -1.9738317, -1.599271...
1998    [0.38225588, 2.0206482, -2.6140335, -2.5816183...
1999    [-1.4125922, 0.21994796, -2.5792959, -1.27465,...
Name: vectors, Length: 20000, dtype: object

In [72]:
def train_test_2wordvec(df):
    train = df[df["dataset_type"] == 'train']
    test = df[df["dataset_type"] == 'test']
    X_train = np.array([vector for vector in train["vectors"]])
    y_train = train["label"]
    X_test = np.array([vector for vector in test["vectors"]])
    y_test = test["label"]
    
    ohe = OneHotEncoder()
    ohe.fit_transform(np.array(y_train).reshape(1, -1))
    ohe.fit_transform(np.array(y_test).reshape(1, -1))

    return X_train, X_test, y_train, y_test

In [73]:
X_train, X_test, y_train, y_test = train_test_2wordvec(df_2word)

In [74]:
log = LogisticRegression(max_iter=1000)
log.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [75]:
get_accuracy(log, X_train, X_test, y_train, y_test)

training accuracy: 0.6543333333333333
test accuracy: 0.634


In [76]:
get_ROCAUC_f1(log, X_train, X_test, y_train, y_test)

test classification report:
              precision    recall  f1-score   support

       anger       0.61      0.49      0.55       275
        fear       0.57      0.47      0.52       212
         joy       0.68      0.77      0.72       704
        love       0.57      0.41      0.48       178
     sadness       0.62      0.71      0.66       550
    surprise       0.54      0.36      0.43        81

    accuracy                           0.63      2000
   macro avg       0.60      0.54      0.56      2000
weighted avg       0.63      0.63      0.63      2000


test ROCAUC score: 0.7210914709570679


In [77]:
# SVM
svm = SVC(kernel = 'linear', C = 1)
svm.fit(X_train, y_train)

SVC(C=1, kernel='linear')

In [78]:
get_accuracy(svm, X_train, X_test, y_train, y_test)

training accuracy: 0.6721666666666667
test accuracy: 0.6355


In [79]:
get_ROCAUC_f1(svm, X_train, X_test, y_train, y_test)

test classification report:
              precision    recall  f1-score   support

       anger       0.57      0.51      0.54       275
        fear       0.56      0.47      0.51       212
         joy       0.68      0.78      0.73       704
        love       0.58      0.40      0.47       178
     sadness       0.64      0.69      0.67       550
    surprise       0.55      0.37      0.44        81

    accuracy                           0.64      2000
   macro avg       0.60      0.54      0.56      2000
weighted avg       0.63      0.64      0.63      2000


test ROCAUC score: 0.7219693816031132


What could my model improve:
- To get better performance, we could try different supervised learning models and find the best parameters to put into each model using GridSearCV.
- Some ‘negative words in the text such as “not” and “n’“t, which are regarded as the stop words. If we remove these stop words, we will reverse the meaning of the sentence. We might need to use sequential models to solve this problem.