# **Extract Bag of Words (BoW) Features from Course Textual Content**


In [None]:
#import gensim
import pandas as pd
import nltk as nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary

In [None]:
# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer
# (word_tokenize), the English stop-word list, and the perceptron POS tagger
# (nltk.pos_tag).
# Newer NLTK releases (3.9+) look the tokenizer and tagger up under the
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' names, so both the legacy
# and the current names are requested. Asking for a name the installed NLTK
# index does not know only reports an error and returns False; it does not raise.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Location of the preprocessed course catalogue (CSV hosted on cloud object storage).
course_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/course_processed.csv"
# Load it into a DataFrame; the preview below shows COURSE_ID, TITLE and DESCRIPTION columns.
df = pd.read_csv(course_url)

In [None]:
df.head()

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
0,ML0201EN,robots are coming build iot apps with watson ...,have fun with iot and learn along the way if ...
1,ML0122EN,accelerating deep learning with gpu,training complex deep learning models with lar...
2,GPXX0ZG0EN,consuming restful services using the reactive ...,learn how to use a reactive jax rs client to a...
3,RP0105EN,analyzing big data in r using apache spark,apache spark is a popular cluster computing fr...
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,learn how to containerize package and run a ...


In [None]:
# Combine title and description into a single text field per course.
# Missing values are replaced with "" first: NaN + str stays NaN in pandas,
# and a NaN text would make the tokenizer fail in the later tokenization step.
df["text"] = df["TITLE"].fillna("") + " " + df["DESCRIPTION"].fillna("")

In [None]:
# 1 - tokenize
def tokenize_course(course, keep_only_nouns=True):
    """Split a course text into word tokens, dropping stop words and numbers.

    Args:
        course: The text to tokenize (handed to NLTK's ``word_tokenize``).
        keep_only_nouns: When true, the remaining tokens are POS-tagged and
            every token whose tag appears in the exclusion set below is
            removed. NOTE: despite the name this is an exclusion filter --
            tags that are not listed (for instance 'JJ' or 'VB') are kept.

    Returns:
        The surviving tokens as a list, in their original order and casing.
    """
    english_stop_words = set(stopwords.words('english'))

    kept_tokens = []
    for candidate in word_tokenize(course):
        # Stop-word matching is case-insensitive; the token keeps its casing.
        if candidate.lower() in english_stop_words or candidate.isnumeric():
            continue
        kept_tokens.append(candidate)

    if not keep_only_nouns:
        return kept_tokens

    # Penn Treebank tags whose tokens are discarded.
    excluded_tags = {
        'WDT', 'WP', 'WRB', 'FW', 'IN', 'JJR', 'JJS', 'MD',
        'PDT', 'POS', 'PRP', 'RB', 'RBR', 'RBS', 'RP',
    }
    return [
        word
        for word, tag in nltk.pos_tag(kept_tokens)
        if tag not in excluded_tags
    ]

In [None]:
# Tokenize every course text; the result is one token list per course, in row order.
token = list(map(tokenize_course, df["text"].values))

In [None]:
# 2 - token to dict / visualize token dict
# Build a gensim Dictionary from the tokenized courses; it assigns an integer
# id to each distinct token.
token_dict = Dictionary(token)
# Display the token -> id mapping.
token_dict.token2id

{'ai': 0,
 'apps': 1,
 'build': 2,
 'cloud': 3,
 'coming': 4,
 'create': 5,
 'data': 6,
 'developer': 7,
 'found': 8,
 'fun': 9,
 'iot': 10,
 'irobot': 11,
 'learn': 12,
 'node': 13,
 'objects': 14,
 'pi': 15,
 'pictures': 16,
 'place': 17,
 'program': 18,
 'raspberry': 19,
 'raspcam': 20,
 'read': 21,
 'recognize': 22,
 'red': 23,
 'robot': 24,
 'robots': 25,
 'services': 26,
 'swift': 27,
 'take': 28,
 'temperature': 29,
 'use': 30,
 'want': 31,
 'watson': 32,
 'way': 33,
 'accelerate': 34,
 'accelerated': 35,
 'accelerating': 36,
 'analyze': 37,
 'based': 38,
 'benefit': 39,
 'caffe': 40,
 'case': 41,
 'chips': 42,
 'classification': 43,
 'comfortable': 44,
 'complex': 45,
 'computations': 46,
 'convolutional': 47,
 'course': 48,
 'datasets': 49,
 'deep': 50,
 'dependencies': 51,
 'deploy': 52,
 'designed': 53,
 'feel': 54,
 'google': 55,
 'gpu': 56,
 'hardware': 57,
 'house': 58,
 'ibm': 59,
 'images': 60,
 'including': 61,
 'inference': 62,
 'large': 63,
 'learning': 64,
 'librari

In [None]:
token_dict

<gensim.corpora.dictionary.Dictionary at 0x7f071d88a520>

In [None]:
token_dict[0]

'ai'

In [None]:
# 3 - generate BoW features for each tokenized course.
# doc2bow turns one token list into a list of (token id, frequency) tuples,
# so courses_bow holds one such list per course, in the same order as `token`.
courses_bow = [token_dict.doc2bow(course) for course in token]

*   'doc_index': the course index starting from 0
*   'doc_id': the actual course id such as `ML0201EN`
*   'token': the tokens for each course
*   'bow': the BoW value of each token, i.e. how many times the token occurs in that course

In [None]:
# Flatten the per-course BoW lists into (course position, token text, count)
# records, then split them into the parallel column lists used downstream.
bow_records = [
    (course_pos, token_dict[tok_id], tok_count)
    for course_pos, course_pairs in enumerate(courses_bow)
    for tok_id, tok_count in course_pairs
]

doc_index = [record[0] for record in bow_records]
# doc_id is declared but never populated in this cell.
doc_id = []
# NOTE: this rebinds `token`, replacing the per-course token lists created in
# the tokenization cell with a flat list of token strings. Re-running the
# doc2bow cell after this one would therefore operate on the flat list.
token = [record[1] for record in bow_records]
bow = [record[2] for record in bow_records]

In [None]:
# Assemble the long-format BoW table from the three parallel column lists.
bow_columns = {
    "doc_index": doc_index,
    "token": token,
    "bow": bow,
}
df_bow = pd.DataFrame(bow_columns)

In [None]:
df_bow.head()

Unnamed: 0,doc_index,token,bow
0,0,ai,2
1,0,apps,2
2,0,build,2
3,0,cloud,1
4,0,coming,1


In [None]:
df.head()

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION,text
0,ML0201EN,robots are coming build iot apps with watson ...,have fun with iot and learn along the way if ...,robots are coming build iot apps with watson ...
1,ML0122EN,accelerating deep learning with gpu,training complex deep learning models with lar...,accelerating deep learning with gpu training c...
2,GPXX0ZG0EN,consuming restful services using the reactive ...,learn how to use a reactive jax rs client to a...,consuming restful services using the reactive ...
3,RP0105EN,analyzing big data in r using apache spark,apache spark is a popular cluster computing fr...,analyzing big data in r using apache spark apa...
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,learn how to containerize package and run a ...,containerizing packaging and running a sprin...


In [None]:
# Look up each BoW row's course by matching doc_index against df's row index.
# validate="m:1" makes pandas check that each doc_index maps to at most one
# row of df.
bow_with_courses = df_bow.merge(
    right=df,
    how="left",
    left_on="doc_index",
    right_index=True,
    validate="m:1",
)
# Keep only the BoW columns plus the course identifier.
finish_df = bow_with_courses.loc[:, ["doc_index", "COURSE_ID", "token", "bow"]]
finish_df.head()

Unnamed: 0,doc_index,COURSE_ID,token,bow
0,0,ML0201EN,ai,2
1,0,ML0201EN,apps,2
2,0,ML0201EN,build,2
3,0,ML0201EN,cloud,1
4,0,ML0201EN,coming,1


In [None]:
# Drop the first column (doc_index) and reshape to wide format: one row per
# COURSE_ID, one column per token, cell value = BoW count.
course_token_counts = finish_df.iloc[:, 1:]
pivot_df = course_token_counts.pivot(index='COURSE_ID', columns='token', values="bow")
# Tokens that never occur in a course come out of the pivot as NaN; store 0 instead.
pivot_df = pivot_df.fillna(0)
pivot_df.head()

token,1st,3d,ability,able,abstraction,aca,academia,academic,accelerate,accelerated,...,zoom,¬†,–,—,‘,‚,‚äì,“,”,•
COURSE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AI0111EN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BC0101EN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BC0201EN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BC0202EN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BD0101EN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# pivot_df.columns = pivot_df.columns.droplevel(0)  # not needed when `values` is specified in pivot
# Clear the axis labels ('token' on the columns, 'COURSE_ID' on the index)
# so the frame displays as a plain course-by-token matrix.
pivot_df = pivot_df.rename_axis(None, axis=1).rename_axis(None, axis=0)

In [None]:
pivot_df.head()

Unnamed: 0,1st,3d,ability,able,abstraction,aca,academia,academic,accelerate,accelerated,...,zoom,¬†,–,—,‘,‚,‚äì,“,”,•
AI0111EN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BC0101EN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BC0201EN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BC0202EN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BD0101EN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
