In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np

import re
import os
import string
import operator
from itertools import islice
from collections import Counter

from datetime import date
import matplotlib.dates as dates

from nltk import ngrams
import nltk as nltk
from nltk.corpus import stopwords

In [2]:
sns.set_style('darkgrid')
sns.set_context('paper')
pd.set_option('display.max_colwidth', None)

In [3]:
punct_signs = list(string.punctuation)
punct_signs.append('…')
punct_signs.append('¿')
punct_signs.append('•')
punct_signs.append('”')
punct_signs.append('“')
punct_signs.append('–')
punct_signs.remove('&')
punct_signs.append('∑')

stop_words = stopwords.words('english') + ['also', 'could', ]

In [4]:
df = pd.read_csv('data/courses.csv')
df = df.fillna("")
df.columns

Index(['ID', 'Country', 'Univeristy', 'G2RWorldRank', 'G2RNationalRank',
       'QSWorldUniversityRanking2021', 'Program',
       'SchoolOfferingCourseProgram', 'CourseDescription', 'Course Unit',
       'CourseUnitGoals', 'CourseUnitOutcomes'],
      dtype='object')

In [5]:
import math
def clean_text(text):
    for p in punct_signs:
        text = text.replace(p, ' ')
    clean_text = text.lower().split()
    clean_text = [w for w in clean_text if w not in stop_words]
    return clean_text

## 50 most frequently used ngrams in description

In [6]:
#m is the number of top ngrams.
def getNGrams(text, n, m):
    h_dict = {}   
    ngramas = list(ngrams(text, n))
    for grams in ngramas:
        words = ' '.join(grams).strip()
        if words not in h_dict:
            h_dict[words] = 0
        h_dict[words] = h_dict[words] + 1
        
    sorted_dict = sorted(h_dict.items(), key=operator.itemgetter(1), reverse=True)
    top_values = list(islice(sorted_dict, m))
    return [(r[0], r[1], round(r[1]/len(ngramas)*100,2)) for r in top_values]

def printNgrams(data, m):
    Ngrams = []
    unigrams = getNGrams(data, 1, m)
    bigrams = getNGrams(data, 2 , m)
    trigrams = getNGrams(data, 3 , m)
    for i in range(0,m):
        Ngrams.append(unigrams[i] + bigrams[i] + trigrams[i])
    df = pd.DataFrame(Ngrams, columns=['Unigrams', 'Absolute Freq', 'Relative Freq', 
                                       'Bigrams', 'Absolute Freq', 'Relative Freq', 
                                       'Trigrams', 'Absolute Freq', 'Relative Freq',]) 
    return df

In [7]:
data = ' '.join(list(df['CourseUnitGoals']))
data  = clean_text(data)

In [8]:
dfNgrams = printNgrams(data, 50)
dfNgrams

Unnamed: 0,Unigrams,Absolute Freq,Relative Freq,Bigrams,Absolute Freq.1,Relative Freq.1,Trigrams,Absolute Freq.2,Relative Freq.2
0,data,76,2.1,machine learning,27,0.74,artificial intelligence machine,5,0.14
1,learning,45,1.24,healthcare data,22,0.61,intelligence machine learning,5,0.14
2,students,36,0.99,artificial intelligence,14,0.39,participants understand 1,4,0.11
3,ethical,35,0.96,e g,9,0.25,ai machine learning,4,0.11
4,machine,27,0.74,science technology,8,0.22,use healthcare data,4,0.11
5,healthcare,27,0.74,data analysis,7,0.19,machine learning algorithms,4,0.11
6,design,26,0.72,provide students,6,0.17,provide professionally relevant,3,0.08
7,understanding,25,0.69,students learn,6,0.17,professionally relevant teaching,3,0.08
8,technology,25,0.69,big data,5,0.14,relevant teaching learning,3,0.08
9,ai,25,0.69,intelligence machine,5,0.14,successful understanding utilisation,3,0.08


## 30 Most used verbs and adjectives in description

In [9]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
nltk.download('wordnet');

[nltk_data] Downloading package punkt to /home/yadira/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yadira/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/yadira/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
#m is the number of top ngrams.
def getNPartsOfSpeech(text, m, tag):
    h_dict = {}   
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    #lemmatization and filtering allowed tags
    filtered_tags = [wordnet_lemmatizer.lemmatize(pt[0], pos="v") for pt in pos_tags if pt[1].startswith(tag)]
    
    for ft in filtered_tags:    
        if ft not in h_dict:
            h_dict[ft] = 0
        h_dict[ft] += 1
        
    sorted_dict = sorted(h_dict.items(), key=operator.itemgetter(1), reverse=True)
    top_values = list(islice(sorted_dict, m))
    return [(r[0], r[1], round(r[1]/len(filtered_tags)*100,2)) for r in top_values]

def printNPOS(data, m):
    postags = []
    
    verbs = getNPartsOfSpeech(data, m, 'V')
    verbs += [(None, None)] * (m - len(verbs))
    
    adjs = getNPartsOfSpeech(data, m , 'J')
    adjs +=[(None, None)] * (m - len(adjs))
    
    nouns = getNPartsOfSpeech(data, m , 'N')
    nouns +=[(None, None)] * (m - len(nouns))
    
    for i in range(0,m):
        if all(verbs[i]) or all(adj[i]) or all(nouns[i]):
            postags.append(verbs[i] + adjs[i] + nouns[i])
    df = pd.DataFrame(postags, columns=['Verbs', 'Absolute Freq', 'Relative Freq', 
                                       'Adjectives', 'Absolute Freq', 'Relative Freq',
                                        'Nouns', 'Absolute Freq', 'Relative Freq' ]) 
    return df

In [11]:
data = ' '.join(list(df['CourseUnitGoals']))

In [12]:
printNPOS(data, 30)

Unnamed: 0,Verbs,Absolute Freq,Relative Freq,Adjectives,Absolute Freq.1,Relative Freq.1,Nouns,Absolute Freq.2,Relative Freq.2
0,be,73,10.96,ethical,34,4.58,data,55,2.75
1,learn,24,3.6,social,22,2.96,machine,26,1.3
2,include,21,3.15,different,15,2.02,students,25,1.25
3,understand,19,2.85,practical,13,1.75,learn,25,1.25
4,apply,17,2.55,artificial,13,1.75,engineer,24,1.2
5,have,13,1.95,current,13,1.75,skills,23,1.15
6,use,11,1.65,societal,12,1.62,analysis,23,1.15
7,identify,11,1.65,specific,10,1.35,understand,22,1.1
8,develop,10,1.5,relevant,9,1.21,course,21,1.05
9,make,9,1.35,able,9,1.21,challenge,21,1.05
