In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np

import re
import os
import string
import operator
from itertools import islice
from collections import Counter

from datetime import date
import matplotlib.dates as dates

from nltk import ngrams
import nltk as nltk
from nltk.corpus import stopwords

In [2]:
sns.set_style('darkgrid')
sns.set_context('paper')
pd.set_option('display.max_colwidth', None)

In [3]:
punct_signs = list(string.punctuation)
punct_signs.append('…')
punct_signs.append('¿')
punct_signs.append('•')
punct_signs.append('”')
punct_signs.append('“')
punct_signs.append('–')
punct_signs.remove('&')
punct_signs.append('∑')

stop_words = stopwords.words('english') + ['also', 'could', ]

In [4]:
df = pd.read_csv('data/courses.csv')
df = df.fillna("")
df.columns

Index(['ID', 'Country', 'Univeristy', 'G2RWorldRank', 'G2RNationalRank',
       'QSWorldUniversityRanking2021', 'Program',
       'SchoolOfferingCourseProgram', 'CourseDescription', 'Course Unit',
       'CourseUnitGoals', 'CourseUnitOutcomes'],
      dtype='object')

In [5]:
import math
def clean_text(text):
    for p in punct_signs:
        text = text.replace(p, ' ')
    clean_text = text.lower().split()
    clean_text = [w for w in clean_text if w not in stop_words]
    return clean_text

## 50 most frequently used ngrams in description

In [6]:
#m is the number of top ngrams.
def getNGrams(text, n, m):
    h_dict = {}   
    ngramas = list(ngrams(text, n))
    for grams in ngramas:
        words = ' '.join(grams).strip()
        if words not in h_dict:
            h_dict[words] = 0
        h_dict[words] = h_dict[words] + 1
        
    sorted_dict = sorted(h_dict.items(), key=operator.itemgetter(1), reverse=True)
    top_values = list(islice(sorted_dict, m))
    return [(r[0], r[1], round(r[1]/len(ngramas)*100,2)) for r in top_values]

def printNgrams(data, m):
    Ngrams = []
    unigrams = getNGrams(data, 1, m)
    bigrams = getNGrams(data, 2 , m)
    trigrams = getNGrams(data, 3 , m)
    for i in range(0,m):
        Ngrams.append(unigrams[i] + bigrams[i] + trigrams[i])
    df = pd.DataFrame(Ngrams, columns=['Unigrams', 'Absolute Freq', 'Relative Freq', 
                                       'Bigrams', 'Absolute Freq', 'Relative Freq', 
                                       'Trigrams', 'Absolute Freq', 'Relative Freq',]) 
    return df

In [7]:
data = ' '.join(list(df['CourseUnitOutcomes']))
data  = clean_text(data)

In [8]:
dfNgrams = printNgrams(data, 50)
dfNgrams

Unnamed: 0,Unigrams,Absolute Freq,Relative Freq,Bigrams,Absolute Freq.1,Relative Freq.1,Trigrams,Absolute Freq.2,Relative Freq.2
0,data,99,2.1,computer science,19,0.4,identify formulate solve,12,0.25
1,engineering,66,1.4,machine learning,18,0.38,meet desired needs,8,0.17
2,understand,59,1.25,healthcare data,18,0.38,solutions global societal,8,0.17
3,design,53,1.12,health safety,13,0.28,global societal context,8,0.17
4,research,48,1.02,solutions global,13,0.28,engineering solutions global,7,0.15
5,science,46,0.98,identify formulate,12,0.25,students demonstrate ability,7,0.15
6,ability,44,0.93,formulate solve,12,0.25,ability apply engineering,6,0.13
7,apply,43,0.91,communicate effectively,11,0.23,apply engineering design,6,0.13
8,ethical,41,0.87,demonstrate ability,11,0.23,engineering design produce,6,0.13
9,evaluate,39,0.83,engineering problems,10,0.21,design produce solutions,6,0.13


## 30 Most used verbs and adjectives in description

In [9]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
nltk.download('wordnet');

[nltk_data] Downloading package punkt to /home/yadira/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yadira/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/yadira/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
#m is the number of top ngrams.
def getNPartsOfSpeech(text, m, tag):
    h_dict = {}   
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    #lemmatization and filtering allowed tags
    filtered_tags = [wordnet_lemmatizer.lemmatize(pt[0], pos="v") for pt in pos_tags if pt[1].startswith(tag)]
    
    for ft in filtered_tags:    
        if ft not in h_dict:
            h_dict[ft] = 0
        h_dict[ft] += 1
        
    sorted_dict = sorted(h_dict.items(), key=operator.itemgetter(1), reverse=True)
    top_values = list(islice(sorted_dict, m))
    return [(r[0], r[1], round(r[1]/len(filtered_tags)*100,2)) for r in top_values]

def printNPOS(data, m):
    postags = []
    
    verbs = getNPartsOfSpeech(data, m, 'V')
    verbs += [(None, None)] * (m - len(verbs))
    
    adjs = getNPartsOfSpeech(data, m , 'J')
    adjs +=[(None, None)] * (m - len(adjs))
    
    nouns = getNPartsOfSpeech(data, m , 'N')
    nouns +=[(None, None)] * (m - len(nouns))
    
    for i in range(0,m):
        if all(verbs[i]) or all(adj[i]) or all(nouns[i]):
            postags.append(verbs[i] + adjs[i] + nouns[i])
    df = pd.DataFrame(postags, columns=['Verbs', 'Absolute Freq', 'Relative Freq', 
                                       'Adjectives', 'Absolute Freq', 'Relative Freq',
                                        'Nouns', 'Absolute Freq', 'Relative Freq' ]) 
    return df

In [11]:
data = ' '.join(list(df['CourseUnitOutcomes']))

In [12]:
printNPOS(data, 30)

Unnamed: 0,Verbs,Absolute Freq,Relative Freq,Adjectives,Absolute Freq.1,Relative Freq.1,Nouns,Absolute Freq.2,Relative Freq.2
0,be,64,7.73,ethical,40,4.07,data,70,2.73
1,apply,36,4.35,able,34,3.46,engineer,65,2.54
2,use,31,3.74,social,27,2.75,research,47,1.83
3,identify,23,2.78,global,26,2.64,ability,44,1.72
4,relate,21,2.54,appropriate,25,2.54,science,43,1.68
5,meet,20,2.42,environmental,23,2.34,knowledge,34,1.33
6,understand,20,2.42,societal,22,2.24,design,33,1.29
7,solve,17,2.05,professional,19,1.93,computer,33,1.29
8,evaluate,17,2.05,economic,15,1.53,approach,28,1.09
9,demonstrate,17,2.05,critical,13,1.32,healthcare,27,1.05
