In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np

import re
import os
import string
import operator
from itertools import islice
from collections import Counter

from datetime import date
import matplotlib.dates as dates

from nltk import ngrams
import nltk as nltk
from nltk.corpus import stopwords

In [2]:
sns.set_style('darkgrid')
sns.set_context('paper')
pd.set_option('display.max_colwidth', None)

In [3]:
punct_signs = list(string.punctuation)
punct_signs.append('…')
punct_signs.append('¿')
punct_signs.append('•')
punct_signs.append('”')
punct_signs.append('“')
punct_signs.append('–')
punct_signs.remove('&')
punct_signs.append('∑')

stop_words = stopwords.words('english') + ['also', 'could', ]

In [5]:
import math
def clean_text(text):
    for p in punct_signs:
        text = text.replace(p, ' ')
    clean_text = text.lower().split()
    clean_text = [w for w in clean_text if w not in stop_words]
    return clean_text

## 50 most frequently used ngrams in description

In [6]:
#m is the number of top ngrams.
def getNGrams(text, n, m):
    h_dict = {}   
    ngramas = list(ngrams(text, n))
    for grams in ngramas:
        words = ' '.join(grams).strip()
        if words not in h_dict:
            h_dict[words] = 0
        h_dict[words] = h_dict[words] + 1
        
    sorted_dict = sorted(h_dict.items(), key=operator.itemgetter(1), reverse=True)
    top_values = list(islice(sorted_dict, m))
    return [(r[0], r[1], round(r[1]/len(ngramas)*100,2)) for r in top_values]

def printNgrams(data, m):
    Ngrams = []
    unigrams = getNGrams(data, 1, m)
    bigrams = getNGrams(data, 2 , m)
    trigrams = getNGrams(data, 3 , m)
    for i in range(0,m):
        Ngrams.append(unigrams[i] + bigrams[i] + trigrams[i])
    df = pd.DataFrame(Ngrams, columns=['Unigrams', 'Absolute Freq', 'Relative Freq', 
                                       'Bigrams', 'Absolute Freq', 'Relative Freq', 
                                       'Trigrams', 'Absolute Freq', 'Relative Freq',]) 
    return df

In [9]:
df = df.fillna('')
data = #ADD_YOUR_TEXT as String e.g "This is my text"
data  = clean_text(data)

In [10]:
dfNgrams = printNgrams(data, 50)
dfNgrams

Unnamed: 0,Unigrams,Absolute Freq,Relative Freq,Bigrams,Absolute Freq.1,Relative Freq.1,Trigrams,Absolute Freq.2,Relative Freq.2
0,course,330,1.43,science technology,68,0.29,artificial intelligence ai,14,0.06
1,data,294,1.27,machine learning,65,0.28,electrical computer engineering,13,0.06
2,students,289,1.25,computer science,55,0.24,science technology studies,10,0.04
3,technology,257,1.11,artificial intelligence,55,0.24,human centered design,10,0.04
4,science,247,1.07,topics include,46,0.2,human computer interaction,10,0.04
5,social,226,0.98,data science,44,0.19,science technology society,9,0.04
6,engineering,180,0.78,security privacy,39,0.17,obtain hands experience,7,0.03
7,computer,177,0.77,case studies,38,0.16,humanities social sciences,7,0.03
8,systems,175,0.76,public policy,34,0.15,science technology medicine,7,0.03
9,issues,168,0.73,decision making,29,0.13,participants obtain hands,6,0.03


## 30 Most used verbs and adjectives in description

In [11]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
nltk.download('wordnet');

[nltk_data] Downloading package punkt to /home/yadira/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yadira/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/yadira/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
#m is the number of top ngrams.
def getNPartsOfSpeech(text, m, tag):
    h_dict = {}   
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    #lemmatization and filtering allowed tags
    filtered_tags = [wordnet_lemmatizer.lemmatize(pt[0], pos="v") for pt in pos_tags if pt[1].startswith(tag)]
    
    for ft in filtered_tags:    
        if ft not in h_dict:
            h_dict[ft] = 0
        h_dict[ft] += 1
        
    sorted_dict = sorted(h_dict.items(), key=operator.itemgetter(1), reverse=True)
    top_values = list(islice(sorted_dict, m))
    return [(r[0], r[1], round(r[1]/len(filtered_tags)*100,2)) for r in top_values]

def printNPOS(data, m):
    postags = []
    
    verbs = getNPartsOfSpeech(data, m, 'V')
    verbs += [(None, None)] * (m - len(verbs))
    
    adjs = getNPartsOfSpeech(data, m , 'J')
    adjs +=[(None, None)] * (m - len(adjs))
    
    nouns = getNPartsOfSpeech(data, m , 'N')
    nouns +=[(None, None)] * (m - len(nouns))
    
    for i in range(0,m):
        if all(verbs[i]) or all(adj[i]) or all(nouns[i]):
            postags.append(verbs[i] + adjs[i] + nouns[i])
    df = pd.DataFrame(postags, columns=['Verbs', 'Absolute Freq', 'Relative Freq', 
                                       'Adjectives', 'Absolute Freq', 'Relative Freq',
                                        'Nouns', 'Absolute Freq', 'Relative Freq' ]) 
    return df

In [13]:
data = ' '.join(list(df['CourseDescription']))

In [14]:
printNPOS(data, 30)

Unnamed: 0,Verbs,Absolute Freq,Relative Freq,Adjectives,Absolute Freq.1,Relative Freq.1,Nouns,Absolute Freq.2,Relative Freq.2
0,be,635,14.44,social,216,4.85,course,344,2.45
1,include,183,4.16,ethical,136,3.05,data,252,1.79
2,have,140,3.18,such,91,2.04,technology,231,1.64
3,use,101,2.3,human,89,2.0,students,212,1.51
4,provide,93,2.11,public,74,1.66,science,202,1.44
5,learn,69,1.57,new,60,1.35,issue,173,1.23
6,develop,61,1.39,technical,52,1.17,engineer,169,1.2
7,explore,58,1.32,legal,51,1.14,systems,167,1.19
8,compute,57,1.3,artificial,48,1.08,design,155,1.1
9,introduce,47,1.07,scientific,48,1.08,society,131,0.93
