In [6]:
# Data+ Web Scraping Test
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import requests
from bs4 import BeautifulSoup
# Initialize variables (only for Data+ 2018 websites - figure out for other)
sites = ["https://bigdata.duke.edu/projects/co-curricular-technology-pathways-e-advisor"]
# Program descriptions
descriptions = []
# Final list of nouns
terms = []

# Access websites and extract information.
# Each page is parsed inside the loop so that every site in `sites` contributes
# a description; a page that fails to download is skipped rather than parsed.
for link in sites:
    try:
        # A timeout keeps the cell from hanging forever on an unresponsive host
        page = requests.get(link, timeout=30)
    except requests.RequestException as err:
        print("Skipping", link, "-", err)
        continue
    # Checks if page downloaded successfully
    if page.status_code != 200:
        print("Skipping", link, "- HTTP status", page.status_code)
        continue

    # Extracts HTML content on page ('lxml' works too)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Get the description: locate the 'Project Summary' text node and take
    # the element two steps after it in document order
    title = soup.find(text = 'Project Summary')
    if title is None:
        print("Skipping", link, "- no 'Project Summary' text found")
        continue
    body = title.next.next
    descriptions.append(body.get_text())

# Cleaning documents
tokenizer = RegexpTokenizer(r'\w+')
# `raw` ends up holding the lower-cased text of the last description.
# Default to an empty string so it is defined even when nothing was scraped.
raw = ""
for description in descriptions:
    print(description)
    raw = description.lower()

# Tokenization
#tokens = tokenizer.tokenize(raw)

# Create list with all words and POS
#print(nltk.pos_tag(tokens))
#print("\n")

# function to test if something is a noun (POS tag starts with 'NN')
lines = 'lines is some string of words'  # placeholder sentence for a smoke test
is_noun = lambda pos: pos[:2] == 'NN'
# do the nlp stuff: tokenize, tag, keep only the noun tokens
tokenized = nltk.word_tokenize(lines)
nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
print(nouns)

A team of students will work with Duke’s Office of Information Technology to conceptualize and potentially develop an “e-advisor” program that will help students navigate, augment, and map their way through Duke’s co-curricular ecosystem. The team of students will identify available data, programs and resources, define learning objectives, recommend common pathways and create a storyboard of the program building out a “master narrative” experience and prototype the branching and decision engine. Students will work with de-identified registration and advising data in a secure environment, have access to the analytics tools used in OIT, and will have an opportunity for exploration of the data in consultation with OIT and data analytics professionals.

['lines', 'string', 'words']


In [7]:
# TextBlob
# Purpose: pull noun phrases out of the description text, then filter them
from textblob import TextBlob

# Sanity-check TextBlob on a known sentence
print("Test TextBlob package")
txt = """Natural language processing (NLP) is a field of computer science, artificial intelligence, and computational linguistics concerned with the inter
actions between computers and human (natural) languages."""
blobTest = TextBlob(txt)
print(blobTest.noun_phrases, "\n")

# Noun phrases of the lower-cased description held in `raw`
print("Use TextBlob to extract tags")
blob = TextBlob(raw)
print(blob.noun_phrases, "\n")

# Clean the list
# First pass: drop phrases that are themselves entries of the stop-word list
print("Get rid of stop words using stop words package")
en_stop = get_stop_words('en')
words = [phrase for phrase in blob.noun_phrases if phrase not in en_stop]
print(words, "\n")

# Second pass: drop phrases that exactly equal one of the hand-picked words
print("Get rid of manually selected useless words")
a = ["team","students","opportunities", "work", "access", "s"]
new_words = [phrase for phrase in words if phrase not in a]
words = new_words
print(new_words, "\n")
# Each description contributes one list of phrases to `terms`
terms.append(new_words)

Test TextBlob package
['natural language processing', 'nlp', 'computer science', 'artificial intelligence', 'computational linguistics', 'inter actions'] 

Use TextBlob to extract tags
['duke ’ s office', 'information technology', '“ e-advisor ” program', 'students navigate', 'duke ’ s co-curricular ecosystem', 'available data', 'learning objectives', 'common pathways', 'program building', '“ master narrative ” experience', 'decision engine', 'analytics tools', 'data analytics professionals'] 

Get rid of stop words using stop words package
['duke ’ s office', 'information technology', '“ e-advisor ” program', 'students navigate', 'duke ’ s co-curricular ecosystem', 'available data', 'learning objectives', 'common pathways', 'program building', '“ master narrative ” experience', 'decision engine', 'analytics tools', 'data analytics professionals'] 

Get rid of manually selected useless words
['duke ’ s office', 'information technology', '“ e-advisor ” program', 'students navigate', 'du

In [9]:
# LDA

# Construct a document-term matrix: `terms` holds one list of phrases per
# description; each list is turned into a bag-of-words over `dictionary`.
dictionary = corpora.Dictionary(terms)
corpus = [dictionary.doc2bow(t) for t in terms]

# Apply the LDA model.
# LDA training is stochastic; a fixed random_state makes the printed topics
# reproducible across kernel restarts.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=50,
    random_state=42,
)
print(lda_model.print_topics(num_topics = 3, num_words = 5))

[(0, '0.077*"program building" + 0.077*"“ e-advisor ” program" + 0.077*"information technology" + 0.077*"students navigate" + 0.077*"duke ’ s office"'), (1, '0.077*"students navigate" + 0.077*"data analytics professionals" + 0.077*"program building" + 0.077*"learning objectives" + 0.077*"available data"'), (2, '0.077*"duke ’ s office" + 0.077*"decision engine" + 0.077*"common pathways" + 0.077*"information technology" + 0.077*"duke ’ s co-curricular ecosystem"')]


In [10]:
# RAKE 1
# Keyword extraction with the RAKE-tutorial implementation, using its
# bundled SMART stop list.
import sys
sys.path.append('RAKE-tutorial')
from nlp_rake import rake

stoppath = 'RAKE-tutorial/data/stoplists/SmartStoplist.txt'

# NOTE(review): the numeric arguments presumably are (min characters per word,
# max words per phrase, min keyword frequency) - confirm against the
# installed nlp_rake signature.
rake_object = rake.Rake(stoppath, 4, 3, 6)

# Read the sample description; the context manager closes the handle even if
# read() raises.
with open("/Users/derekliu/Desktop//Data+/21", 'r', encoding="iso-8859-1") as sample_file:
    text = sample_file.read()
keywords = rake_object.run(text)

# 3. print results
print("Keywords:", keywords)

ModuleNotFoundError: No module named 'rake'

In [11]:
# RAKE 2
# Keyword extraction with the rake_nltk package on a hard-coded copy of the
# project description.
from rake_nltk import Rake

# Uses stopwords for english from NLTK, and all punctuation characters by default
r = Rake()

# Extraction given the text.
myText = '''A team of students will work with Duke’s Office of Information Technology to 
conceptualize and potentially develop an “e-advisor” program that will help students navigate,
augment, and map their way through Duke’s co-curricular ecosystem. The team of students will 
identify available data, programs and resources, define learning objectives, recommend common 
pathways and create a storyboard of the program building out a “master narrative” experience
and prototype the branching and decision engine. Students will work with de-identified registration 
and advising data in a secure environment, have access to the analytics tools used in OIT, and 
will have an opportunity for exploration of the data in consultation with OIT and data analytics professionals.'''
# Must run before get_ranked_phrases(): this call feeds the text to the extractor
r.extract_keywords_from_text(myText)

# Last expression of the cell, so the ranked phrase list is shown as the cell output
r.get_ranked_phrases()

['“ master narrative ” experience',
 'advisor ” program',
 'recommend common pathways',
 'define learning objectives',
 'analytics tools used',
 'identify available data',
 'data analytics professionals',
 'help students navigate',
 '“ e',
 'program building',
 'advising data',
 'secure environment',
 'potentially develop',
 'information technology',
 'identified registration',
 'duke ’',
 'decision engine',
 'curricular ecosystem',
 'data',
 'students',
 'work',
 'way',
 'team',
 'storyboard',
 'resources',
 'prototype',
 'programs',
 'opportunity',
 'oit',
 'office',
 'map',
 'exploration',
 'de',
 'create',
 'consultation',
 'conceptualize',
 'co',
 'branching',
 'augment',
 'access']

In [13]:
# nltk select nouns

# Echo every scraped description and keep the lower-cased text of the last one
tokenizer = RegexpTokenizer(r'\w+')
for description in descriptions:
    print(description)
    raw = description.lower()

# Regex tokenization / full POS dump kept for reference:
#tokens = tokenizer.tokenize(raw)
#print(nltk.pos_tag(tokens))
#print("\n")


def is_noun(pos):
    """Return True when the POS tag begins with 'NN'."""
    return pos[:2] == 'NN'


# Text to analyse (hard-coded here; alternative source: lines = desc[3])
lines = 'A team of students will work with Duke’s Office of Information Technology to conceptualize and potentially develop an “e-advisor” program that will help students navigate, augment, and map their way through Duke’s co-curricular ecosystem. The team of students will identify available data, programs and resources, define learning objectives, recommend common pathways and create a storyboard of the program building out a “master narrative” experience and prototype the branching and decision engine. Students will work with de-identified registration and advising data in a secure environment, have access to the analytics tools used in OIT, and will have an opportunity for exploration of the data in consultation with OIT and data analytics professionals.'

# Tag every token and collect the ones tagged as nouns
tokenized = nltk.word_tokenize(lines)
nouns = []
for word, pos in nltk.pos_tag(tokenized):
    if is_noun(pos):
        nouns.append(word)
print(nouns)

A team of students will work with Duke’s Office of Information Technology to conceptualize and potentially develop an “e-advisor” program that will help students navigate, augment, and map their way through Duke’s co-curricular ecosystem. The team of students will identify available data, programs and resources, define learning objectives, recommend common pathways and create a storyboard of the program building out a “master narrative” experience and prototype the branching and decision engine. Students will work with de-identified registration and advising data in a secure environment, have access to the analytics tools used in OIT, and will have an opportunity for exploration of the data in consultation with OIT and data analytics professionals.

['team', 'students', 'Duke', '’', 'Office', 'Information', 'Technology', '”', 'program', 'students', 'augment', 'way', 'Duke', '’', 'ecosystem', 'team', 'students', 'data', 'programs', 'resources', 'define', 'objectives', 'pathways', 'story

In [16]:
# More advanced nltk cleaning text and extracting nouns
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Clean Descriptions
# load data
# filename = '/Users/derekliu/Desktop/DukeGroups.txt'
filename = '/Users/derekliu/Desktop/Data+/21.txt'

# The context manager closes the file even if read() raises
with open(filename, 'rt') as file:
    text = file.read()

# split into words
tokens = word_tokenize(text)
# convert to lower case
tokens = [w.lower() for w in tokens]
# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]
print(words[:100])
# keep only tokens tagged as nouns; `is_noun` comes from an earlier cell
nouns = [word for (word, pos) in nltk.pos_tag(words) if is_noun(pos)]
print(nouns)

['team', 'students', 'work', 'duke', 'office', 'information', 'technology', 'conceptualize', 'potentially', 'develop', 'eadvisor', 'program', 'help', 'students', 'navigate', 'augment', 'map', 'way', 'duke', 'cocurricular', 'ecosystem', 'team', 'students', 'identify', 'available', 'data', 'programs', 'resources', 'define', 'learning', 'objectives', 'recommend', 'common', 'pathways', 'create', 'storyboard', 'program', 'building', 'master', 'narrative', 'experience', 'prototype', 'branching', 'decision', 'engine', 'students', 'work', 'deidentified', 'registration', 'advising', 'data', 'secure', 'environment', 'access', 'analytics', 'tools', 'used', 'oit', 'opportunity', 'exploration', 'data', 'consultation', 'oit', 'data', 'analytics', 'professionals']
['team', 'students', 'office', 'information', 'technology', 'conceptualize', 'eadvisor', 'program', 'help', 'students', 'map', 'way', 'ecosystem', 'team', 'students', 'data', 'programs', 'resources', 'objectives', 'pathways', 'program', 'bu

In [17]:
# Add more words to stop_words to make it more accurate
from nltk.corpus import stopwords

# Hand-picked words to filter out in addition to NLTK's English stop words
trivial = [
    "team", "students", "opportunities", "work", "access", "duke", "student",
    "effort", "consumer", "consumers", "club", "home", "school", "schools",
    "act", "day", "seafood", "africa", "anthropology", "faculty", "goal",
    "group", "skills", "consumption", "hosting", "graduate", "b", "majors",
    "major", "session", "sessions", "articles", "article", "department",
    "departments", "year", "members", "field", "fields", "activity",
    "activities", "organization", "organizations", "today", "tomorrow",
    "events", "event", "outing", "outings", "idea", "ideas", "talks",
    "university", "universities",
]

# NLTK's English stop words, extended in one step with our own list
stop_words = set(stopwords.words('english'))
stop_words.update(trivial)

In [18]:
# Display the current contents of `nouns` (whatever the most recently run cell left in it)
nouns

['team',
 'students',
 'office',
 'information',
 'technology',
 'conceptualize',
 'eadvisor',
 'program',
 'help',
 'students',
 'map',
 'way',
 'ecosystem',
 'team',
 'students',
 'data',
 'programs',
 'resources',
 'objectives',
 'pathways',
 'program',
 'building',
 'master',
 'experience',
 'prototype',
 'decision',
 'engine',
 'students',
 'registration',
 'data',
 'secure',
 'environment',
 'access',
 'analytics',
 'tools',
 'opportunity',
 'exploration',
 'data',
 'consultation',
 'data',
 'analytics',
 'professionals']

In [22]:
# DukeGroups LDA method, need to be in sentence form
# Construct a document-term matrix, input needs to be double bracket
import string

from nltk.tokenize import word_tokenize

# Loop-invariant helpers, built once instead of on every iteration
table = str.maketrans('', '', string.punctuation)   # strips ASCII punctuation
is_noun = lambda pos: pos[:2] == 'NN'               # POS tag starts with 'NN'

# One list of nouns per description
dic = []

for x in range (0, 1): #len(desc)-1
#     singleDesc = desc[x]
    singleDesc = lines
    ### Clean text
    # split into words
    tokens = word_tokenize(singleDesc)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words using the extended `stop_words` set built earlier
    words = [w for w in words if w not in stop_words]
    # extract all nouns
    nouns = [word for (word, pos) in nltk.pos_tag(words) if is_noun(pos)]
    dic.append(nouns)
dictionary = corpora.Dictionary(dic)
corpus = [dictionary.doc2bow(t) for t in dic]

# Apply the LDA model.
# random_state fixes the seed so the printed topics are reproducible.
# NOTE: with a single document in `dic`, the 50 topics come out identical
# (see the printed output); the topic count only becomes meaningful once
# the loop runs over many descriptions.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=50,
    id2word=dictionary,
    passes=100,
    random_state=42,
)
print(lda_model.print_topics(num_topics = 50, num_words = 5))

[(0, '0.033*"professionals" + 0.033*"opportunity" + 0.033*"programs" + 0.033*"pathways" + 0.033*"registration"'), (1, '0.033*"professionals" + 0.033*"opportunity" + 0.033*"programs" + 0.033*"pathways" + 0.033*"registration"'), (2, '0.033*"professionals" + 0.033*"opportunity" + 0.033*"programs" + 0.033*"pathways" + 0.033*"registration"'), (3, '0.033*"professionals" + 0.033*"opportunity" + 0.033*"programs" + 0.033*"pathways" + 0.033*"registration"'), (4, '0.033*"professionals" + 0.033*"opportunity" + 0.033*"programs" + 0.033*"pathways" + 0.033*"registration"'), (5, '0.033*"professionals" + 0.033*"opportunity" + 0.033*"programs" + 0.033*"pathways" + 0.033*"registration"'), (6, '0.033*"professionals" + 0.033*"opportunity" + 0.033*"programs" + 0.033*"pathways" + 0.033*"registration"'), (7, '0.033*"professionals" + 0.033*"opportunity" + 0.033*"programs" + 0.033*"pathways" + 0.033*"registration"'), (8, '0.033*"professionals" + 0.033*"opportunity" + 0.033*"programs" + 0.033*"pathways" + 0.033*