In [113]:
import pandas as pd
from nltk import sent_tokenize, word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
import copy
import logging
from gensim.models import doc2vec
from gensim import matutils
from numpy import dot, array
import logging
import cPickle as pickle
import cProfile

# Emit timestamped records at INFO level and above on the root logger
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [41]:
# Read the course catalog spreadsheet. An .xlsx workbook is binary data, so the
# handle is opened in binary mode; a text-mode handle can corrupt the bytes on
# platforms that translate line endings.
with open('data/catalog.xlsx', 'rb') as f:
	catalog = pd.read_excel(f) # type: pandas.core.frame.DataFrame
# Preview the first rows (a single parenthesised argument prints the same as the print statement)
print(catalog.head())

                                Title         Department  \
AFSC101      FOUNDATION OF THE USAF I  Air Force Science   
AFSC102     FOUNDATION OF THE USAF II  Air Force Science   
AFSC201      EVOLUTION OF AIR POWER I  Air Force Science   
AFSC202     EVOLUTION OF AIR POWER II  Air Force Science   
AFSC301  AIR FORCE LEADERSHIP STUDY I  Air Force Science   

                           Long Title  \
AFSC101      FOUNDATION OF THE USAF I   
AFSC102     FOUNDATION OF THE USAF II   
AFSC201      EVOLUTION OF AIR POWER I   
AFSC202     EVOLUTION OF AIR POWER II   
AFSC301  AIR FORCE LEADERSHIP STUDY I   

                                               Description  
AFSC101  Overall roles and missions of the USAF; career...  
AFSC102  Continuation of AFSC 101.  Course taught at th...  
AFSC201  Key historical events and milestones in the de...  
AFSC202  Continuation of AFSC 201.  Course taught at th...  
AFSC301  Leadership, management fundamentals, professio...  


In [42]:
# Replace every missing cell with an empty string
catalog = catalog.fillna(value='')

In [44]:
# Keep only the two text columns that serve as training material
text_columns = ['Long Title', 'Description']
catalog_train = catalog[text_columns]

In [None]:
def sentence_to_words(sentence, stop_word = False, stem = False):
	"""
	Split a sentence into word tokens, optionally dropping English stop words
	:param sentence: text to tokenize
	:param stop_word: when truthy, remove tokens found in NLTK's English stop word list
	:param stem: accepted for interface compatibility; no stemming is applied yet
	:return: list of word tokens
	"""
	# word_tokenize yields words; sent_tokenize would yield whole sentences,
	# which could never match an entry of the stop word list
	words = word_tokenize(sentence)
	
	if stop_word:
		# build a set once so each membership test is a hash lookup
		stop = set(stopwords.words('english'))
		words = [w for w in words if w not in stop]
	
	return words

In [57]:
# tag sentence
train = []
for index, row in catalog_train.iterrows():
	for content in row:
		sentences = sent_tokenize(content)
		for sentence in sentences:
			train.append((sentence, index))
print 'Number of sentences: ', len(train)
# tokenize sentence
count = 0
for idx in range(len(train)):
	sentence = train[idx][0]
	tag = train[idx][1]
	words = word_tokenize(sentence)
	count += len(words)
	train[idx] = (words, tag)
print 'Number of words: ', count

Number of sentences:  24405


Number of words:  345022


In [59]:
# Wrap each token list in a TaggedDocument labelled with the stringified tag
# stored next to it, and keep the bare token lists as well
sentences = [pair[0] for pair in train]
documents = [
	doc2vec.TaggedDocument(pair[0], [str(pair[1])])    # type: doc2vec.TaggedDocument
	for pair in train
]

In [60]:
# Hyper-parameters for the Doc2Vec model
default_alpha = 0.025
min_alpha = 0.0001
windows = 8

# Create the model and build its vocabulary from the tagged documents
model = doc2vec.Doc2Vec(
	size=300,
	min_count=3,
	window=windows,
	alpha=default_alpha,
	min_alpha=min_alpha
)
model.build_vocab(documents)

# Merge in the pre-trained GoogleNews word2vec vectors (binary format)
model.intersect_word2vec_format(
	'GoogleNews-vectors-negative300.bin',
	binary=True,
	lockf=1.0
)

In [73]:
# save model
# The serialized model is binary data, so the handle is opened with 'wb';
# a text-mode handle ('w') can mangle the bytes on platforms that translate newlines.
with open('pickle/unlockModel','wb') as f:
	model.save(f)

In [105]:
def train_epoch(m, epoch, corpus=None):
	"""
	Return a trained copy of a base model, leaving the base model itself untouched
	:param m: base doc2vec model; it is deep-copied and only the copy is trained
	:param epoch: number of training epochs, forwarded to ``train`` as ``epochs``
	:param corpus: sized collection of tagged documents to train on;
		defaults to the notebook-level ``documents`` list
	:type m: doc2vec.Doc2Vec
	:return: the trained copy
	"""
	# Check the type first so a wrong argument fails before the costly deep copy
	assert isinstance(m, doc2vec.Doc2Vec)
	
	if corpus is None:
		# Backward-compatible default: the corpus built earlier in the notebook
		corpus = documents
	
	# Named `trained` rather than `model` to avoid shadowing the notebook-level base model
	trained = copy.deepcopy(m)
	trained.train(corpus, total_examples=len(corpus), epochs=epoch)
	return trained

In [115]:
# Train a copy of the base model for 50 epochs
dvmodel = train_epoch(model, epoch=50)

2017-08-29 22:01:17,084 : INFO : training model with 3 workers on 8884 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=8


2017-08-29 22:01:18,103 : INFO : PROGRESS: at 1.53% examples, 202548 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:19,139 : INFO : PROGRESS: at 3.05% examples, 197470 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:20,168 : INFO : PROGRESS: at 4.46% examples, 191024 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:21,188 : INFO : PROGRESS: at 6.14% examples, 197441 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:22,208 : INFO : PROGRESS: at 7.55% examples, 195275 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:23,231 : INFO : PROGRESS: at 8.97% examples, 192730 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:24,237 : INFO : PROGRESS: at 10.54% examples, 194466 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:25,251 : INFO : PROGRESS: at 11.99% examples, 193684 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:26,275 : INFO : PROGRESS: at 13.58% examples, 195334 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:27,302 : INFO : PROGRESS: at 14.94% examples, 192956 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:28,352 : INFO : PROGRESS: at 16.56% examples, 193983 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:29,358 : INFO : PROGRESS: at 18.07% examples, 194214 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:30,391 : INFO : PROGRESS: at 19.67% examples, 195202 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:31,397 : INFO : PROGRESS: at 21.22% examples, 195897 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:32,463 : INFO : PROGRESS: at 22.98% examples, 197244 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:33,518 : INFO : PROGRESS: at 24.72% examples, 198556 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:34,568 : INFO : PROGRESS: at 26.46% examples, 199720 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:35,621 : INFO : PROGRESS: at 28.20% examples, 200703 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:36,672 : INFO : PROGRESS: at 29.93% examples, 201634 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:37,703 : INFO : PROGRESS: at 31.66% examples, 202654 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:38,748 : INFO : PROGRESS: at 33.38% examples, 203447 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:39,804 : INFO : PROGRESS: at 35.12% examples, 204097 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:40,860 : INFO : PROGRESS: at 36.89% examples, 204698 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:41,919 : INFO : PROGRESS: at 38.62% examples, 205203 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:42,958 : INFO : PROGRESS: at 40.36% examples, 205803 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:44,031 : INFO : PROGRESS: at 42.09% examples, 206099 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:45,056 : INFO : PROGRESS: at 43.77% examples, 206447 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:46,070 : INFO : PROGRESS: at 45.43% examples, 206856 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:47,073 : INFO : PROGRESS: at 47.06% examples, 207095 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:48,081 : INFO : PROGRESS: at 48.58% examples, 206777 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:49,102 : INFO : PROGRESS: at 50.26% examples, 207086 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:50,154 : INFO : PROGRESS: at 51.94% examples, 207187 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:51,211 : INFO : PROGRESS: at 53.66% examples, 207480 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:52,243 : INFO : PROGRESS: at 55.40% examples, 207917 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:53,299 : INFO : PROGRESS: at 57.12% examples, 208178 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:54,356 : INFO : PROGRESS: at 58.89% examples, 208453 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:55,408 : INFO : PROGRESS: at 60.62% examples, 208703 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:56,410 : INFO : PROGRESS: at 62.30% examples, 209013 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:57,432 : INFO : PROGRESS: at 63.93% examples, 209017 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:01:58,452 : INFO : PROGRESS: at 65.64% examples, 209391 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:01:59,465 : INFO : PROGRESS: at 67.32% examples, 209618 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:02:00,481 : INFO : PROGRESS: at 69.01% examples, 209825 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:02:01,503 : INFO : PROGRESS: at 70.64% examples, 209815 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:02:02,548 : INFO : PROGRESS: at 72.38% examples, 210025 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:02:03,566 : INFO : PROGRESS: at 73.94% examples, 209861 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:02:04,584 : INFO : PROGRESS: at 75.38% examples, 209385 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:02:05,613 : INFO : PROGRESS: at 77.05% examples, 209512 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:02:06,630 : INFO : PROGRESS: at 78.75% examples, 209696 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:02:07,652 : INFO : PROGRESS: at 80.37% examples, 209674 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:02:08,716 : INFO : PROGRESS: at 82.11% examples, 209793 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:02:09,722 : INFO : PROGRESS: at 83.78% examples, 209983 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:02:10,751 : INFO : PROGRESS: at 85.48% examples, 210220 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:02:11,768 : INFO : PROGRESS: at 87.12% examples, 210235 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:02:12,778 : INFO : PROGRESS: at 88.83% examples, 210412 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:02:13,784 : INFO : PROGRESS: at 90.44% examples, 210446 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:02:14,800 : INFO : PROGRESS: at 92.07% examples, 210440 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:02:15,809 : INFO : PROGRESS: at 93.67% examples, 210461 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:02:16,839 : INFO : PROGRESS: at 95.41% examples, 210670 words/s, in_qsize 5, out_qsize 0


2017-08-29 22:02:17,914 : INFO : PROGRESS: at 97.14% examples, 210719 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:02:18,974 : INFO : PROGRESS: at 98.91% examples, 210825 words/s, in_qsize 6, out_qsize 0


2017-08-29 22:02:19,538 : INFO : worker thread finished; awaiting finish of 2 more threads


2017-08-29 22:02:19,602 : INFO : worker thread finished; awaiting finish of 1 more threads


2017-08-29 22:02:19,606 : INFO : worker thread finished; awaiting finish of 0 more threads


2017-08-29 22:02:19,607 : INFO : training on 17251100 raw words (13191057 effective words) took 62.5s, 210993 effective words/s


In [108]:
def predict_course(model, query_list, n = 5):
	"""
	Given a pretrained model and a query, return the top-n related courses with their similarity
	:param model: a doc2vec model
	:param query_list: query words, either a whitespace-separated string or a list of words
	:param n: get top-n related courses; default n = 5
	:type model: doc2vec.Doc2Vec
	:type query_list: str or list
	:return: [(tag, cosine_similarity)]
	:raises KeyError: if the query contains no words; the word lookup on the model
		presumably also raises KeyError for out-of-vocabulary words (confirm against gensim)
	"""
	# A string query is split on whitespace; anything else is taken as an iterable of words.
	# Duck-typing on `.split` covers both byte and unicode strings.
	if hasattr(query_list, 'split'):
		# split() with no argument collapses repeated whitespace, so no empty-string tokens
		words = query_list.split()
	else:
		words = list(query_list)
	
	if not words:
		raise KeyError('query contains no words')
	
	# Sum the word vectors of the query and scale the sum to unit length
	word_vectors = [model[word] for word in words]
	sum_vec = matutils.unitvec(array(word_vectors).sum(axis=0))
	
	# Rank document tags by similarity to the query vector
	return model.docvecs.most_similar([sum_vec], topn = n)

In [118]:
# Show the nearest course tags and the nearest vocabulary words for the same query
query = 'computer science'
print(predict_course(dvmodel, query))
print(dvmodel.most_similar(query.split()))

[('HIST233', 0.27921849489212036), ('HIST533', 0.2791692018508911), ('STAT281', 0.23940983414649963), ('MUSI605', 0.23683518171310425), ('PSYC581', 0.2252344787120819)]
[(u'simulations', 0.40995684266090393), (u'physics', 0.3936920464038849), (u'biology', 0.3717336356639862), (u'hands-on', 0.3641414940357208), (u'mathematics', 0.36155372858047485), (u'laboratory', 0.3612132966518402), (u'engineering', 0.3518570065498352), (u'finance', 0.35036101937294006), (u'useful', 0.34551703929901123), (u'electronics', 0.34456297755241394)]
