# Part 1: Code For LDA on 10 patent documents

In [1]:
# nltk package is used for basic natural language processing operations such as removing stopwords, tokenizing words, stemming and
# lemmatization.

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

# gensim package is applied for LDA modelling and creation of term frequency - inverse document matrices
from gensim import corpora 
from gensim.models import ldamodel
from gensim.models import tfidfmodel 

word_list = []

# selecting the 10 patent documents 
for i in range(6334220,6334229):
    string = str(i)
    paragraph = open(string +'.txt','r')
    
    for text in paragraph:
        structure = text
        
    # Creating Tokens using word_tokenize function in nltk library
    mysentencetokens_sw= nltk.word_tokenize(str(structure))

    # normazlizing everything to lower case
    looper = 0
    for token in mysentencetokens_sw:
        mysentencetokens_sw[looper] = token.lower()
        looper += 1

    # removing stopwords using nltk library with anything less than 2 letters
    minlength = 2
    mysentencetokens = [token for token in mysentencetokens_sw if 
                    (not token in stopwords.words('english')) 
                    and len(token) > minlength]
    
    #Transforming the words into the stem forms using PorterStemmer function from nltk library
    porter = nltk.PorterStemmer()
    looper = 0
    for token in mysentencetokens:
        mysentencetokens[looper] = porter.stem(token)
        looper += 1
    
    #lemmatizing the words so that they are known words from the dictionary using the WordNetLemmatizer function
    lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
    looper = 0
    for token in mysentencetokens:
        mysentencetokens[looper] = lmtzr.lemmatize(token)
        looper += 1
        word_list.append(mysentencetokens)

#creating a Dictionary and Corpus from the 10 patent documents
dictionary = corpora.Dictionary(word_list) 
corpus = [dictionary.doc2bow(words) for words in word_list] 

# creating the term document frequency matrix to obtain distribution of words in a topic across the documents in the corpus
term_document_frequency_matrix = tfidfmodel.TfidfModel(corpus) 
new_corpus = term_document_frequency_matrix[corpus] 

# applying the LDA model, setting the number of topics to 5 and printing top 10 words on each topic 
# (Note : default value is top ten words for show_topics command)
find_topics = ldamodel.LdaModel(new_corpus, id2word=dictionary, num_topics=5, passes = 20) 
topics = find_topics.show_topics(formatted = True) 

print "Top ten words and their frequencies" 
print "Topic 1 :" 
print topics[0]
print

print "Topic 2 :" 
print topics[1]
print

print "Topic 3 :" 
print topics[2]
print

print "Topic 4 :" 
print topics[3]
print

print "Topic 5 :"
print topics[4]



Top ten words and their frequencies
Topic 1 :
0.063*protector + 0.050*strap + 0.036*section + 0.035*first + 0.032*nozzl + 0.032*rotari + 0.028*second + 0.027*jet + 0.026*assembl + 0.020*whirlpool

Topic 2 :
0.078*cm/ + 0.068*accord + 0.068*measur + 0.057*rate + 0.052*method + 0.046*flow + 0.042*sleep + 0.042*airflow + 0.039*air + 0.037*area

Topic 3 :
0.109*bag + 0.085*sleep + 0.050*part + 0.050*bodi + 0.039*leg + 0.035*main + 0.035*foot + 0.032*substanti + 0.024*user + 0.022*drawstr

Topic 4 :
0.100*edg + 0.058*perineum + 0.050*protect + 0.050*devic + 0.049*gener + 0.045*panel + 0.042*inch + 0.039*first + 0.035*second + 0.034*loop

Topic 5 :
0.056*member + 0.044*faucet + 0.036*tighten + 0.036*support + 0.031*bathtub + 0.026*seat + 0.025*plural + 0.024*defin + 0.023*base + 0.021*connector


#Output of above code (Part 1)

Top ten words and their frequencies
Topic 1 :
0.063*protector + 0.050*strap + 0.036*section + 0.035*first + 0.032*nozzl + 0.032*rotari + 0.028*second + 0.027*jet + 0.026*assembl + 0.020*whirlpool

Topic 2 :
0.078*cm/ + 0.068*accord + 0.068*measur + 0.057*rate + 0.052*method + 0.046*flow + 0.042*sleep + 0.042*airflow + 0.039*air + 0.037*area

Topic 3 :
0.109*bag + 0.085*sleep + 0.050*part + 0.050*bodi + 0.039*leg + 0.035*main + 0.035*foot + 0.032*substanti + 0.024*user + 0.022*drawstr

Topic 4 :
0.100*edg + 0.058*perineum + 0.050*protect + 0.050*devic + 0.049*gener + 0.045*panel + 0.042*inch + 0.039*first + 0.035*second + 0.034*loop

Topic 5 :
0.056*member + 0.044*faucet + 0.036*tighten + 0.036*support + 0.031*bathtub + 0.026*seat + 0.025*plural + 0.024*defin + 0.023*base + 0.021*connector

# Part 2: Creation of toy data set

Documents labelled "doc1.txt", "doc2.txt" and "doc3.txt" are based on nanotechnology and related subjects.                     
Documents labelled "doc4.txt" and "doc5.txt" are based on datascience and databases. 

Doc1

Materials science, also commonly known as materials science and engineering, is an interdisciplinary field which deals with the discovery and design of new materials. Though it is a relatively new scientific field that involves studying materials through the materials paradigm (synthesis, structure, properties and performance), its intellectual origins reach back to the emerging fields of chemistry, mineralogy and engineering during the Enlightenment. It incorporates elements of physics and chemistry, and is at the forefront of nanoscience and nanotechnology research. In recent years, materials science has become more widely known as a specific field of science and engineering.

Doc2

Nanoparticles are particles between 1 and 100 nanometers in size. In nanotechnology, a particle is defined as a small object that behaves as a whole unit with respect to its transport and properties.  Particles are further classified according to diameter. Nanoparticles are of great scientific interest as they are, in effect, a bridge between bulk materials and atomic or molecular structures. A bulk material should have constant physical properties regardless of its size, but at the nano-scale size-dependent properties are often observed. 

Doc3

Nanotechnology as defined by size is naturally very broad, including fields of science as diverse as surface science, organic chemistry, molecular biology, semiconductor physics, microfabrication, etc. The associated research and applications are equally diverse, ranging from extensions of conventional device physics to completely new approaches based upon molecular self-assembly, from developing new materials with dimensions on the nanoscale to direct control of matter on the atomic scale.

Doc4

In general terms, Data Science is the extraction of knowledge from data. It employs techniques and theories drawn from many fields within the broad areas of mathematics, statistics, information theory and information technology, including signal processing, probability models, machine learning, statistical learning, computer programming, data engineering, pattern recognition and learning, visualization, predictive analytics, uncertainty modeling, data warehousing, data compression and high performance computing. Methods that scale to Big Data are of particular interest in data science, although the discipline is not generally considered to be restricted to such data.

Doc5

Formally, a "database" refers to a set of related data and the way it is organized. Access to this data is usually provided by a "database management system" (DBMS) consisting of an integrated set of computer software that allows users to interact with one or more databases and provides access to all of the data contained in the database (although restrictions may exist that limit access to particular data). The DBMS provides various functions that allow entry, storage and retrieval of large quantities of information as well as provides ways to manage how that information is organized.


# Part 3 (A): LDA on toy Data set 

In [5]:
# nltk package is used for basic natural language processing operations such as removing stopwords, tokenizing words, stemming and
# lemmatization.

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

# gensim package is applied for LDA modelling and creation of term frequency - inverse document matrices
from gensim import corpora 
from gensim.models import ldamodel
from gensim.models import tfidfmodel 

word_list_toy = []

# selecting the 5 created documents
for i in range(1,5):
    string = str(i)
    paragraph = open('doc' + string +'.txt','r')
    
    for text in paragraph:
        structure = text
        
    # Creating Tokens using word_tokenize function in nltk library
    mysentencetokens_sw= nltk.word_tokenize(str(structure))

    # normazlizing everything to lower case
    looper = 0
    for token in mysentencetokens_sw:
        mysentencetokens_sw[looper] = token.lower()
        looper += 1

    # removing stopwords using nltk library with anything less than 2 letters
    minlength = 2
    mysentencetokens = [token for token in mysentencetokens_sw if 
                    (not token in stopwords.words('english')) 
                    and len(token) > minlength]
    
    #Transforming the words into the stem forms using PorterStemmer function from nltk library
    porter = nltk.PorterStemmer()
    looper = 0
    for token in mysentencetokens:
        mysentencetokens[looper] = porter.stem(token)
        looper += 1
    
    #lemmatizing the words so that they are known words from the dictionary using the WordNetLemmatizer function
    lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
    looper = 0
    for token in mysentencetokens:
        mysentencetokens[looper] = lmtzr.lemmatize(token)
        looper += 1
        word_list_toy.append(mysentencetokens)

#creating a Dictionary and Corpus from the 10 patent documents
dictionary = corpora.Dictionary(word_list_toy) 
corpus = [dictionary.doc2bow(words) for words in word_list_toy] 

# creating the term document frequency matrix to obtain distribution of words in a topic across the documents in the corpus
term_document_frequency_matrix = tfidfmodel.TfidfModel(corpus) 
new_corpus = term_document_frequency_matrix[corpus] 

# applying the LDA model, setting the number of topics to 5 and printing top 10 words on each topic 
# (Note : default value is top ten words for show_topics command)
find_topics = ldamodel.LdaModel(new_corpus, id2word=dictionary, num_topics=5, passes = 20) 
topics = find_topics.show_topics(formatted = True) 

print dictionary

print "Top ten words and their frequencies" 
print "Topic 1 :" 
print topics[0]
print

print "Topic 2 :" 
print topics[1]
print

print "Topic 3 :" 
print topics[2]
print

print "Topic 4 :" 
print topics[3]
print

print "Topic 5 :"
print topics[4]

Dictionary(140 unique tokens: [u'particular', u'size-depend', u'comput', u'discoveri', u'enlighten']...)
Top ten words and their frequencies
Topic 1 :
0.126*data + 0.048*learn + 0.032*inform + 0.032*theori + 0.032*model + 0.032*gener + 0.032*statist + 0.032*comput + 0.016*within + 0.016*drawn

Topic 2 :
0.007*field + 0.007*scienc + 0.007*nanotechnolog + 0.007*physic + 0.007*materi + 0.007*perform + 0.007*engin + 0.007*broad + 0.007*includ + 0.007*scale

Topic 3 :
0.078*particl + 0.053*bulk + 0.053*nanoparticl + 0.037*properti + 0.029*size + 0.027*unit + 0.027*whole + 0.027*transport + 0.027*accord + 0.027*diamet

Topic 4 :
0.047*known + 0.040*materi + 0.030*engin + 0.026*chemistri + 0.026*new + 0.024*year + 0.024*wide + 0.024*mineralog + 0.024*nanosci + 0.024*paradigm

Topic 5 :
0.053*diver + 0.031*molecular + 0.027*equal + 0.027*devic + 0.027*semiconductor + 0.027*self-assembl + 0.027*rang + 0.027*organ + 0.027*natur + 0.027*nanoscal


# Output of above code (Part 3A)

Dictionary(140 unique tokens: [u'particular', u'size-depend', u'comput', u'discoveri', u'enlighten']...)

Top ten words and their frequencies

Topic 1 :
0.126*data + 0.048*learn + 0.032*inform + 0.032*theori + 0.032*model + 0.032*gener + 0.032*statist + 0.032*comput + 0.016*within + 0.016*drawn

Topic 2 :
0.007*field + 0.007*scienc + 0.007*nanotechnolog + 0.007*physic + 0.007*materi + 0.007*perform + 0.007*engin + 0.007*broad + 0.007*includ + 0.007*scale

Topic 3 :
0.078*particl + 0.053*bulk + 0.053*nanoparticl + 0.037*properti + 0.029*size + 0.027*unit + 0.027*whole + 0.027*transport + 0.027*accord + 0.027*diamet

Topic 4 :
0.047*known + 0.040*materi + 0.030*engin + 0.026*chemistri + 0.026*new + 0.024*year + 0.024*wide + 0.024*mineralog + 0.024*nanosci + 0.024*paradigm

Topic 5 :
0.053*diver + 0.031*molecular + 0.027*equal + 0.027*devic + 0.027*semiconductor + 0.027*self-assembl + 0.027*rang + 0.027*organ + 0.027*natur + 0.027*nanoscal

# Part 3B : Calculation of the jargon distance between documents using the method in the Jargon Paper

### 3B.1 : Creation of corpus and dictionary for group 1 - NanoScience Group

In [6]:
# creating the corpus for the nanoscience_group 
word_list_nano = []
for i in range(1,3):
    string = str(i)
    paragraph = open('doc' + string +'.txt','r')
    for text in paragraph:
        structure = text
    # Creating Tokens using word_tokenize function in nltk library
    mysentencetokens_sw= nltk.word_tokenize(str(structure))
    # normazlizing everything to lower case
    looper = 0
    for token in mysentencetokens_sw:
        mysentencetokens_sw[looper] = token.lower()
        looper += 1
    # removing stopwords using nltk library with anything less than 2 letters
    minlength = 2
    mysentencetokens = [token for token in mysentencetokens_sw if 
                    (not token in stopwords.words('english')) 
                    and len(token) > minlength]
    #Transforming the words into the stem forms using PorterStemmer function from nltk library
    porter = nltk.PorterStemmer()
    looper = 0
    for token in mysentencetokens:
        mysentencetokens[looper] = porter.stem(token)
        looper += 1
    #lemmatizing the words so that they are known words from the dictionary using the WordNetLemmatizer function
    lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
    looper = 0
    for token in mysentencetokens:
        mysentencetokens[looper] = lmtzr.lemmatize(token)
        looper += 1
        word_list_nano.append(mysentencetokens)
        
#creating a Dictionary and Corpus from the nanoscience_group 
nano_dictionary = corpora.Dictionary(word_list_nano) 
nano_corpus = [dictionary.doc2bow(words) for words in word_list_nano] 

print nano_dictionary

Dictionary(71 unique tokens: [u'origin', u'diamet', u'properti', u'constant', u'often']...)


## Output of above short code 3B.1

Dictionary(71 unique tokens: [u'origin', u'diamet', u'properti', u'constant', u'often']...)

### 3B.2: Creation of Corpus and Dictionary group 2 - DataScience Group

In [7]:
# creating the corpus for the datascience_group 
word_list_data = []
for i in range(4,5):
    string = str(i)
    paragraph = open('doc' + string +'.txt','r')
    for text in paragraph:
        structure = text
    # Creating Tokens using word_tokenize function in nltk library
    mysentencetokens_sw= nltk.word_tokenize(str(structure))
    # normazlizing everything to lower case
    looper = 0
    for token in mysentencetokens_sw:
        mysentencetokens_sw[looper] = token.lower()
        looper += 1
    # removing stopwords using nltk library with anything less than 2 letters
    minlength = 2
    mysentencetokens = [token for token in mysentencetokens_sw if 
                    (not token in stopwords.words('english')) 
                    and len(token) > minlength]
    #Transforming the words into the stem forms using PorterStemmer function from nltk library
    porter = nltk.PorterStemmer()
    looper = 0
    for token in mysentencetokens:
        mysentencetokens[looper] = porter.stem(token)
        looper += 1
    #lemmatizing the words so that they are known words from the dictionary using the WordNetLemmatizer function
    lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
    looper = 0
    for token in mysentencetokens:
        mysentencetokens[looper] = lmtzr.lemmatize(token)
        looper += 1
        word_list_data.append(mysentencetokens)
        
#creating a Dictionary and Corpus from the nanoscience_group 
data_dictionary = corpora.Dictionary(word_list_data) 
data_corpus = [dictionary.doc2bow(words) for words in word_list_data] 

print data_dictionary


Dictionary(48 unique tokens: [u'program', u'comput', u'although', u'predict', u'knowledg']...)


## Output of above short code 3B.2 

Dictionary(48 unique tokens: [u'program', u'comput', u'although', u'predict', u'knowledg']...)

### 3B.3 Obtaining the frequency Distributions in DataScience group, NanoScience Group and Total group 

In [8]:
from nltk import FreqDist 

# extracting all the words in nanodictionary with related frequencies
word_list_nano_final = []
word_list_nano_final = word_list_nano[1]
word_list_nano_final.extend(word_list_nano[50])
word_list_nano_final.extend(word_list_nano[100])

# extracting all the words in datadictionary with related frequencies
word_list_data_final = []
word_list_data_final = word_list_data[1]
word_list_data_final.extend(word_list_data[25])


#extracting total words from all dictionaries
word_list_final = []
word_list_final = word_list_toy[1]
word_list_final.extend(word_list_toy[20])
word_list_final.extend(word_list_toy[50])
word_list_final.extend(word_list_toy[100])
word_list_final.extend(word_list_toy[150])
word_list_final.extend(word_list_toy[210])

wordfrequency_nano = nltk.FreqDist(word_list_nano_final) # Calculate the frequency of terms in the nano 
wordfrequency_data = nltk.FreqDist(word_list_data_final) # Calculate the frequency of terms in the data 
wordfrequency_total = nltk.FreqDist(word_list_final)

wordfrequency_nano, wordfrequency_data, wordfrequency_total


(FreqDist({u'materi': 14, u'scienc': 8, u'field': 8, u'engin': 6, u'properti': 5, u'chemistri': 4, u'new': 4, u'known': 4, u'particl': 3, u'nanotechnolog': 3, ...}),
 FreqDist({u'data': 16, u'learn': 6, u'comput': 4, u'scienc': 4, u'gener': 4, u'theori': 4, u'inform': 4, u'statist': 4, u'model': 4, u'program': 2, ...}),
 FreqDist({u'materi': 26, u'scienc': 20, u'field': 18, u'data': 16, u'engin': 14, u'new': 8, u'known': 8, u'chemistri': 8, u'properti': 7, u'learn': 6, ...}))

## Output of above short code 3B.3 
(FreqDist({u'materi': 14, u'scienc': 8, u'field': 8, u'engin': 6, u'properti': 5, u'chemistri': 4, u'new': 4, u'known': 4, u'particl': 3, u'nanotechnolog': 3, ...}),
 FreqDist({u'data': 16, u'learn': 6, u'comput': 4, u'scienc': 4, u'gener': 4, u'theori': 4, u'inform': 4, u'statist': 4, u'model': 4, u'program': 2, ...}),
 FreqDist({u'materi': 26, u'scienc': 20, u'field': 18, u'data': 16, u'engin': 14, u'new': 8, u'known': 8, u'chemistri': 8, u'properti': 7, u'learn': 6, ...}))

### 3B.4 : Implementation of Jargon Distance calculations and printing the results

In [9]:
import math

alpha = 0.01
Psi = {} 
Psj = {}
for i, j in wordfrequency_total.items(): 
    for k, l in wordfrequency_nano.items(): 
        if i in wordfrequency_nano.keys(): 
            Psi[i] = (1 - alpha) * (float(l)/len(word_list_nano)) + alpha * (float(j)/len(word_list_toy))
        else: 
            Psi[i] = alpha * (float(j)/len(word_list_toy))

for m, n in wordfrequency_total.items():
    for o, p in wordfrequency_data.items(): 
        if m in wordfrequency_data.keys():             
            Psj[m] = (1 - alpha) * (float(p)/len(word_list_data)) + alpha * (float(n)/len(word_list_toy))
        else: 
            Psj[m] = alpha * (float(n)/len(word_list_toy))

# Shannon Entropy function
def Shannon_Entropy(H):
    E = 0
    for word in H: 
        E += - word*math.log(word, 2)
    return E 

# Cross entropy function
def Cross_entropy(Pi, Pj):
    CE = 0 
    for ki, pi in Pi: 
        for kj, pj in Pj: 
            if ki == kj: 
                CE += - pi * math.log(pj, 2)
    return CE 

# Shannon Entropy of writer nanoscience and reader DataScience
H_nanoscience = Shannon_Entropy(Psi.values())
print 'Shannon Entropy(nanoscience) : ' + str(H_nanoscience)

# Cross Entropy of writer nanoscience and reader datascience
Cross_entropy = Cross_entropy(Psi.items(), Psj.items())
print 'Cross entropy (nanoscience as writer) : ' + str(Cross_entropy)

# Efficiency of Communication
Efficiency = H_nanoscience / Cross_entropy
print 'Efficiency of communication : ' + str(Efficiency)

# Cultural Hole between two topics
Cultural_hole = 1 - Efficiency
print 'Cultural Hole (Jargon Distance): ' + str(Cultural_hole)

Shannon Entropy(nanoscience) : 4.78415926774
Cross entropy (nanoscience as writer) : 8.84321150814
Efficiency of communication : 0.540997946656
Cultural Hole (Jargon Distance): 0.459002053344


## Final output from jargon distance calculations 3B.4
Shannon Entropy(nanoscience) : 4.78415926774
Cross entropy (nanoscience as writer) : 8.84321150814
Efficiency of communication : 0.540997946656
Cultural Hole (Jargon Distance): 0.459002053344
