In [1]:
from pandas import read_csv

#  load dataset
df = read_csv('train.csv')
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [2]:
from sklearn.model_selection import train_test_split
from numpy import arange

#  evaluating the performance of algorithm
train, test = train_test_split(df, test_size = 0.1)

#  indexing datasets starting from 0 (required for for loops)
test.index = arange(0, len(test))
train.index = arange(0, len(train))

train_abstract = train[["ABSTRACT"]]
test_abstract = test[["ABSTRACT"]]

#  renaming colums to make it simpler without space in names
test.columns = ['ID','TITLE','ABSTRACT','Computer', 'Physics', 'Mathematics', 'Statistics', 'Biology', 'Finance']



In [3]:
#  data cleaning and preprocessing

def preprocess_data(dataset):
    
    from re import sub
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize

    lemmatizer = WordNetLemmatizer()
    sn = SnowballStemmer(language = 'english')
    corpus = []
    cleaned_data = []
    
    #  function that returns POS tag to first character lemmatize() accepts
    def get_wordnet_pos(word):
        
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        
        return tag_dict.get(tag, wordnet.NOUN)
    
    for i in range(0, len(dataset)):
        
        review = sub('[^a-zA-Z]',' ',dataset.ABSTRACT[i])
       # review = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in review if w not in set(stopwords.words('english'))]
        review = ' '.join([sn.stem(word) for word in review.lower().split() if word not in set(stopwords.words('english'))])
        corpus.append(review)
    
    tokenize_data = [word_tokenize(corpus_item) for corpus_item in corpus]
    
    for j in range (0, len(tokenize_data)):
        temp = [word for word in tokenize_data[j] if len(word) > 1]
        cleaned_data.append(temp)
        temp = []
        
    return cleaned_data
    


In [4]:
train_cleaned = preprocess_data(train_abstract)
test_cleaned = preprocess_data(test_abstract)

In [5]:
from gensim import corpora
from gensim.corpora.dictionary import Dictionary

#  creating a dictionary
gensim_dictionary = corpora.Dictionary(train_cleaned)
gensim_corpus = [gensim_dictionary.doc2bow(token, allow_update = True) for token in train_cleaned]

In [6]:
#  lda model training
from gensim import models
from gensim.models.ldamodel import LdaModel

lda_model = LdaModel(gensim_corpus, num_topics = 6, id2word = gensim_dictionary, passes = 20)


In [7]:
#  print the top five words from each topic 
topics = lda_model.print_topics(num_words = 5)

for topic in topics:
    print(topic)

(0, '0.011*"use" + 0.010*"system" + 0.008*"data" + 0.007*"network" + 0.006*"model"')
(1, '0.023*"learn" + 0.020*"network" + 0.018*"model" + 0.013*"use" + 0.012*"train"')
(2, '0.008*"result" + 0.008*"space" + 0.008*"group" + 0.008*"prove" + 0.008*"show"')
(3, '0.011*"system" + 0.011*"state" + 0.009*"phase" + 0.008*"field" + 0.008*"energi"')
(4, '0.016*"algorithm" + 0.014*"method" + 0.014*"problem" + 0.013*"model" + 0.012*"estim"')
(5, '0.009*"observ" + 0.008*"mass" + 0.007*"use" + 0.006*"star" + 0.006*"high"')


In [8]:
#  function that returns an accuracy 
def get_accuracy(test_data):
    
    counter = 0;
    
    for i in range(0, len(test_data)):
        
        bow_test_doc = gensim_dictionary.doc2bow(test_data[i])
        lista = lda_model.get_document_topics(bow_test_doc)
        x, y = max(lista, key = lambda item:item[1])
        
        if x == 0:
            temp = test[["Computer"]].Computer[i]
        elif x == 1:
            temp = test[["Physics"]].Physics[i]
        elif x == 2:
            temp = test[["Mathematics"]].Mathematics[i]
        elif x == 3:
            temp = test[["Statistics"]].Statistics[i]
        elif x == 4:
            temp = test[["Biology"]].Biology[i]
        else:
            temp = test[["Finance"]].Finance[i]
        if temp == 1:
            counter = counter + 1;
    
    return (counter/len(test_data)*100) 
    

In [9]:
print("Accuracy of test set is: ", get_accuracy(test_cleaned))

Accuracy of test set is:  31.839847473784555


In [10]:
#  visualization
import pyLDAvis.gensim_models

lda_visualization = pyLDAvis.gensim_models.prepare(lda_model, gensim_corpus, gensim_dictionary, sort_topics = False)
pyLDAvis.display(lda_visualization)

  default_term_info = default_term_info.sort_values(


In [11]:
print("Proba za git")

Proba za git
