In [52]:
import pandas as pd
import numpy as np
import re
from string import punctuation
import nltk
from gensim import models, corpora

In [159]:
# Fetch the NLTK resources the cleaning functions below rely on:
#   'wordnet'   - backs nltk.stem.WordNetLemmatizer, used in lemmatize()
#   'stopwords' - backs nltk.corpus.stopwords, used in remove_stopwords()
# This cell must run before clean_texts() is called on a fresh kernel.
nltk.download('wordnet')
nltk.download('stopwords')

# Processing texts

In [123]:
# Separator line between consecutive records in the input file; load_texts()
# compares every stripped line against this exact string.
split_line = '-------------------'

def get_abstract(text):
    """Return the abstract part of one record.

    The record must contain the marker ``'Abstract:'`` exactly once. Everything
    after the marker is returned with surrounding whitespace stripped.

    Raises:
        AssertionError: if the marker is missing or occurs more than once.
    """
    pieces = text.split('Abstract:')
    # Exactly one marker <=> exactly two pieces (header, abstract body).
    assert len(pieces) == 2

    _header, body = pieces
    return body.strip()

def load_texts(path='HW1.txt', encoding=None):
    """Load the abstract of every record stored in a text file.

    The file is read line by line. Each line is stripped; blank lines are
    ignored, and a line equal to ``split_line`` ends the current record. The
    non-blank lines of a record are joined with newlines and passed to
    ``get_abstract``.

    Separator lines that do not close any content (a separator at the very
    start of the file, or several separators in a row) are skipped instead of
    producing an empty record.

    Args:
        path: Path of the file to read.
        encoding: Text encoding forwarded to ``open``. ``None`` (the default)
            keeps the platform default, as before.

    Returns:
        A list with one abstract string per non-empty record, in file order.

    Raises:
        Whatever ``get_abstract`` raises for a record it cannot parse.
    """
    texts = []
    record = []

    with open(path, 'r', encoding=encoding) as f:
        # Iterate lazily; there is no need to hold a second copy of the file.
        for line in f:
            line = line.strip()
            if line == split_line:
                # Only flush when something was collected, so stray or
                # repeated separators do not create empty records.
                if record:
                    texts.append(get_abstract("\n".join(record)))
                    record = []
            elif line:
                record.append(line)

    # The last record is not necessarily followed by a separator.
    if record:
        texts.append(get_abstract("\n".join(record)))

    return texts


def not_number(word):
    """Return True when ``word`` contains no ASCII digit (0-9).

    Note that despite the name this rejects any token with a digit anywhere
    in it (e.g. ``'abc1'``), not only purely numeric tokens.

    Args:
        word: The token to inspect.

    Returns:
        ``True`` if no character of ``word`` is in ``0-9``, else ``False``.
    """
    # A direct search gives the same answer as the former
    # ``re.split(r'.*[0-9].*', ...)`` length check, without the backtracking
    # that the leading ``.*`` causes on digit-free input.
    return re.search(r'[0-9]', word) is None


def remove_empty(words):
    """Filter a token list down to the tokens worth keeping.

    A token is kept when it is longer than four characters and
    ``not_number`` accepts it. Empty strings left over from splitting are
    dropped as a consequence of the length check.

    Returns a new list; the input is not modified.
    """
    return [token for token in words if len(token) > 4 and not_number(token)]


def word_split(texts):
    """Lower-case, tokenise and filter every text.

    Each text is lower-cased and stripped, then split on either
      * one punctuation character or newline, plus any spaces/tabs after it, or
      * a run of spaces/tabs.
    The resulting pieces are filtered through ``remove_empty``.

    Args:
        texts: Iterable of strings.

    Returns:
        A list with one token list per input text, in input order.
    """
    # re.escape keeps every punctuation character literal inside the class.
    # Interpolating ``punctuation`` raw let its backslash escape the following
    # ']' (so '\' was never a delimiter) and relied on ',-.' happening to be a
    # harmless range.
    delimiter = re.compile(f'[{re.escape(punctuation)}\n][ \t]*|[ \t]+')

    return [
        remove_empty(delimiter.split(text.lower().strip()))
        for text in texts
    ]

In [126]:
def lemmatize(texts):
    """Lemmatize every token of every document.

    A single ``nltk.stem.WordNetLemmatizer`` is created per call and its
    ``lemmatize`` method is applied, with default arguments, to each token.

    Args:
        texts: Iterable of token lists.

    Returns:
        A new list of token lists with the same shape as the input.
    """
    wordnet = nltk.stem.WordNetLemmatizer()
    return [[wordnet.lemmatize(token) for token in doc] for doc in texts]

def remove_stopwords(texts):
    """Drop every token that appears in NLTK's English stop-word list.

    Args:
        texts: Iterable of token lists.

    Returns:
        A new list of token lists; token order within each document is kept.
    """
    # Hash-based container so each membership test is a constant-time lookup.
    english_stopwords = frozenset(nltk.corpus.stopwords.words('english'))
    return [
        [token for token in doc if token not in english_stopwords]
        for doc in texts
    ]

def clean_texts(texts):
    """Run the full preprocessing pipeline on raw texts.

    Steps, in order: ``word_split`` (tokenise and filter), ``lemmatize``,
    then ``remove_stopwords``. Stop words are removed after lemmatization,
    so a token is compared against the stop list in its lemmatized form.

    Returns whatever ``remove_stopwords`` returns: one token list per text.
    """
    return remove_stopwords(lemmatize(word_split(texts)))

### Cleaning data

In [127]:
# Read the abstracts from the default input file and run them through the
# cleaning pipeline; `data` ends up as one token list per abstract.
data = clean_texts(load_texts())

In [128]:
# Flatten the per-document token lists into one list (duplicates kept).
all_words = [word for words in data for word in words]

# Distinct tokens; the order is whatever set iteration yields.
unique_words = list(set(all_words))

### Preparing tfidf

In [129]:
# Vocabulary built from the cleaned token lists.
dictionary = corpora.Dictionary(data)

# One bag-of-words vector per document.
corpus = list(map(dictionary.doc2bow, data))

# Fit TF-IDF on the bag-of-words corpus, then re-weight that same corpus.
tfidf_model = models.TfidfModel(corpus)
tfidf = tfidf_model[corpus]

# Modeling

### The model with the best results is LDA
First, let's try 4 topics.

In [155]:
# LDA training is stochastic; a fixed seed makes the topics reproducible
# across kernel restarts (Restart & Run All).
lda = models.LdaModel(tfidf, id2word=dictionary, num_topics=4, random_state=42)
# Show the two highest-weighted words of each topic.
topics = lda.print_topics(num_words=2)
topics

[(0, '0.003*"protein" + 0.003*"channel"'),
 (1, '0.002*"collagen" + 0.002*"duplicate"'),
 (2, '0.006*"spindle" + 0.006*"centrosome"'),
 (3, '0.004*"expression" + 0.004*"protein"')]

Next, let's try 3 topics.

In [156]:
# LDA training is stochastic; a fixed seed makes the topics reproducible
# across kernel restarts (Restart & Run All).
lda = models.LdaModel(tfidf, id2word=dictionary, num_topics=3, random_state=42)
# Show the two highest-weighted words of each topic.
topics = lda.print_topics(num_words=2)
topics

[(0, '0.002*"stress" + 0.002*"longevity"'),
 (1, '0.003*"spindle" + 0.003*"centrosome"'),
 (2, '0.004*"protein" + 0.003*"pathway"')]

Finally, let's try 2 topics.

In [158]:
# LDA training is stochastic; a fixed seed makes the topics reproducible
# across kernel restarts (Restart & Run All).
lda = models.LdaModel(tfidf, id2word=dictionary, num_topics=2, random_state=42)
# Show the two highest-weighted words of each topic.
topics = lda.print_topics(num_words=2)
topics

[(0, '0.003*"protein" + 0.003*"gene"'),
 (1, '0.002*"protein" + 0.002*"signaling"')]

It seems that the best number of topics is 3:
1. Something about life and health
2. General biology (chromosomes, etc.)
3. Sport and food