# Article Analysis

In [1]:
# Show each matching warning only the first time it occurs, so that repeated
# library warnings do not flood the outputs of the cells further down.
import warnings 

# NOTE: this modifies the interpreter-wide warning filters, i.e. it affects
# every cell executed afterwards in the same kernel session.
warnings.filterwarnings(action='once')

In [2]:
# load file
import pandas as pd
import os

# A notebook kernel has no real __file__, so the CSV is located relative to
# the kernel's working directory (the folder the notebook is run from).
current_dir = os.path.realpath(os.getcwd())
filepath = os.path.join(current_dir, 'csv', 'needed_article_titles_and_contents.csv')

# The file is ';'-separated; `names` supplies the column labels explicitly,
# so pandas reads the first line of the file as data rather than as a header.
df = pd.read_csv(filepath, sep=";", names=["title", "content"])
df.head()

Unnamed: 0,title,content
0,Callianira hexagona,Callianira hexagona is a species of Ctenophora...
1,Ceratomyxa hooperi,Ceratomyxa hooperi is a myxosporean parasite t...
2,Bee removal,Bee removal is the process of removing bees fr...
3,Petalonamae,The Petalonamae are a proposed extinct group o...
4,Knacker,"A Knacker (), Knackerman or Knacker Man, is a ..."


#### Note
If you see the following window for the first time, please select "all" from the list and click "download". This may take a while. On subsequent runs you can simply close the window immediately. 

In [3]:
# Load NLTK and fetch its data packages.
# Called without arguments, nltk.download() opens NLTK's interactive
# downloader (the window described in the note above). The later cells use
# NLTK's tokenizers, POS tagger, WordNet, stop-word list and NE chunker, all
# of which rely on data obtained through this downloader.
import nltk
nltk.download()

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [None]:
# tokenize documents
from nltk.tokenize import word_tokenize
import numpy as np

documents = df["content"].values

def tokenize(documents):
    """Split every document into a list of word tokens.

    Parameters
    ----------
    documents : sequence of str
        Raw article texts.

    Returns
    -------
    numpy.ndarray
        One-dimensional object array holding one token list per document.
    """
    token_lists = [nltk.word_tokenize(document) for document in documents]
    # Documents differ in length, so the token lists are ragged. Recent NumPy
    # releases refuse to build an array from ragged nested lists implicitly,
    # therefore a pre-allocated object array is filled element by element.
    tokenized = np.empty(len(token_lists), dtype=object)
    for index, token_list in enumerate(token_lists):
        tokenized[index] = token_list
    return tokenized

tokens = tokenize(documents)
print(tokens[0][:10])

['Callianira', 'hexagona', 'is', 'a', 'species', 'of', 'Ctenophora', 'from', 'the', 'Mertensiidae']


In [None]:
# apply pos tagging
def tag_pos(documents):
    """Attach a part-of-speech tag to every token of every document.

    Parameters
    ----------
    documents : sequence of list of str
        Tokenised documents, one token list per document.

    Returns
    -------
    numpy.ndarray
        One-dimensional object array; element ``i`` is the list of
        ``(token, tag)`` pairs produced by ``nltk.pos_tag`` for document ``i``.
    """
    tagged_lists = [nltk.pos_tag(document) for document in documents]
    # The per-document lists have different lengths; fill an object array
    # explicitly because NumPy no longer builds ragged arrays implicitly.
    tagged = np.empty(len(tagged_lists), dtype=object)
    for index, tagged_list in enumerate(tagged_lists):
        tagged[index] = tagged_list
    return tagged

pos_tags = tag_pos(tokens)
print(pos_tags[0][:10])

In [None]:
# filter nouns
def filter_nouns(documents):
    """Keep only the nouns of every POS-tagged document, lower-cased.

    A token counts as a noun when its tag starts with ``"N"``.

    Parameters
    ----------
    documents : sequence of sequence of (str, str)
        POS-tagged documents given as ``(token, tag)`` pairs.

    Returns
    -------
    numpy.ndarray
        One-dimensional object array with one list of lower-cased nouns per
        document.
    """
    noun_lists = [
        [pair[0].lower() for pair in document if pair[1].startswith("N")]
        for document in documents
    ]
    # The noun lists are ragged; NumPy >= 1.24 raises on implicit ragged array
    # creation, so the object array is filled one document at a time. This
    # also guarantees one list per document when all lengths happen to match.
    nouns_per_document = np.empty(len(noun_lists), dtype=object)
    for index, noun_list in enumerate(noun_lists):
        nouns_per_document[index] = noun_list
    return nouns_per_document

# Reduce every tagged document to its lower-cased nouns and print the nouns
# of the first document as a quick sanity check.
nouns = filter_nouns(pos_tags)    
print(nouns[0])

In [None]:
# lemmatization
from nltk.corpus import wordnet as wn

def lemmatize(documents):
    """Replace every noun by its WordNet base form, dropping unknown words.

    Parameters
    ----------
    documents : sequence of sequence of str
        One collection of (lower-cased) nouns per document.

    Returns
    -------
    numpy.ndarray
        One-dimensional object array with one list of lemmas per document.
        Words for which ``wn.morphy`` returns ``None`` are left out.
    """
    lemma_lists = []
    for document in documents:
        # Look every noun up only once instead of once for the test and once
        # more for the value.
        candidates = (wn.morphy(noun) for noun in document)
        lemma_lists.append([lemma for lemma in candidates if lemma is not None])
    # Ragged result: fill an object array explicitly, because recent NumPy
    # releases raise when asked to build an array from ragged nested lists.
    lemmas_per_document = np.empty(len(lemma_lists), dtype=object)
    for index, lemma_list in enumerate(lemma_lists):
        lemmas_per_document[index] = lemma_list
    return lemmas_per_document

lemmas = lemmatize(nouns)
print(lemmas[0]) 
    
def map_frequencies(documents):
    """Count how often each word occurs across all documents.

    Parameters
    ----------
    documents : iterable of iterable
        One collection of words per document.

    Returns
    -------
    dict
        Maps every word to its total number of occurrences; keys appear in
        the order in which the words were first seen.
    """
    counts = {}
    for words in documents:
        for token in words:
            # Unseen words start from zero.
            counts[token] = counts.get(token, 0) + 1
    return counts

# Commented-out NumPy-based alternative (not executed):
# lemma_count = dict(zip(np.unique(np.hstack(lemmas).flatten(), return_counts = True)))
# Total number of occurrences of every lemma over all documents.
lemma_count = map_frequencies(lemmas)
# NOTE(review): this prints the complete frequency dictionary, which can
# produce a very long cell output.
print(lemma_count)

## Hypothesis 1: Most frequent nouns
We expect words like "animal", "behavior", "habitat" and "diet" to be the most frequent words. 

In [None]:
# visualize distribution
import matplotlib.pyplot as plt
%matplotlib inline

def plot_frequency_band(counts, title, lower, upper=None):
    """Draw a bar chart of the entries of `counts` whose value lies in a band.

    Parameters
    ----------
    counts : dict
        Maps a label to its frequency.
    title : str
        Title shown above the chart.
    lower : int
        Exclusive lower bound; only values greater than this are plotted.
    upper : int, optional
        Exclusive upper bound; ``None`` (the default) applies no upper limit.

    Returns
    -------
    dict
        The plotted entries, ordered from least to most frequent.
    """
    band = {
        label: count
        for label, count in counts.items()
        if count > lower and (upper is None or count < upper)
    }
    ordered = dict(sorted(band.items(), key=lambda item: item[1]))

    plt.figure(figsize=(20, 8))
    plt.title(title, fontdict={'fontsize': 16})
    plt.bar(list(ordered.keys()), list(ordered.values()))
    plt.xticks(rotation=45)
    plt.show()
    return ordered


# All nouns that occur more than 800 times.
sorted_lemma = plot_frequency_band(lemma_count, "Most frequent nouns", lower=800)

# Same chart with an additional upper cut-off of 4000 occurrences, intended to
# drop the dominant bars named in the title so the remaining nouns are readable.
sorted_lemma_n = plot_frequency_band(
    lemma_count, "Without \"animal\" and \"species\"", lower=800, upper=4000
)

## Evaluation of Hypothesis 1
We were correct that "animal" is among the most frequent nouns, but we missed "species", the second most frequent noun. Many animal names such as "bird", "dog" and "fish" are among the most frequent nouns as well. "diet" wasn't among the top words, but a similar one ("food") is. As we guessed, "behavior" is also one of the top words; "population" would have been another plausible guess that we should have made. 

In [None]:
# filter stopwords
def filter_stopwords(documents):
    """Lemmatise every token and drop stop words and very short lemmas.

    A token is kept (as its lemma) only if ``wn.morphy`` returns a base form
    for it, that base form is not an English stop word, and it is longer than
    two characters.

    Parameters
    ----------
    documents : sequence of sequence of str
        Tokenised documents.

    Returns
    -------
    numpy.ndarray
        One-dimensional object array with one list of kept lemmas per document.
    """
    en_stop = set(nltk.corpus.stopwords.words('english'))
    kept_per_document = []
    for document in documents:
        kept = []
        for word in document:
            lemma = wn.morphy(word)
            # Check for a missing lemma first, then for stop words and
            # lemmas of at most two characters.
            if lemma is None or lemma in en_stop or len(lemma) <= 2:
                continue
            kept.append(lemma)
        kept_per_document.append(kept)
    # Ragged result: fill an object array explicitly, because recent NumPy
    # releases raise when asked to build an array from ragged nested lists.
    filtered_documents = np.empty(len(kept_per_document), dtype=object)
    for index, kept in enumerate(kept_per_document):
        filtered_documents[index] = kept
    return filtered_documents

filtered = filter_stopwords(tokens)
print(filtered[0])

In [None]:
# prepare data for LDA
import gensim 
# Build the gensim vocabulary (token -> integer id) from the filtered documents.
dictionary = gensim.corpora.Dictionary(filtered)
# Convert every document into gensim's bag-of-words representation.
corpus = [dictionary.doc2bow(document) for document in filtered]

print(dictionary)
# NOTE(review): this prints the bag-of-words vector of every document, which
# can produce a very long cell output.
print(corpus)

## Hypothesis 2: Topic Clusters
We expect topics to emerge from different types and contexts of animals, such as "pet", which includes "cat", "dog" and "human", and its counterpart, wild animals, with words like "tiger", "predator" and "wild"; "cattle", consisting of "cow", "pig" and "livestock"; and also clusters separating mammals from reptiles and terrestrial from aquatic animals. And since "bat" and "bird" were among the most frequent nouns, a cluster describing flying animals could emerge as well. 

In [None]:
# Build the LDA model: 10 topics, 15 passes over the corpus, and a fixed
# random_state so that repeated runs start from the same seed.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word = dictionary, passes = 15, random_state = 1604)
# Summarise every topic by 4 of its words and print the summaries.
topics = ldamodel.print_topics(num_words = 4)
for topic in topics:
    print(topic)

In [None]:
# visualize topics (using first three levels)
import pyLDAvis
# pyLDAvis 3.x renamed its gensim adapter module from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell runs with either version.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# sort_topics = False is passed so that pyLDAvis does not reorder the topics.
lda_display = gensimvis.prepare(ldamodel, corpus, dictionary, sort_topics = False)
pyLDAvis.display(lda_display)

## Evaluation of Hypothesis 2
Cluster 3 includes the words "virus", "pain", "disease" and "infection", describing cellular life, which we didn't take into consideration. Also, cluster 9 roughly resembles what we described as wild animals, as it includes the words "prey" and "predator". 
Cluster 4 covers what we called aquatic animals, with words like "fish", "dolphin" and "waters". 
And as we expected, a cluster about flying animals emerged (cluster 6) with words including "owl", "bird", "nest", "eggs" but not "bat", so it only describes birds and not flying animals in general. 
Cluster 8 is pretty niche since it seems to describe animals from dry areas like deserts with words like "snake" and "coyote". Right next to it is cluster 5, describing livestock as we guessed. It includes the words "cattle", "horse" and "donkey" but not "cow" and "pig". 

cellular 5
wild
pet 4
water 7
air 6
cattle 1

In [None]:
# using only topics from the first layer
# NOTE(review): absolute, machine-specific path; this cell only runs where
# exactly this file exists. Consider a path relative to the notebook, as is
# done for the article CSV at the top.
level_map_path = "C:/Users/Anita/Desktop/data/level_map.csv"
# ';'-separated file, read with the explicit column labels "level" and "titles".
level_map = pd.read_csv(level_map_path, delimiter = ";", names = ["level", "titles"])
# Overwrite the "level" column with 0, 1, 2. The assignment requires the file
# to contain exactly three rows; pandas raises on a length mismatch.
level_map["level"] = [0,1,2]
level_map.head()

In [None]:
# visualize named entity distribution
def ner(documents, full_names=False):
    """Collect the named entities that NLTK finds in the given documents.

    Every document is split into sentences; each sentence is tokenised,
    POS-tagged and passed to ``nltk.ne_chunk``. Every labelled chunk in the
    result contributes one entry to the output.

    Parameters
    ----------
    documents : iterable of str
        Raw article texts.
    full_names : bool, default False
        With ``False`` only the first token of each entity is recorded, so a
        multi-word entity is reduced to its first word. With ``True`` all
        tokens of the entity are joined by single spaces.

    Returns
    -------
    numpy.ndarray
        Array of entity strings, one entry per detected entity (duplicates
        are kept so that frequencies can be counted afterwards).
    """
    named_entities = []
    for document in documents:
        for sentence in nltk.sent_tokenize(document):
            tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))
            for chunk in nltk.ne_chunk(tagged_sentence):
                # Only chunks that carry a label are treated as entities.
                if not hasattr(chunk, 'label'):
                    continue
                entity_tokens = [leaf[0] for leaf in chunk.leaves()]
                if full_names:
                    named_entities.append(" ".join(entity_tokens))
                else:
                    named_entities.append(entity_tokens[0])
    return np.array(named_entities)

named_entities = ner(documents)

In [None]:
def map_frequencies_ner(named_entities):
    """Count how often each named entity occurs.

    Parameters
    ----------
    named_entities : iterable
        Flat collection of entity names; duplicates are expected.

    Returns
    -------
    dict
        Maps every entity to its number of occurrences; keys appear in the
        order in which the entities were first seen.
    """
    tally = {}
    for entity in named_entities:
        try:
            tally[entity] += 1
        except KeyError:
            # First occurrence of this entity.
            tally[entity] = 1
    return tally
            
# Number of occurrences of every extracted named entity.
ne_count = map_frequencies_ner(named_entities)

## Hypothesis 3: Most frequent named entities
It is unlikely that a specific animal name is one of the top named entities. Rather, it could be locations since many different species live in one region. This could be described in terms of country or continent names like "Africa", "Australia" and "U.S.". 

In [None]:
# Keep the entities that occur more than 100 times and order them from the
# least to the most frequent one.
ne_count_subset = {
    entity: count
    for entity, count in ne_count.items()
    if count > 100
}
sorted_ne = dict(sorted(ne_count_subset.items(), key=lambda pair: pair[1]))

# Bar labels and heights, in plotting order.
labels = [entity for entity in sorted_ne]
values = [sorted_ne[entity] for entity in labels]

plt.figure(figsize=(20, 8))
plt.title("Most common named entities", fontdict={'fontsize': 16})
plt.bar(labels, values)
plt.xticks(rotation=45)
plt.show()

## Evaluation of Hypothesis 3
The guess that locations are among the most frequent named entities was right; in fact, not a single animal name is included, although we thought that a few might be listed. We assumed that continents and rough region descriptions would be more common than countries, yet "Canada" is the most frequently occurring named entity, followed by "African", "North", "South", and "Western". The last three probably belong to other country names, so they should be discarded. 