This notebook reads in a cleaned corpus text file and returns the most frequent words and the most important sentences, representing a summary of the corpus.  The code is modeled after the code from the following paper by R. Mihalcea and P. Tarau:
https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf

Additional reference for implementing the paper's code:
https://github.com/davidadamojr/TextRank/blob/master/textrank/__init__.py


In [10]:
#import packages

import nltk, string
import numpy as np
import pandas as pd
import itertools
import textrank
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
#read in file

with open('genetic_testing_corpus_cleaned.txt', 'r') as f:
    corpus = f.read()

corpus = corpus.decode('utf-8').strip()

### Preprocessing:
1. remove extra spaces, tabs, and returns
2. stemming, lemmatisation, POS tagging

In [8]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [12]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string, re


def processed(text):
    text = ' '.join(corpus.strip().split('\n')).lower()
    lemmatiser = WordNetLemmatizer()
    lem_text = lemmatiser.lemmatize(text)
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    lem_text = regex.sub('', lem_text)
    return lem_text
    
clean = processed(corpus)

I compared stemming to lemmatizing and found that for this particular corpus, the two methods yielded very similar results.  I decided to go with lemmatizing since it avoids making up words, as stemming sometimes does.

In [13]:
#tag text with POS (part of speech) & tokenize

from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize

def tag_tokenize(processed_text):
    tokens = word_tokenize(processed_text) # Generate list of tokens
    tagged = pos_tag(tokens)
    sentences = sent_tokenize(corpus)  #will use later in TextRank
    return tagged

tagged = tag_tokenize(clean)

In [28]:
#remove stopwords & filter for tags:
from nltk.corpus import stopwords

def filter_for_tags(tagged, tags=['NN', 'JJ', 'NNP']):
    #filter based on POS tags
    tagged = [item for item in tagged if item[1] in tags]
    return tagged

def filter_nostp(tagged_text):
    filtered = filter_for_tags(tagged_text)
    #filtered = re.sub(u"\u2019", "", filtered)
    stp = stopwords.words("english")
    add1 = ["thats", "says", "'", "theres", "-", "mr", "its", "whats", "wheres", "even", "also", "may", "might", "think", "believe", "study", "dr", "university"]
    add = [unicode(i, "utf-8") for i in add1]
    stop = stp + add
    no_stp = [w[0] for w in filtered if w[0] not in stop]
    return no_stp

#will call the filter_nostp() function later in word frequency count

tagged2 = filter_for_tags(tagged)

### Get most common words in corpus:

In [29]:
from collections import Counter

count = Counter(filter_nostp(tagged2))
top30 = count.most_common(30)
for i in top30:
    print i[0]

’
genetic
”
dna
risk
“
new
company
disease
test
family
ancestry
medical
percent
health
cancer
gene
—
clement
many
european
time
information
mother
research
intelligence
african
school
last
heart


### TextRank Algorithm:

In [30]:
#tokenize sentences
sentences = sent_tokenize(corpus)

def normalize(tagged):
    """Return a list of tuples with the first item's periods removed."""
    tagged = [(item[0].replace('.', ''), item[1]) for item in tagged]

normalize(tagged2)    
    
def unique_everseen(iterable, key=None):
    #List unique elements in order of appearance.
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in [x for x in iterable if x not in seen]:
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element

In [31]:
#calculate cosine distance:
import re, math
WORD = re.compile(r'\w+')
import itertools
import networkx as nx

def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())
     numerator = sum([vec1[x] * vec2[x] for x in intersection])

     sum1 = sum([vec1[x]**2 for x in vec1.keys()])
     sum2 = sum([vec2[x]**2 for x in vec2.keys()])
     denominator = math.sqrt(sum1) * math.sqrt(sum2)

     if not denominator:
        return 0.0
     else:
        return float(numerator) / denominator

def text_to_vector(text):
     words = WORD.findall(text)
     return Counter(words)    


def build_graph(nodes):
    """Return a networkx graph instance.
    :param nodes: List of hashables that represent the nodes of a graph.
    """
    gr = nx.Graph()  # initialize an undirected graph
    gr.add_nodes_from(nodes)
    nodePairs = list(itertools.combinations(nodes, 2))

    # add edges to the graph (weighted by cosine distance)
    for pair in nodePairs:
        firstString = pair[0]
        vector1 = text_to_vector(firstString)
        secondString = pair[1]
        vector2 = text_to_vector(secondString)
        pairwise_sim = get_cosine(vector1, vector2)
        gr.add_edge(firstString, secondString, weight=pairwise_sim)

    return gr

build_graph(sentences)

<networkx.classes.graph.Graph at 0x1a3174c1d0>

### Return key phrases from corpus:

In [32]:
unique_word_set = unique_everseen([x[0] for x in tagged2])
word_set_list = list(unique_word_set)

In [33]:
#calculate score of each sentence

calculated_page_rank = nx.pagerank(build_graph(sentences), weight='weight')

In [36]:
# most important words in ascending order of importance
keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get,
                        reverse=True)

# the number of keyphrases returned will be relative to the size of the
# text (a third of the number of vertices)
one_third = len(word_set_list) // 3
keyphrases = keyphrases[0:one_third + 1]

In [37]:
res = []
for i in range(0,19):
    keyphrases[i] = keyphrases[i].encode('utf-8')
    res.append(keyphrases[i])


In [38]:
#display neater output:

for i in res:
    print i

And with the financial support of a handful of like-minded wealthy individuals who agreed to invest in the exploratory phase of the project, “it just seemed,” Mr. Clement said, “like something I could do.” Even with the Harvard name as a calling card, several of the families he contacted over the next few years did not respond to his inquiries.
A spokesperson for the agency declined comment, saying that “FTC investigations are non-public, and so typically we do not comment on an investigation or even whether we are investigating.” Privacy issues in the use of such DNA testing kits came to the forefront last month with the arrest of the notorious Golden State Killer, when it was revealed that police had used data from GEDMatch , a genealogy research site where users upload genealogical and genetic information, to help identify the suspect.
“There is a demand.” Yet even if just a minority of 23andMe customers decided to game the current insurance system, “it’s enough to perturb the marke

In [27]:
textfile = open('genetic_testing_summary.txt', 'w')

for item in res:
  textfile.write("%s\n" % item)
textfile.close()

#note: in the text file, all punctuation is back to normal (no weird symbols) as it is typically in UTF-8.

## Part II: Topic Modeling with LDA

In [54]:
#read in texts

articles = pd.read_csv("genetic_testing_text.csv")

In [55]:
articles.head(2)

Unnamed: 0,Article,Text
0,1,He learned of his mixed-race ancestry through ...
1,2,Ms. Reilly found that she had inherited an Apo...


In [83]:
corp = articles.Text.tolist()

In [88]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stp = stopwords.words('english')
add1 = ["thats", "says", "'", "said", "people", "theres", "-", "mr", "its", "whats", "wheres", "even", "also", "may", "might", "think", "believe", "study", "dr", "university"]
add = [unicode(i, "utf-8") for i in add1]
stop = stp + add
stop = set(stop)

exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    doc = doc.decode('utf-8').strip()
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

corp_clean = [clean(doc).split() for doc in corp]

In [90]:
from gensim import corpora
dictionary = corpora.Dictionary(corp_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corp_clean]


In [93]:
import gensim

lda = gensim.models.ldamodel.LdaModel
ldamodel = lda(doc_term_matrix, num_topics=10, id2word = dictionary, passes=20)
ldamodel.save('model1.gensim')

In [92]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, u'0.000*"genetic" + 0.000*"test" + 0.000*"dna" + 0.000*"\u2014"')
(1, u'0.000*"mr" + 0.000*"dna" + 0.000*"\u2014" + 0.000*"test"')
(2, u'0.025*"gene" + 0.021*"intelligence" + 0.018*"study" + 0.014*"genetic"')
(3, u'0.012*"dna" + 0.009*"family" + 0.008*"result" + 0.008*"ancestry"')
(4, u'0.021*"company" + 0.014*"23andme" + 0.013*"genetic" + 0.012*"data"')
(5, u'0.025*"risk" + 0.020*"genetic" + 0.017*"test" + 0.017*"cancer"')
(6, u'0.011*"mr" + 0.010*"dna" + 0.008*"clement" + 0.007*"company"')
(7, u'0.000*"genetic" + 0.000*"dna" + 0.000*"gene" + 0.000*"result"')
(8, u'0.013*"dna" + 0.012*"percent" + 0.011*"family" + 0.010*"ancestry"')
(9, u'0.000*"dna" + 0.000*"genetic" + 0.000*"family" + 0.000*"percent"')
