## https://pythondata.com/text-analytics-visualization/ 

In [1]:
pip install bs4 nltk

Note: you may need to restart the kernel to use updated packages.


### code

In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
from collections import Counter
from html.parser import HTMLParser
from collections import OrderedDict
import re
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from bs4 import BeautifulSoup

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
porter = PorterStemmer()
wnl = WordNetLemmatizer()
stop = stopwords.words('english')
stop.append("new")
stop.append("like")
stop.append("u")
stop.append("it")
stop.append("'s")
stop.append("n't")
stop.append("mr.")
stop = set(stop)

### Tokenizer

In [4]:
# From http://ahmedbesbes.com/how-to-mine-newsfeed-data-and-extract-interactive-insights-in-python.html
def tokenizer(text):
    tokens_ = [word_tokenize(sent) for sent in sent_tokenize(text)]
    
    tokens = []
    for token_by_sent in tokens_:
        tokens += token_by_sent
        
    tokens = list(filter(lambda t: t.lower() not in stop, tokens))
    tokens = list(filter(lambda t: t not in punctuation, tokens))
    tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u'``',u"''", u'\u2014', u'\u2026', u'\u2013'],tokens)) 
    
    filtered_tokens = []
    for token in tokens:
        token = wnl.lemmatize(token)
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    
    filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))
    
    return filtered_tokens
        

### Strip HTML text from the content

In [5]:
class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []
    
    def handle_data(self, d):
        self.fed.append(d)
    
    def get_data(self):
        return ''.join(self.fed)
    
def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [6]:
def get_keywords(tokens, num):
    return Counter(tokens).most_common(num)

In [7]:
def build_article_df(urls):
    articles = []
    for index, row in urls.iterrows():
        try:
            data = row['text'].strip().replace("'","")
            data = strip_tags(data)
            soup = BeautifulSoup(data)
            data = soup.get_text()
            data = data.encode('ascii', 'ignore').decode('ascii')
            document = tokenizer(data)
            top_5 = get_keywords(document, 5)
            
            unzipped = list(zip(*top_5))
            kw = list(unzipped[0])
            kws = ",".join(str(x) for x in kw)
            articles.append((kws, row['title'], row['pubdate']))
        except Exception as e:
            print(e)
            #print data
            #break
            pass
        #break
        
    article_df=pd.DataFrame(articles, columns =['keywords','title','pubdate'])
    return article_df

### Load data 

In [8]:
df = pd .read_csv('tocsv.csv')
data = []
for index, row in df.iterrows():
    data.append((row['Title'], row['Permalink'], row['Date'], row['Content']))
data_df = pd.DataFrame(data, columns=['title','url','pubdate','text'])

In [9]:
data_df.tail()

Unnamed: 0,title,url,pubdate,text
143,Driving Digital by Isaac Sacolick - a book review,http://ericbrown.com/driving-digital-isaac-sac...,20170906,"<img class=""alignleft size-medium wp-image-975..."
144,Data and Culture go hand in hand,http://ericbrown.com/?p=9757,-11130,"Last week, I spent an afternoon talking to the..."
145,Data Quality - The most important data dimension?,http://ericbrown.com/data-quality-most-importa...,20170918,"<img class=""size-medium wp-image-9764 alignrig..."
146,"Be pragmatic, not dogmatic",http://ericbrown.com/be-pragmatic-not-dogmatic...,20170928,"<img class=""alignright size-medium wp-image-97..."
147,The Data Way,http://ericbrown.com/the-data-way.htm,20171003,"<img class=""alignleft size-medium wp-image-977..."


In [10]:
article_df = build_article_df(data_df)

In [11]:
article_df.head()

Unnamed: 0,keywords,title,pubdate
0,"data,big,culture,may,skill",Building a Data Culture,20141118
1,"data,data-driven,make,company,decision","Note to Self - Don't say ""Data Driven"" Anymore",20141120
2,"captured,canon,titmouse,backporch,feeder",Foto Friday - Titmouse on the Feeder,20141121
3,"mobility,organization,device,mobile,access",The Cloud - Gateway to Enterprise Mobility,20141121
4,"data,center,agile,organization,one",The Agile Data Center,20141124


In [12]:
keywords_array=[]
for index, row in article_df.iterrows():
    keywords = row['keywords'].split(',')
    for kw in keywords:
        keywords_array.append((kw.strip(' '), row['keywords']))
        
kw_df = pd.DataFrame(keywords_array).rename(columns={0:'keyword', 1:'keywords'})

In [13]:
kw_df.head()

Unnamed: 0,keyword,keywords
0,data,"data,big,culture,may,skill"
1,big,"data,big,culture,may,skill"
2,culture,"data,big,culture,may,skill"
3,may,"data,big,culture,may,skill"
4,skill,"data,big,culture,may,skill"


In [14]:
document = kw_df.keywords.to_list()
names = kw_df.keyword.to_list()

In [15]:
document_array =[]
for item in document:
    items = item.split(',')
    document_array.append((items))
    
occurences = OrderedDict((name, OrderedDict((name, 0) for name in names)) for name in names)

## Find the Co Occurences
for l in document_array:
    for i in range(len(l)):
        for item in l[:i] + l[i+1:]:
            occurences[l[i]][item] +=1

In [16]:
co_occur = pd.DataFrame.from_dict(occurences)

In [17]:
co_occur.to_csv('out/text_matrix.csv')

In [18]:
co_occur.head()

Unnamed: 0,data,big,culture,may,skill,data-driven,make,company,decision,captured,...,manager,love,song,scene,isaac,quality,governance,pragmatic,dogmatic,thats
data,0,75,5,5,5,10,15,30,5,0,...,0,0,0,0,0,5,5,0,0,5
big,75,0,5,5,5,0,5,10,0,5,...,0,0,0,0,0,0,0,0,0,0
culture,5,5,0,5,5,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
may,5,5,5,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
skill,5,5,5,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
