In [9]:
import pandas as pd
import requests
import datetime
import pickle
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from political_utils import clustering as cl
import warnings
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import networkx as nx
import nltk
nltk.download('stopwords')
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/i516134/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# help interpret clusters
def breakdown(column):
    for i in list(df[column].unique()):
        x = df[df[column]==i]
        print('CLUSTER ', str(i))
        print(x['media_site'].value_counts())
        print('===========================')

In [18]:
def read_article(art):
    article = art.split(". ")
    sentences = []
    for sentence in article:
        # print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop() 
    return sentences

def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)
 
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix


def generate_summary(art, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text and split it
    sentences =  read_article(art)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
    #print("Indexes of top ranked_sentence order are ", ranked_sentence)    

    top_n = 5 if len(ranked_sentence) >=5 else len(ranked_sentence)

    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize texr
    result = ("Summarize Text: \n", ". ".join(summarize_text))
    return result
    
    

In [5]:
df = pd.read_pickle('resources/data/102019_kav_clusters.pkl')

In [19]:
df.head()

Unnamed: 0,publish_date,url,title,authors,media_site,article,word_count,cue_stats,cue_stats_title,total,...,embedding_kmeans_cluster_5,embedding_kmeans_cluster_6,tf_idf_dbscan_epsilon_0.4,tf_idf_dbscan_epsilon_0.85,bow_dbscan_epsilon_0.4,bow_dbscan_epsilon_0.85,embedding_dbscan_epsilon_0.4,embedding_dbscan_epsilon_0.45,embedding_dbscan_epsilon_0.85,embedding_dbscan_epsilon_0.9
0,9/20/18,https://www.americanthinker.com/articles/2018/...,Blasey Ford Must Be Acknowledged and Then Dism...,[],americanthinker.com,She released only selected portions of her the...,1266,"{'dianne feinstein': 1, 'house': 2, 'senate': ...",{'total': 0},4,...,2,3,-1,-1,-1,-1,-1,3,0,0
1,9/21/18,https://www.americanthinker.com/articles/2018/...,Dear Juanita Broaddrick,[],americanthinker.com,They want an FBI investigation of an individua...,969,"{'dianne feinstein': 2, 'bill clinton': 4, 'se...",{'total': 0},10,...,2,3,-1,-1,-1,-1,-1,-1,0,0
2,9/25/18,https://www.americanthinker.com/articles/2018/...,Debra Katz Was Wrong about Paula Jones's Case,[],americanthinker.com,Debra Katz Was Wrong about Paula Jones's Case\...,1444,"{'bill clinton': 1, 'hillary clinton': 1, 'tot...",{'total': 0},2,...,1,5,-1,-1,-1,-1,-1,-1,0,0
3,9/28/18,https://www.americanthinker.com/articles/2018/...,Fake Rape Victims Are More Fun than Real Ones,[],americanthinker.com,Fake Rape Victims Are More Fun than Real Ones\...,1151,{'total': 0},{'total': 0},0,...,2,3,-1,-1,-1,-1,-1,-1,0,0
4,9/20/18,https://www.americanthinker.com/articles/2018/...,Ford vs. Kavanaugh: There's Nothing to Investi...,[],americanthinker.com,The problem with that is that there is nothing...,934,"{'kennedy': 1, 'house': 4, 'total': 5}",{'total': 0},5,...,2,3,-1,-1,-1,-1,-1,3,0,0


In [21]:
# df['summary'] = df['article'].apply(lambda x: generate_summary(x))
summ = []
for index, row in df.iterrows():
    try:
        x = generate_summary(row['article'])
    except Exception as e:
        print('error')
        x = str(e)
    summ.append(x)

error
error


In [30]:
df.to_pickle('resources/data/102019_kav_clusters.pkl')

In [None]:
# run after this

In [None]:
df = pd.read_pickle('resources/data/102019_kav_clusters.pkl')

In [31]:
df

Unnamed: 0,publish_date,url,title,authors,media_site,article,word_count,cue_stats,cue_stats_title,total,...,embedding_kmeans_cluster_6,tf_idf_dbscan_epsilon_0.4,tf_idf_dbscan_epsilon_0.85,bow_dbscan_epsilon_0.4,bow_dbscan_epsilon_0.85,embedding_dbscan_epsilon_0.4,embedding_dbscan_epsilon_0.45,embedding_dbscan_epsilon_0.85,embedding_dbscan_epsilon_0.9,summary
0,9/20/18,https://www.americanthinker.com/articles/2018/...,Blasey Ford Must Be Acknowledged and Then Dism...,[],americanthinker.com,She released only selected portions of her the...,1266,"{'dianne feinstein': 1, 'house': 2, 'senate': ...",{'total': 0},4,...,3,-1,-1,-1,-1,-1,3,0,0,"(Summarize Text: \n, One would think, though, ..."
1,9/21/18,https://www.americanthinker.com/articles/2018/...,Dear Juanita Broaddrick,[],americanthinker.com,They want an FBI investigation of an individua...,969,"{'dianne feinstein': 2, 'bill clinton': 4, 'se...",{'total': 0},10,...,3,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, I remember all the specif..."
2,9/25/18,https://www.americanthinker.com/articles/2018/...,Debra Katz Was Wrong about Paula Jones's Case,[],americanthinker.com,Debra Katz Was Wrong about Paula Jones's Case\...,1444,"{'bill clinton': 1, 'hillary clinton': 1, 'tot...",{'total': 0},2,...,5,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, The legal viability of Ca..."
3,9/28/18,https://www.americanthinker.com/articles/2018/...,Fake Rape Victims Are More Fun than Real Ones,[],americanthinker.com,Fake Rape Victims Are More Fun than Real Ones\...,1151,{'total': 0},{'total': 0},0,...,3,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, Fake Rape Victims Are Mor..."
4,9/20/18,https://www.americanthinker.com/articles/2018/...,Ford vs. Kavanaugh: There's Nothing to Investi...,[],americanthinker.com,The problem with that is that there is nothing...,934,"{'kennedy': 1, 'house': 4, 'total': 5}",{'total': 0},5,...,3,-1,-1,-1,-1,-1,3,0,0,"(Summarize Text: \n, And Christine Blasey Ford..."
5,9/19/18,https://www.americanthinker.com/articles/2018/...,Ford's Accusation against Kavanaugh Is Not Cre...,[],americanthinker.com,Now the Dems and the media say any claim of se...,1234,"{'dianne feinstein': 2, 'bill clinton': 2, 'ke...",{'total': 0},8,...,5,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, Kavanaugh was on top of m..."
6,9/21/18,https://www.americanthinker.com/articles/2018/...,J'accuse as the New Legal Standard,[],americanthinker.com,""" J'accuse "" was the title of an editorial pub...",1194,"{'dianne feinstein': 1, 'keith ellison': 1, 'b...",{'total': 0},11,...,3,-1,-1,-1,-1,-1,0,0,0,"(Summarize Text: \n, They will stall and try t..."
7,9/25/18,https://www.americanthinker.com/articles/2018/...,Judge Kavanaugh and Sexual McCarthyism,[],americanthinker.com,McCarthy was looking for communists in the Sta...,1310,"{'dianne feinstein': 1, 'kennedy': 1, 'donald ...",{'total': 0},6,...,3,-1,-1,-1,-1,-1,0,0,0,"(Summarize Text: \n, The charges have been mad..."
8,9/18/18,https://www.americanthinker.com/articles/2018/...,Judge Kavanaugh Meets His Anita Hill,[],americanthinker.com,After the Democrats prattled on about not havi...,1206,"{'chuck grassley': 1, 'bernie sanders': 1, 'di...",{'total': 0},13,...,0,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, She is also among the tho..."
9,9/29/18,https://www.americanthinker.com/articles/2018/...,Kavanaugh Character Assassins Must Pay in Nove...,[],americanthinker.com,From Dianne Feinstein's withholding of Dr. For...,1326,"{'richard blumenthal': 1, 'kamala harris': 1, ...",{'total': 0},22,...,0,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, Sen. Sen. Sen. Blumenthal..."


In [26]:
breakdown('embedding_kmeans_cluster_6')

CLUSTER  3
PJMedia                      10
The American Spectator        9
Hot Air                       9
National Review               7
americanthinker.com           7
thefederalist.com             7
frontpagemag.com              7
conservativetribune.com       6
conservativereview.com        6
pamelageller.com              6
CNS News                      6
Wall Street Journal           6
townhall.com                  6
weeklystandard.com            5
NewsMax                       4
Right Scoop                   4
Michael Savage                4
The Blaze                     3
whatreallyhappened            3
Western Journalism Center     3
Free Beacon                   2
Daily Wire                    2
Zero Hedge                    2
Breitbart                     2
thepoliticalinsider.com       2
Conservative Treehouse        1
Washington Times              1
bigleaguepolitics.com         1
WorldNetDaily                 1
InfoWars                      1
Name: media_site, dtype: int6

In [27]:
# print cluster
df[df['embedding_kmeans_cluster_6']==5]

Unnamed: 0,publish_date,url,title,authors,media_site,article,word_count,cue_stats,cue_stats_title,total,...,embedding_kmeans_cluster_6,tf_idf_dbscan_epsilon_0.4,tf_idf_dbscan_epsilon_0.85,bow_dbscan_epsilon_0.4,bow_dbscan_epsilon_0.85,embedding_dbscan_epsilon_0.4,embedding_dbscan_epsilon_0.45,embedding_dbscan_epsilon_0.85,embedding_dbscan_epsilon_0.9,summary
2,9/25/18,https://www.americanthinker.com/articles/2018/...,Debra Katz Was Wrong about Paula Jones's Case,[],americanthinker.com,Debra Katz Was Wrong about Paula Jones's Case\...,1444,"{'bill clinton': 1, 'hillary clinton': 1, 'tot...",{'total': 0},2,...,5,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, The legal viability of Ca..."
5,9/19/18,https://www.americanthinker.com/articles/2018/...,Ford's Accusation against Kavanaugh Is Not Cre...,[],americanthinker.com,Now the Dems and the media say any claim of se...,1234,"{'dianne feinstein': 2, 'bill clinton': 2, 'ke...",{'total': 0},8,...,5,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, Kavanaugh was on top of m..."
70,9/24/18,https://bigleaguepolitics.com/democrats-active...,Democrats Actively Sought Second Accuser Again...,[Peter D'Abrosca],bigleaguepolitics.com,The author of a slanderous political hit piece...,355,"{'senate': 1, 'total': 1}",{'total': 0},1,...,5,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, It never came up. The aut..."
121,9/18/18,http://feedproxy.google.com/~r/breitbart/~3/Wb...,Third Person that Kavanaugh Accuser Claims Was...,[Ian Mason],Breitbart,A third high school student Brett Kavanaugh ac...,760,"{'chuck grassley': 1, 'dianne feinstein': 1, '...",{'total': 0},8,...,5,-1,-1,-1,-1,-1,1,0,0,"(Summarize Text: \n, A third high school stude..."
350,9/19/18 14:25,https://www.cnsnews.com/news/article/terence-p...,Third Alleged Party Attendee Denies Kavanaugh ...,[Terence P. Jeffrey],CNS News,Sen. Dianne Feinstein (D.-Calif.) questions Ju...,1215,"{'chuck grassley': 1, 'dianne feinstein': 3, '...",{'total': 0},15,...,5,-1,-1,-1,-1,-1,1,0,0,"(Summarize Text: \n, Ford wants to cooperate w..."
352,9/21/18 15:10,https://www.cnsnews.com/news/article/emily-war...,87 Women Who Know Brett Kavanaugh: The Allegat...,[Emily Ward],CNS News,"Sara Fagen, #IStandWithBrett.\n\n(CNSNews.com)...",991,"{'house': 1, 'senate judiciary committee': 1, ...",{'total': 0},3,...,5,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, Our country simply cannot..."
443,10/3/18 22:35,https://theconservativetreehouse.com/2018/10/0...,Retired FBI Agent/DOJ Lawyer Ms. Monica McLean...,[Posted On],Conservative Treehouse,Earlier today we did a deep dive into the back...,298,"{'dianne feinstein': 1, 'total': 1}",{'total': 0},1,...,5,-1,0,-1,-1,-1,-1,0,0,"(Summarize Text: \n, Ms. Ms. Ford and Ms. Ford..."
444,9/22/18 23:15,https://theconservativetreehouse.com/2018/09/2...,Three Witnesses Described by Kavanaugh Accuser...,[Posted On],Conservative Treehouse,Dr. Christine Blasey-Ford has stated three wit...,504,"{'house': 5, 'senate judiciary committee': 1, ...",{'total': 0},7,...,5,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, Kavanaugh and she has no ..."
448,9/24/18 22:54,https://theconservativetreehouse.com/2018/09/2...,Ms. Blasey-Ford Attorney Casts Doubt on Appear...,[Posted On],Conservative Treehouse,As of last weekend sketchy DC political lawyer...,569,"{'house': 7, 'senate judiciary committee': 2, ...",{'total': 0},11,...,5,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, Ford there were five peop..."
474,9/20/18 12:00,https://www.conservativereview.com/news/kavana...,Kavanaugh accuser&#8217;s classmate: What? You...,[Chris Pandolfo],conservativereview.com,Kavanaugh accuser’s classmate: What? You mean ...,324,"{'senate judiciary committee': 1, 'senate': 1,...",{'total': 0},2,...,5,-1,-1,-1,-1,-1,-1,0,0,"(Summarize Text: \n, Kavanaugh accuser’s class..."


In [29]:
df['summary'].loc[4175]

('Summarize Text: \n',
 'Keyser does not refute Dr. Christine Ford’s allegation against Judge Brett Kavanaugh.”\n\nWalsh went on to stipulate that “as my client as already made clear, she does not know Judge Kavanaugh and has no recollection of ever being at a party or gathering where he was present, with, or without, Dr. Last week, Keyser said in a statement from her attorney, on penalty of a felony, that she didn’t attend such a party and didn’t even know Kavanaugh.\n\n“Simply put, Ms. Kavanaugh and she has no recollection of ever being at a party or gathering where he was present, with, or without, Dr. All of the individuals who she claimed attended the party with her and Kavanaugh deny any knowledge of the event taking place.\n\nOne of those people is Ford’s close friend Leland Keyser')