In [250]:
import gensim
import dill
import pandas as pd

In [331]:
def get_data(bias):
    """Load one pickled article collection into a de-duplicated DataFrame.

    Parameters
    ----------
    bias : str
        Name fragment of the collection; the file read is
        ``data/<bias>_articles.pkl``.

    Returns
    -------
    pandas.DataFrame
        One row per kept article with the columns ``index`` (the article's
        position in the pickled sequence before filtering), ``title``,
        ``text`` and ``date``. Rows sharing a title are collapsed to the first
        occurrence and articles whose text is 500 characters or shorter are
        dropped. The row labels are renumbered 0..n-1.

    Notes
    -----
    Each loaded article must be indexable by ``'title'``, ``'text'`` and
    ``'date'``. Other fields of the article are not part of the returned frame.
    """
    # SECURITY: dill.load can execute arbitrary code stored in the file.
    # Only load pickles that come from a trusted source.
    with open('data/{}_articles.pkl'.format(bias), 'rb') as f:
        articles = dill.load(f)

    df = pd.DataFrame.from_dict({
        'title': [article['title'] for article in articles],
        'text': [article['text'] for article in articles],
        'date': [article['date'] for article in articles],
    })
    df = df.drop_duplicates('title')
    # Very short texts are discarded before any modelling.
    df = df[df['text'].str.len() > 500]
    # Renumber rows so that positional and label-based lookups agree; the old
    # labels are kept in the 'index' column.
    df = df.reset_index()
    return df

In [332]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): presumably intended to make the stochastic model fitting in
# later cells repeatable -- confirm those models actually draw from this RNG.
np.random.seed(2018)
import nltk

In [333]:
stemmer = SnowballStemmer('english')
# Built once and shared: lemmatize_stemming runs for every kept token, so
# constructing a new lemmatizer per call is wasted work.
lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Return the normalized form of a single token.

    The token is first lemmatized as a verb (``pos='v'``) with the WordNet
    lemmatizer, and the result is then stemmed with the English Snowball
    stemmer.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize ``text`` and return the normalized form of every kept token.

    Tokens are produced by gensim's ``simple_preprocess``. A token is kept
    only if it is longer than three characters and is not in gensim's
    ``STOPWORDS``; each kept token is passed through ``lemmatize_stemming``.
    The result is a list in the original token order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

In [334]:
# Load the 'right' article collection and convert every article body into a
# list of normalized tokens (one list per row, aligned with `data`).
data = get_data(bias='right')
processed_text = data.loc[:, 'text'].map(preprocess)

In [335]:
# Map every distinct token to an integer id, then express each document as a
# bag of words: a list of (token_id, count) pairs.
dictionary = gensim.corpora.Dictionary(processed_text)
# Vocabulary pruning is currently switched off:
# dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=100000)
bow_corpus = list(map(dictionary.doc2bow, processed_text))

In [352]:
# Vocabulary size: number of distinct tokens held in `dictionary`.
len(dictionary)

14561

In [336]:
from gensim import corpora, models

# Fit TF-IDF weights on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)

# Dense document-by-vocabulary matrix: row = document, column = token id,
# value = TF-IDF weight (0 where the token does not occur in the document).
text_vectors = np.zeros((len(bow_corpus), len(dictionary)))
for row, doc_bow in enumerate(bow_corpus):
    for token_id, weight in tfidf[doc_bow]:
        text_vectors[row, token_id] = weight

In [351]:
# Recommend the article whose TF-IDF vector is closest to the query article.
query_idx = 500
vec = text_vectors[query_idx, :]

# L1 (sum of absolute differences) distance from the query to every document.
diff = np.sum(np.abs(text_vectors - vec), axis=1)

# Exclude the query itself explicitly instead of assuming it sorts to rank 0:
# articles are de-duplicated by title only, so another article with identical
# text also has distance 0 and could push the query into the second slot.
candidates = diff.copy()
candidates[query_idx] = np.inf
recommend_idx = np.argmin(candidates)

print('Min diff: ', diff[recommend_idx])

print('Title: ', data.loc[query_idx, 'title'], '\n')
print('Text: ', data.loc[query_idx, 'text'], '\n\n')
print('Title: ', data.loc[recommend_idx, 'title'], '\n')
print('Text: ', data.loc[recommend_idx, 'text'])
print(recommend_idx)

Min diff:  10.901514942185178
Title:  IHop Employee Shoots and Kills Active Shooter at Restaurant 

Text:  Two people, including a suspected active shooter are dead following a fatal altercation at a Huntsville, Alabama IHOP restaurant earlier this week. The armed employee who drew his own firearm to stop the shooting was also wounded in the shootout. Police are new releasing more details about the incident.

[Scroll Down For Video]

“We have crime scene investigators and our major crimes unit is here,” Lt. Michael Johnson said Wednesday night. “Right now we have witnesses writing out statements and talking with multiple investigators. We have a lot of moving parts right now. There are people who are emotional. We just need a little time to piece this together.”

According to a local media report:

Police say the incident began as a dispute related to restaurant service. Investigators say that [customer Roderick Turner, 25] was at the restaurant to pick up a to-go order when he became 

In [257]:
# Fit a 10-topic LDA model on the bag-of-words corpus (5 passes, 2 worker
# processes), then print the top words of every topic.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=10,
    passes=5,
    workers=2,
)
for topic_id, top_words in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, top_words))

Topic: 0 
Words: 0.009*"say" + 0.009*"peopl" + 0.008*"like" + 0.006*"year" + 0.006*"school" + 0.006*"work" + 0.005*"time" + 0.005*"think" + 0.005*"know" + 0.004*"go"
Topic: 1 
Words: 0.006*"say" + 0.006*"peopl" + 0.005*"like" + 0.004*"year" + 0.003*"kelli" + 0.003*"women" + 0.003*"know" + 0.003*"go" + 0.003*"moon" + 0.003*"cooki"
Topic: 2 
Words: 0.007*"world" + 0.007*"nowthi" + 0.006*"report" + 0.005*"prison" + 0.005*"time" + 0.005*"like" + 0.004*"trump" + 0.004*"peopl" + 0.004*"say" + 0.004*"year"
Topic: 3 
Words: 0.009*"peopl" + 0.008*"white" + 0.006*"black" + 0.005*"worker" + 0.005*"like" + 0.005*"state" + 0.005*"time" + 0.005*"year" + 0.005*"come" + 0.004*"ident"
Topic: 4 
Words: 0.009*"say" + 0.008*"like" + 0.006*"trump" + 0.005*"go" + 0.005*"time" + 0.005*"year" + 0.005*"warren" + 0.004*"women" + 0.004*"know" + 0.004*"tucker"
Topic: 5 
Words: 0.006*"hitler" + 0.006*"babi" + 0.004*"shapiro" + 0.004*"say" + 0.003*"januari" + 0.003*"child" + 0.003*"violenc" + 0.003*"leg" + 0.003*"p

In [262]:
# Dense document-by-topic matrix: row = document, column = topic id,
# value = topic weight reported by the LDA model for that document.
# The width is read from the model so it cannot drift from the num_topics
# value the model was trained with.
num_topics = lda_model.num_topics
text_vectors = np.zeros((len(bow_corpus), num_topics))
for i, doc_bow in enumerate(bow_corpus):
    # Topics that get_document_topics does not report for a document keep
    # their initial weight of 0.
    for topic, weight in lda_model.get_document_topics(doc_bow):
        text_vectors[i, topic] = weight

In [273]:
# Recommend the article whose topic vector is closest to the query article.
query_idx = 120
vec = text_vectors[query_idx, :]

# L1 (sum of absolute differences) distance from the query to every document.
diff = np.sum(np.abs(text_vectors - vec), axis=1)

# Exclude the query itself explicitly instead of assuming it sorts to rank 0:
# any other document with the same vector ties at distance 0, and the order
# of tied entries in an argsort result is not something to rely on.
candidates = diff.copy()
candidates[query_idx] = np.inf
recommend_idx = np.argmin(candidates)

print('Title: ', data.loc[query_idx, 'title'], '\n')
print('Date: ', data.loc[query_idx, 'date'], '\n')
print('Text: ', data.loc[query_idx, 'text'])
data.loc[query_idx, 'date']

Title:  Can the U.S. Reinstate “Maximum Pressure” on North Korea? 

Date:  2018-12-04 00:00:00 

Text:  The diplomatic sprint to North Korean denuclearization has slowed to a crawl. Earlier last month, North Korea abruptly canceled talks with U.S. Secretary of State Mike Pompeo, with reports suggesting that Pyongyang continues to enhance its nuclear and missile capabilities. Despite U.S. President Donald Trump’s insistence that North Korean leader Kim Jong Un is serious about giving up his nuclear weapons, chances are good that the United States is going to need a Plan B to manage the nuclear threat.

Unfortunately, the air had already been leaking out of the Trump administration’s “maximum pressure” strategy since early to mid-2018. Worse still, it will likely prove extremely difficult to revive international efforts to squeeze North Korea if the current diplomatic push hits a dead end. Key countries that were supportive of the pressure campaign—most notably China and South Korea—are 

datetime.datetime(2018, 12, 4, 0, 0)

In [274]:
# Show the recommended article: its row label first, then title, date and text.
print(recommend_idx)
recommended = data.loc[recommend_idx]
print('Title: ', recommended['title'], '\n')
print('Date: ', recommended['date'], '\n')
print('Text: ', recommended['text'])

128
Title:  News You Can Choose: Iowa Bar Cancels MLK-Themed Keg Party 

Date:  None 

Text:  Screenshot: Twitter

Note: To read the story of how an Iowa bar callously disrespected the legacy and the name of Martin Luther King Jr., please read the plain type. To read the story about how an Iowa bar came to its senses and apologized for its offensive, racist actions, follow the story in bold type.



As America celebrates the birth of one of its greatest civil rights icons, an Iowa bar decided it was a perfectly good idea to name a huge beer bash after Martin Luther King Jr, only reconsidering after considerable public outcry, proving that White America continues one of this country’s oldest traditions: “Trying it.”

As America celebrates the birth of one of its greatest civil rights icons, public outcry forced an Iowa bar to cancel a huge beer bash named after Martin Luther King Jr., proving that this country has made strides towards racial progress.

Cedar Falls, Iowa is one of the wh