In [22]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
from ipywidgets import HBox, VBox
from IPython.display import display
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re
import string
import random
import nbinteract as nbi
%matplotlib inline
# Show full cell contents (no truncation of long text columns) when frames are displayed.
pd.options.display.max_colwidth = None

In [23]:
# Load the news articles; later cells read the 'title', 'text' and 'link' columns.
df = pd.read_csv(filepath_or_buffer='newsData.csv')

In [24]:
# Function for converting into lower case
def make_lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered


# Function for removing punctuation
# Compiled once at definition time instead of building a tokenizer object for
# every row the function is applied to.
_WORD_RUN = re.compile(r'\w+')


def remove_punctuation(text):
    """Return ``text`` reduced to its word tokens, joined by single spaces.

    A token is a maximal run of word characters (letters, digits and the
    underscore); everything between tokens -- punctuation and whitespace
    alike -- is dropped.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The tokens of ``text`` separated by one space each; an empty string
        when ``text`` contains no word characters.
    """
    return " ".join(_WORD_RUN.findall(text))

# Function for removing the html tags
def remove_html(text):
    """Delete HTML-style tags from ``text``.

    Every span that starts at a ``<`` and ends at the nearest following ``>``
    is removed; because ``.`` does not match a newline, a span broken across
    lines is left untouched.
    """
    return re.sub('<.*?>', '', text)

# Build the "cleaned" column from the raw article text.
# Order matters: HTML tags are stripped BEFORE punctuation. remove_punctuation
# discards '<' and '>', so running it first would leave the tag names behind
# as ordinary words and give remove_html nothing to match.
# Missing text is treated as an empty article so .lower() is never called on NaN.
df['cleaned'] = (
    df['text']
    .fillna('')
    .astype(str)
    .apply(make_lower_case)
    .apply(remove_html)
    .apply(remove_punctuation)
)

In [25]:
# Word-level TF-IDF over 1- to 3-grams of the cleaned article text, ignoring
# English stop words and terms present in more than 80% of the articles.
# min_df=1 keeps every term that occurs in at least one article -- the same
# vocabulary min_df=0 selected -- while staying inside the range that newer
# scikit-learn releases accept for an integer min_df (>= 1).
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_df=0.8,
    min_df=1,
    stop_words='english',
)

In [26]:
# Learn the vocabulary from the cleaned text and vectorize every article in one pass.
matrix = tf.fit_transform(raw_documents=df['cleaned'])


In [27]:
# Pairwise cosine similarity between all article vectors; with a single
# argument the rows of `matrix` are compared against each other.
cosSim = cosine_similarity(matrix)


In [28]:
def recommender(title):
    """Return the 20 articles whose text is most similar to the article ``title``.

    ``title`` is matched exactly against ``df['title']``; every other row is
    then ranked by its score in the precomputed ``cosSim`` matrix.

    Parameters
    ----------
    title : str
        Exact title of an article in ``df``. If several rows share the title,
        the first of them is used as the query article.

    Returns
    -------
    list
        ``[titles, links]`` -- two pandas Series with the ``title`` and
        ``link`` of up to 20 recommendations, best match first. Both Series
        are empty when ``title`` does not occur in ``df``.
    """
    # Work with row positions throughout: cosSim is addressed by position, and
    # positions stay valid even if df does not carry a default 0..n-1 index.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        # Unknown title (this includes the widget's initial empty string).
        nothing = df.iloc[0:0]
        return [nothing['title'], nothing['link']]
    idx = matches[0]

    scores = np.asarray(cosSim[idx])
    # Stable sort on the negated scores: highest first, ties kept in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query article by position rather than assuming it ranks first;
    # duplicated or empty articles can tie with it or outrank it.
    ranked = ranked[ranked != idx][:20]

    return [df['title'].iloc[ranked], df['link'].iloc[ranked]]

In [29]:
# Second, independent copy of the same CSV, used for the keyword-based recommender.
df2 = pd.read_csv(filepath_or_buffer='newsData.csv')

In [30]:
# Delete every single-quote character found in the frame's string values.
df2 = df2.replace(to_replace={'\'': ''}, regex=True)

In [31]:
# Delete every closing square bracket found in the frame's string values.
# The pattern is a raw string: '\]' in a normal string literal is an invalid
# escape sequence and triggers a warning; the regex itself is unchanged.
df2 = df2.replace({r'\]': ''}, regex=True)

In [32]:
# Delete every opening square bracket found in the frame's string values.
# The pattern is a raw string: '\[' in a normal string literal is an invalid
# escape sequence and triggers a warning; the regex itself is unchanged.
df2 = df2.replace({r'\[': ''}, regex=True)

In [33]:
# Keep only the four columns the keyword recommender works with, in this order.
df2 = df2.loc[:, ['title', 'text', 'keywords', 'link']]

In [34]:
# Break each keyword string into a list of pieces at every comma.
# NOTE(review): `keywords` is reassigned by the following cell before anything reads this value.
keywords = df2['keywords'].str.split(pat=',')

In [35]:
# Missing keyword cells become empty strings and every value is cast to str,
# so each row is plain text when it reaches the vectorizer.
keywords = (
    df2['keywords']
    .fillna("")
    .astype('str')
)

In [36]:
# Word-level TF-IDF over 1- and 2-grams of the keyword strings, ignoring
# English stop words.
# min_df=1 keeps every term that occurs in at least one row -- the same
# vocabulary min_df=0 selected -- while staying inside the range that newer
# scikit-learn releases accept for an integer min_df (>= 1).
tf2 = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english',
)

In [37]:
# Learn the vocabulary from the keyword strings and vectorize every row in one pass.
matrix2 = tf2.fit_transform(raw_documents=keywords)

In [38]:
# Pairwise cosine similarity between all keyword vectors; with a single
# argument the rows of `matrix2` are compared against each other.
cosSim2 = cosine_similarity(matrix2)

In [39]:
def recommender2(title):
    """Return the 20 articles whose keyword tags are most similar to ``title``'s.

    ``title`` is matched exactly against ``df['title']``; every other row is
    then ranked by its score in the precomputed ``cosSim2`` matrix. Titles and
    links are read from ``df`` while ``cosSim2`` is derived from ``df2``, so
    this relies on both frames listing the articles in the same row order
    (both are read from the same CSV file).

    Parameters
    ----------
    title : str
        Exact title of an article in ``df``. If several rows share the title,
        the first of them is used as the query article.

    Returns
    -------
    list
        ``[titles, links]`` -- two pandas Series with the ``title`` and
        ``link`` of up to 20 recommendations, best match first. Both Series
        are empty when ``title`` does not occur in ``df``.
    """
    # Work with row positions throughout: cosSim2 is addressed by position, and
    # positions stay valid even if df does not carry a default 0..n-1 index.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        # Unknown title (this includes the widget's initial empty string).
        nothing = df.iloc[0:0]
        return [nothing['title'], nothing['link']]
    idx = matches[0]

    scores = np.asarray(cosSim2[idx])
    # Stable sort on the negated scores: highest first, ties kept in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query article by position rather than assuming it ranks first;
    # rows with identical or empty keyword lists can tie with it or outrank it.
    ranked = ranked[ranked != idx][:20]

    return [df['title'].iloc[ranked], df['link'].iloc[ranked]]

 # News Recommender System
 ## | Get recommendations based on article text |
 ### Enter the exact title of an article to view its top 20 recommendations

In [53]:
# Button-triggered variant of interact: the recommender runs only when
# "find recommendations" is clicked, not each time the text box changes.
# NOTE: this rebinds the `interact` name imported from ipywidgets, so later
# cells that call `interact(...)` get this manual variant.
interact = interact_manual.options(manual_name="find recommendations")
# interact(...) renders the widget itself and returns `recommender` with the
# widget attached as `.widget`; an extra display() call would only print the
# function's repr, so none is made.
im = interact(recommender, title='')
im.widget.children[0].description = 'enter title'

interactive(children=(Text(value='', description='title'), Button(description='find recommendations', style=Bu…

<function __main__.recommender(title)>

 ## | Get recommendations based on keyword tags |
 ### Enter the exact title of an article to view its top 20 recommendations

In [54]:
# Button-triggered variant of interact for the keyword-based recommender.
interact2 = interact_manual.options(manual_name="find recommendations")
# Call interact2 (the factory created on the line above) rather than the
# notebook-wide `interact` name, so this cell does not depend on an earlier
# cell having rebound `interact` to a manual variant.
# The call renders the widget and returns `recommender2` with the widget
# attached as `.widget`; no separate display() is needed.
im2 = interact2(recommender2, title='')
im2.widget.children[0].description = 'enter title'

interactive(children=(Text(value='', description='title'), Button(description='find recommendations', style=Bu…

<function __main__.recommender2(title)>