## AI Business News

In [1]:
import requests
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen as uReq
import urllib.request as urllib2
import pandas as pd
from datetime import datetime
from dateutil.parser import parse
import nltk as nltk
import re
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# HTTP request headers sent along with the archive-page requests.
hdr = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.11 (KHTML, like Gecko) '
        'Chrome/23.0.1271.64 Safari/537.11'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

In [3]:
def extract_AIB_html_summary(AIB_html):
    """Return every ``<h5>`` element of a parsed archive page.

    Args:
        AIB_html: Parsed page exposing ``find_all`` (e.g. a BeautifulSoup
            object).

    Returns:
        Whatever ``AIB_html.find_all("h5")`` returns, or an empty list when
        ``AIB_html`` cannot be searched (for example when it is ``None``).
    """
    try:
        return AIB_html.find_all("h5")
    except (AttributeError, TypeError):
        # An empty list keeps a caller's ``for`` loop a no-op.  The previous
        # fallback was a message string, which a loop would walk through
        # character by character as if each character were an element.
        return []

In [4]:
def extract_AIB_html_time(AIB_html):
    """Return every ``<time>`` element of a parsed archive page.

    Args:
        AIB_html: Parsed page exposing ``find_all`` (e.g. a BeautifulSoup
            object).

    Returns:
        Whatever ``AIB_html.find_all("time")`` returns, or an empty list when
        ``AIB_html`` cannot be searched (for example when it is ``None``).
    """
    try:
        return AIB_html.find_all("time")
    except (AttributeError, TypeError):
        # An empty list keeps a caller's ``for`` loop a no-op.  The previous
        # fallback was a message string, which a loop would walk through
        # character by character as if each character were an element.
        return []

In [5]:
def get_NewsHeadlines(summary_content):
    """Return the text of the first link inside a summary element.

    Args:
        summary_content: Element whose ``.a`` attribute is the headline link.

    Returns:
        ``summary_content.a.text``, or ``"No News Headlines"`` when the
        element or its link is missing.
    """
    try:
        return summary_content.a.text
    except AttributeError:
        # Raised when ``summary_content`` has no ``.a`` or ``.a`` is None.
        return "No News Headlines"

In [6]:
def get_NewsLinks(summary_content):
    """Return the absolute URL of the first link inside a summary element.

    Args:
        summary_content: Element whose ``.a`` attribute supports
            ``['href']`` lookup.

    Returns:
        The ``href`` resolved against ``https://aibusiness.com/``, or
        ``"No News Link"`` when the element, its link or the ``href``
        attribute is missing.
    """
    from urllib.parse import urljoin

    try:
        href = summary_content.a['href']
        # urljoin avoids the "//" that plain concatenation produced for
        # root-relative hrefs and leaves already-absolute hrefs untouched.
        return urljoin("https://aibusiness.com/", href)
    except (AttributeError, KeyError, TypeError):
        return "No News Link"

In [7]:
def get_PublishedDate(time):
    """Return the publication date of a ``<time>`` element as text.

    Args:
        time: Element whose ``.text`` holds a date string.

    Returns:
        The date parsed with ``dateutil`` and re-formatted with
        ``"%d %b %Y "``, or ``"No dates published"`` when the element has no
        text or the text cannot be parsed.
    """
    try:
        raw_date = time.text
        # NOTE(review): the format string ends with a space, so every date
        # carries a trailing blank; kept so existing output stays unchanged.
        return parse(raw_date).strftime("%d %b %Y ")
    except (AttributeError, TypeError, ValueError, OverflowError):
        return "No dates published"

In [8]:
def get_NewsDescriptions(news_content):
    """Return the article body assembled from the page's ``<p>`` elements.

    Args:
        news_content: Parsed article page exposing ``find_all``.

    Returns:
        The text of every ``<p>`` element after the first two, concatenated
        without a separator, or ``"No News Description"`` when the page
        cannot be searched or a paragraph cannot be read.
    """
    try:
        # NOTE(review): the first two paragraphs are skipped, presumably
        # because they are not part of the article body; confirm against
        # the site.
        paragraphs = news_content.find_all('p')[2:]
        return "".join(paragraph.get_text() for paragraph in paragraphs)
    except (AttributeError, TypeError):
        return "No News Description"

In [9]:
## Remove Punctuation
def remove_punctuation(text):
    """Replace every run of non-alphanumeric characters with one space.

    Only ASCII letters and digits are kept; any other run of characters
    (punctuation, whitespace, non-ASCII letters) collapses to a single
    space.

    Args:
        text: String to clean.

    Returns:
        The cleaned string, or ``"Text punctuation removal not working"``
        when ``text`` is not a string.
    """
    try:
        return re.sub(r'[^A-Za-z0-9]+', ' ', text)
    except TypeError:
        # re.sub raises TypeError for non-string input such as None or NaN.
        return "Text punctuation removal not working"

In [10]:
## Convert to lower case
def lowercase_token(tokens):
    """Return a new list holding the lower-cased form of every token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

In [11]:
## Remove stopwords
def removestopwords(tokens, stoplist):
    """Return the tokens that do not appear in the stop list.

    Args:
        tokens: Iterable of hashable tokens (strings in this notebook).
        stoplist: Iterable of words to drop; it may be any iterable,
            including a one-shot iterator.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    # Build the lookup set once so each membership test is O(1) instead of
    # scanning the whole stop list for every token.
    blocked = frozenset(stoplist)
    return [word for word in tokens if word not in blocked]

In [12]:
## Lemmatization of words
def process_lemmatize_words(filtered_words):
    """Return each word lemmatized with the lemmatizer's default part of speech."""
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = []
    for word in filtered_words:
        lemmas.append(lemmatize(word))
    return lemmas



In [13]:
## Lemmatization of words as verbs
def process_lemmatize_verbs(lemmatize_words):
    """Return each word lemmatized as a verb (``pos='v'``)."""
    wordnet = WordNetLemmatizer()
    verb_lemmas = []
    for word in lemmatize_words:
        verb_lemmas.append(wordnet.lemmatize(word, pos='v'))
    return verb_lemmas

In [14]:
def save_News_to_excel(AIB_NEWS_DataFrame,FileName):
    """Write the news table to ``<FileName>.xlsx`` and report the outcome.

    Args:
        AIB_NEWS_DataFrame: Object exposing ``to_excel`` (a pandas
            DataFrame in this notebook).
        FileName: Output path without the ``.xlsx`` extension.

    Returns:
        None. A success or failure message is printed; failures are not
        re-raised.
    """
    import os

    try:
        # Excel limits sheet names to 31 characters and forbids path
        # separators, so derive the sheet name from the file's base name
        # instead of passing the whole path through.
        sheet_name = os.path.basename(FileName)[:31] or "Sheet1"
        AIB_NEWS_DataFrame.to_excel(excel_writer=FileName + ".xlsx", sheet_name=sheet_name)
    except Exception as err:
        # Best-effort boundary: report the reason instead of hiding it.
        print("Error occured in saving the file.")
        print("Reason: {}".format(err))
    else:
        print("Files Saved Successfully")

In [15]:
def _fetch_archive_page(AIB_url, timeout):
    """Download one archive listing page and return it parsed, or None on failure."""
    try:
        req = urllib2.Request(AIB_url, headers=hdr)
        # The ``with`` block closes the connection; the previous code only
        # referenced ``uClient.close`` without calling it.
        with uReq(req, timeout=timeout) as uClient:
            news_page = uClient.read()
        return bs(news_page, "html.parser")
    except Exception as err:
        # Best-effort scrape: any network or parsing failure skips the page.
        print("Webpage Not Responding. Try Including Header")
        print("Reason: {}".format(err))
        return None


def _fetch_news_description(news_link, timeout):
    """Download one article and return its description text."""
    try:
        newsRes = requests.get(news_link, timeout=timeout)
        newsRes.encoding = 'utf-8'
        news_content = bs(newsRes.text, "html.parser")
    except Exception:
        # Do not fall through with a stale or undefined ``news_content``;
        # record the same placeholder get_NewsDescriptions uses on failure.
        print("No content for news")
        return "No News Description"
    return get_NewsDescriptions(news_content)


def _scrape_archive_page(AIB_html, timeout):
    """Build the per-page DataFrame of links, headlines, dates and descriptions."""
    summaries = extract_AIB_html_summary(AIB_html)
    time_tags = extract_AIB_html_time(AIB_html)

    News_Headlines = [get_NewsHeadlines(summary_content) for summary_content in summaries]
    News_Links = [get_NewsLinks(summary_content) for summary_content in summaries]

    # The last <time> entry is dropped, as before.
    # NOTE(review): presumably the page carries one trailing <time> that
    # belongs to no headline; confirm against the site.
    Dates_Published = [get_PublishedDate(time_tag) for time_tag in time_tags][:-1]
    # Every column must have the same length or pd.DataFrame raises, so
    # pad or trim the dates to match the number of headlines.
    missing = len(News_Headlines) - len(Dates_Published)
    if missing > 0:
        Dates_Published.extend(["No dates published"] * missing)
    else:
        Dates_Published = Dates_Published[:len(News_Headlines)]

    News_Descriptions = [_fetch_news_description(news_link, timeout) for news_link in News_Links]

    mydict = {"News Link": News_Links, "News Headlines": News_Headlines, "Date Published" : Dates_Published, "News Description":News_Descriptions, "Domain": "Miscellaneous", "Company": "Miscellaneous"}
    return pd.DataFrame(mydict)


def AIB_News(FileName, pages=(1, 2), timeout=30):
    """Scrape AI Business archive pages, clean the article text and save it.

    For each archive page the headlines, links, dates and article bodies are
    collected into a DataFrame.  The article text is then stripped of
    punctuation, tokenized, lower-cased, filtered for English stop words and
    lemmatized, and the result is written with ``save_News_to_excel``.

    Args:
        FileName: Output path without extension, passed to
            ``save_News_to_excel``.
        pages: Iterable of archive page numbers to fetch. The default
            matches the pages the function always fetched (1 and 2).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Progress and failures are reported with ``print``; a page or
        article that cannot be fetched is skipped or given a placeholder
        description instead of aborting the run.
    """
    page_frames = []
    for page_no in pages:
        AIB_url = "https://aibusiness.com/archives.asp?section_id=782&piddl_archivepage={}".format(page_no)
        AIB_html = _fetch_archive_page(AIB_url, timeout)
        if AIB_html is None:
            # Skip the page instead of re-processing the previous page's HTML.
            continue
        page_frames.append(_scrape_archive_page(AIB_html, timeout))

    if page_frames:
        # ignore_index gives every row a unique label across pages.
        AIB_NEWS_DataFrame = pd.concat(page_frames, ignore_index=True)
    else:
        AIB_NEWS_DataFrame = pd.DataFrame()

    try:
        AIB_NEWS_DataFrame = AIB_NEWS_DataFrame.rename(columns = {'News Description':'News_Description'})
        AIB_NEWS_DataFrame['News_Description_Cleaned'] = AIB_NEWS_DataFrame['News_Description'].apply(remove_punctuation)
        ## Tokenize sentences
        tokens = [word_tokenize(sen) for sen in AIB_NEWS_DataFrame.News_Description_Cleaned]
        lower_tokens = [lowercase_token(token) for token in tokens]
        stoplist = stopwords.words('english')
        filtered_words = [removestopwords(sen, stoplist) for sen in lower_tokens]
        lemmatize_words = [process_lemmatize_words(sen) for sen in filtered_words]
        lemmatize_verbs = [process_lemmatize_verbs(sen) for sen in lemmatize_words]
        AIB_NEWS_DataFrame['News_Description_Cleaned'] = [' '.join(sen) for sen in lemmatize_verbs]
        AIB_NEWS_DataFrame['tokens'] = lemmatize_verbs
    except Exception as err:
        # Best-effort: still save whatever was scraped, but say why the
        # cleaning step failed (e.g. missing NLTK data or no rows scraped).
        print("There is some problem with text pre-processing")
        print("Reason: {!r}".format(err))

    save_News_to_excel(AIB_NEWS_DataFrame,FileName)
        

In [16]:
# Run the scraper; the result is written to AIB_NEWS.xlsx.
AIB_News('AIB_NEWS')

Files Saved Successfully
