In [None]:
# Description:  NLP project - develop a program interface that provides a 'self-learning' chatbot.

In [None]:
# Title:  Webpage Specific AI ChatBot

In [None]:
# Modules and libraries required: 
# ['Docx', 'Google Translator', 'NLTK', 'Newspaper', 'NumPy', 'Pyttsx3', 'Random', 'Sklearn', 'Speech Recognition', 'String', 'Winsound']
# NLTK data packages used:
# ['Punkt', 'Stopwords', 'Wordnet']

In [None]:
# import libraries
import io
import sys
import nltk
import random
import string 
import pyttsx3
import warnings
import winsound
import contextlib
import numpy as np
from docx import Document
from heapq import nlargest
from newspaper import Article
from docx.shared import Inches
from string import punctuation
import speech_recognition as sr
from nltk.corpus import stopwords
from googletrans import Translator
from collections import defaultdict
from nltk.tokenize import sent_tokenize,word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Silence every Python warning for the rest of the session so that
# library warnings do not clutter the notebook output.
warnings.filterwarnings(action='ignore')

In [None]:
# Downloading the NLTK data packages this notebook reads at runtime
nltk.download('punkt', quiet=True)       # tokenizer models used by sent_tokenize / word_tokenize
nltk.download('punkt_tab', quiet=True)   # tokenizer tables that newer NLTK releases load instead of 'punkt'
nltk.download('wordnet', quiet=True)     # WordNet data
nltk.download('stopwords', quiet=True)   # stop-word lists read by stopwords.words("english") in FrequencySummarizer

In [None]:
# Ask the user which webpage/article the bot should answer questions about.
# input() already returns a str, so the value is stored as-is.
inp_url = input("Enter the webpage/article URL: ")

In [None]:
# Text extraction from webpage
article = Article(inp_url)
article.download()        # Download the article
article.parse()           # Parse the article
article.nlp()             # Apply Natural Language Processing (NLP)
corpus = article.text     # Store the article text into corpus

# Fail fast when nothing could be extracted: every later cell (sentence
# tokenization, TF-IDF matching, summarization) needs a non-empty corpus and
# would otherwise stop with a much less helpful error.
if not corpus.strip():
    raise ValueError("No text could be extracted from the given URL: " + inp_url)

In [None]:
# Sample of the extracted text: print only the first 1000 characters so the
# cell output stays readable. The complete text remains available in `corpus`.
print(corpus[:1000] + ("..." if len(corpus) > 1000 else ""))

In [None]:
# Tokenization: split the article text into a list of sentences.
# `sent_tokens` is the sentence corpus that response() searches.
text = corpus
sent_tokens = sent_tokenize(text)

In [None]:
# Translation table for str.translate(): every punctuation character's code
# point maps to None, which tells translate() to delete that character.
remove_punct_dict = dict.fromkeys(map(ord, string.punctuation))

In [None]:
# Create a function to return a list of lemmatized lower case words after removing punctuations
def LemNormalize(text):
    """Tokenize `text` into lower-case, punctuation-free, lemmatized words.

    The text is lower-cased, stripped of punctuation via `remove_punct_dict`,
    split into word tokens, and each token is then reduced to its WordNet
    lemma so that inflected forms of a word share one TF-IDF feature.

    Parameters
    ----------
    text : str
        Sentence (or user query) to normalize.

    Returns
    -------
    list of str
        The normalized tokens, in their original order.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    cleaned = text.lower().translate(remove_punct_dict)
    return [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(cleaned)]

In [None]:
# Summarization of webpage text content
class FrequencySummarizer:
    """Extractive summarizer that ranks sentences by normalized word frequency.

    Each non-stop-word is scored by its count divided by the count of the most
    frequent word. Words whose score falls outside the open interval
    (min_cut, max_cut) are discarded, and a sentence's rank is the sum of the
    scores of the remaining words it contains.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """Store the frequency cut-offs and build the stop-word set.

        Parameters
        ----------
        min_cut : float
            Words with a normalized frequency <= min_cut are ignored.
        max_cut : float
            Words with a normalized frequency >= max_cut are ignored.
        """
        self.min_cut = min_cut
        self.max_cut = max_cut
        # English stop words plus single punctuation characters never count as content words
        self.stopwords = set(stopwords.words("english") + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """Return {word: normalized frequency} for the content words of `word_sent`.

        Parameters
        ----------
        word_sent : list of list of str
            Tokenized sentences.

        Returns
        -------
        dict
            Words whose normalized frequency lies strictly between `min_cut`
            and `max_cut`, mapped to that frequency. Empty when there are no
            content words at all.
        """
        counts = defaultdict(int)
        for sentence in word_sent:
            for word in sentence:
                if word not in self.stopwords:
                    counts[word] += 1

        if not counts:
            # Nothing to normalize; also avoids max() on an empty sequence
            return {}

        highest = float(max(counts.values()))
        # Build a new mapping rather than deleting from the dict being iterated,
        # and return only after every word has been normalized and filtered.
        normalized = {}
        for word, count in counts.items():
            share = count / highest
            if self.min_cut < share < self.max_cut:
                normalized[word] = share
        return normalized

    def summarize(self, text, n):
        """Return up to `n` of the highest-ranked sentences of `text`.

        Parameters
        ----------
        text : str
            Text to summarize.
        n : int
            Number of sentences wanted. Clamped to the number of sentences
            found, so short texts yield a shorter summary instead of failing.

        Returns
        -------
        list of str
            The selected sentences, best-ranked first. Sentences containing no
            retained word are never selected, so fewer than `n` may be returned.
        """
        sents = sent_tokenize(text)
        n = min(n, len(sents))
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i, sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """Return the `n` keys of `ranking` with the largest values, best first."""
        return nlargest(n, ranking, key=ranking.get)

In [None]:
# Greeting vocabulary for the bot.
# Words that, when found in the user's utterance, count as a greeting:
GREETING_INPUTS = [
    "hi",
    "hello",
    "hola",
    "greetings",
    "hey",
    "hai",
]
# Replies the bot picks from when it has been greeted:
GREETING_RESPONSES = [
    "hello",
    "hi",
    "hey",
    "what's good",
    "great to see you here",
    "hey there",
]

In [None]:
# Parameters of the audio cue the WeBot plays through winsound.Beep()
duration = 250        # beep length in milliseconds (a quarter of a second)
frequency = 2500      # beep pitch in hertz

In [None]:
# Text-to-speech engine that speaks the WeBot's replies
engine = pyttsx3.init()

# Use the first voice the engine reports
voices = engine.getProperty('voices')
engine.setProperty('voice', voices[0].id)

# Speaking rate and output volume
engine.setProperty('rate', 150)
engine.setProperty('volume', 0.8)

In [None]:
# Reply to a greeting with a randomly chosen greeting
def greeting(sentence):
    """Return a random greeting if `sentence` contains a greeting word.

    The sentence is split on whitespace and each word is compared,
    case-insensitively, against GREETING_INPUTS.

    Returns
    -------
    str or None
        A random entry of GREETING_RESPONSES, or None when no word of the
        sentence is a known greeting.
    """
    lowered_words = (token.lower() for token in sentence.split())
    if any(token in GREETING_INPUTS for token in lowered_words):
        return random.choice(GREETING_RESPONSES)
    return None

In [None]:
# Generating responses
def response(user_response):
    """Return the article sentence most similar to `user_response`.

    The query is temporarily appended to the global `sent_tokens` list, all
    sentences are vectorized with TF-IDF (tokenized by `LemNormalize`, English
    stop words removed) and the query's cosine similarity to every sentence is
    computed. The query always matches itself best, so the second-best match is
    the answer.

    Parameters
    ----------
    user_response : str
        The user's query.

    Returns
    -------
    str
        The best-matching sentence, or an apology when the corpus is empty or
        no sentence shares any term with the query.
    """
    not_understood = "I apologize, I didn't understand."
    sent_tokens.append(user_response)  # Append the users response to the list of sentence tokens
    try:
        if len(sent_tokens) < 2:
            # No article sentences to compare against
            return not_understood
        TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
        tfidf = TfidfVec.fit_transform(sent_tokens)
        vals = cosine_similarity(tfidf[-1], tfidf)
        idx = vals.argsort()[0][-2]   # index of the second-highest similarity
        score = vals[0][idx]
        if score == 0:
            return not_understood
        return sent_tokens[idx]
    finally:
        # Always take the query back out, even if vectorization raised.
        # pop() removes exactly the appended entry, whereas remove() would
        # delete the first equal sentence and could reorder the corpus.
        sent_tokens.pop()

In [None]:
# Suppress print output
@contextlib.contextmanager
def suppress_output():
    """Context manager that discards everything written to stdout.

    While the `with` body runs, sys.stdout points at a throw-away in-memory
    buffer. The previous sys.stdout is put back on exit, including when the
    body raises. stderr is not affected.
    """
    with contextlib.redirect_stdout(io.StringIO()):
        yield

In [None]:
# Creating chat environment for real-time interactions
# Each turn: beep, record from the microphone, beep, transcribe the audio,
# then reply by text and by voice. Everything said is appended to `chat_text`.
# Saying "exit", "thanks" or "thank you" ends the conversation.
flag = True
r = sr.Recognizer()
chat_text = ''
winsound.Beep(frequency, duration)
intro_msg = "I am Website Specific AI Chatbot or WeBot for short. I will answer your queries about this website. If you want to exit, say Exit!"
chat_text = chat_text + "Bot:" + intro_msg + "\n"
print("WeBot: ", intro_msg)
engine.say(intro_msg)
engine.runAndWait()
while flag:
    print("-------------------------------------------------- Speak --------------------------------------------------")
    winsound.Beep(frequency, duration)
    with sr.Microphone() as source:
        audio_text = r.listen(source)
    winsound.Beep(frequency, duration)
    try:
        with suppress_output():
            user_response = r.recognize_google(audio_text)
        recognized = True
    except Exception:
        # Any recognition failure is treated as "not understood". Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # still stop the loop.
        user_response = "--- voice not recognized ---"
        recognized = False

    user_response = user_response.lower()
    print("You: ", user_response)
    chat_text = chat_text + "You: " + user_response + "\n"
    if user_response == 'exit':
        flag = False
        print("WeBot: Chat with you later!")
        chat_text = chat_text + "Bot: " + "Chat with you later!" + "\n"
        engine.say("Chat with you later")
        engine.runAndWait()
    elif user_response == 'thanks' or user_response == 'thank you':
        flag = False
        print("WeBot: Always at your service!")
        chat_text = chat_text + "Bot: " + "Always at your service!" + "\n"
        engine.say("Always at your service!")
        engine.runAndWait()
    else:
        if not recognized:
            # Do not search the article for the placeholder text; its words
            # ("voice", "recognized") could match an unrelated sentence.
            reply = "I apologize, I didn't understand."
        else:
            # greeting() picks its reply at random, so call it exactly once
            reply = greeting(user_response)
            if reply is None:
                reply = response(user_response)
        print("WeBot: " + reply)
        chat_text = chat_text + "Bot: " + reply + "\n"
        engine.say(reply)
        engine.runAndWait()

In [None]:
# Generating the summarized text
# Newlines become spaces (not empty strings) so the last word of one paragraph
# is not glued to the first word of the next, which would hide sentence
# boundaries from the tokenizer.
text = text.replace("\n", " ")
line_count = 0
summarized_content = "\n\nWebpage Content Summary:"
fs = FrequencySummarizer()
# Ask for at most two sentences, and never more than the text contains
summary_size = min(2, len(sent_tokenize(text)))
if summary_size > 0:
    for ranked_sentence in fs.summarize(text, summary_size):
        line_count += 1
        summarized_content = summarized_content + "\n\n --> " + ranked_sentence
chat_text = chat_text + summarized_content

In [None]:
# Saving chat to the users local document with added feature of the Indian languages along with world languages
# The transcript (chat plus summary) is translated into every language below
# and written to WeBot-Output.docx, one section per language.
document = Document()
translator = Translator()
languages = {'en':'English', 'hi': 'Hindi', 'te':'Telugu', 'ta':'Tamil', 'kn':'Kannada', 'mr':'Marathi', 'ml':'Malayalam', 'fr':'French', 'de':'German'}
document.add_heading('WeBot Chat - Translation', 0)
failed_languages = []
for language in languages:
    document.add_heading(languages[language], level=1)
    try:
        translated_text = translator.translate(chat_text, dest=language).text
    except Exception as err:
        # A failed translation must not prevent the document from being saved:
        # the spoken conversation cannot be reproduced.
        failed_languages.append(languages[language])
        translated_text = "[Translation unavailable: " + str(err) + "]"
    document.add_paragraph(translated_text)

if failed_languages:
    # Keep the untranslated transcript so nothing is lost
    document.add_heading('Original transcript', level=1)
    document.add_paragraph(chat_text)

document.add_page_break()
document.save('WeBot-Output.docx')