<a href="https://colab.research.google.com/github/vallirajasekar/Cricket_Chatbot/blob/main/Cricket_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional


In [2]:
urls = [
    "https://en.wikipedia.org/wiki/Indian_Premier_League","https://www.iplt20.com/","https://www.cricbuzz.com/","https://www.google.com/search?q=cricinfo&oq=&aqs=chrome.2.69i57j69i64j69i59j0i67i650j0i20i131i263i433i512j0i20i263i433i512j0i131i433i512j0i512.5015j0j15&sourceid=chrome&ie=UTF-8","https://www.espncricinfo.com/cricket-news"
]


In [3]:
def fetch_data(urls):
    raw_docs = []
    for url in urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        text = soup.get_text(separator=" ")
        raw_docs.append(text.lower())
    return raw_docs


In [4]:
raw_docs = fetch_data(urls)


In [5]:
import nltk

nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
nltk.download('wordnet')

# Create a WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

sent_tokens = []
word_tokens = []

for doc in raw_docs:
    sentences = sent_tokenize(doc)
    sent_tokens.extend(sentences)
    words = [lemmatizer.lemmatize(word) for word in word_tokenize(doc)]
    word_tokens.extend(words)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Prepare the training data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sent_tokens)
total_words = len(tokenizer.word_index) + 1


In [8]:
input_sequences = []
for sentence in sent_tokens:
    seq = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(seq)):
        n_gram_sequence = seq[:i+1]
        input_sequences.append(n_gram_sequence)


In [9]:
max_sequence_len = max([len(seq) for seq in input_sequences])
padded_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')


In [10]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# ...

x_train = padded_sequences[:, :-1]
y_train = padded_sequences[:, -1]
y_train = to_categorical(y_train, num_classes=total_words)

# ...


In [11]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))


In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=100, verbose=1)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 75/100
Epoch 76/100
Epoch 76

In [None]:
import random

# Function to generate a response from the model
def generate_response(model, tokenizer, max_sequence_len, input_text):
    input_text = input_text.lower()
    tokenized_text = tokenizer.texts_to_sequences([input_text])[0]
    tokenized_text = pad_sequences([tokenized_text], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict_classes(tokenized_text, verbose=0)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    return output_word


In [None]:
# Function to generate a response from the model
def generate_response(model, tokenizer, max_sequence_len, input_text):
    input_text = input_text.lower()
    tokenized_text = tokenizer.texts_to_sequences([input_text])[0]
    tokenized_text = pad_sequences([tokenized_text], maxlen=max_sequence_len-1, padding='pre')
    predicted_probs = model.predict(tokenized_text)[0]
    predicted_index = tf.argmax(predicted_probs).numpy()
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted_index:
            output_word = word
            break
    return output_word


In [None]:
def chatbot_demo(model, tokenizer, max_sequence_len):
    print("Chatbot Demo")
    print("Enter 'exit' to end the conversation.")
    while True:
        user_input = input("User: ")
        if user_input.lower() == 'exit':
            print("Chatbot: Goodbye!")
            break
        response = generate_response(model, tokenizer, max_sequence_len, user_input)
        print("Chatbot:", response)


In [1]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the pre-trained BERT model and tokenizer
model_name = 'model'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)


In [2]:
def generate_response(model, tokenizer, input_text):
    input_text = input_text.lower()
    encoded_input = tokenizer.encode_plus(input_text, add_special_tokens=True, padding='max_length', max_length=128, truncation=True, return_tensors='pt')
    input_ids = encoded_input['input_ids']
    attention_mask = encoded_input['attention_mask']
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted_labels = torch.max(outputs.logits, dim=1)
    
    predicted_index = predicted_labels.item()
    output_word = tokenizer.convert_ids_to_tokens(predicted_index)
    
    return output_word


In [3]:
def chatbot_demo(model, tokenizer):
    print("Chatbot Demo")
    print("Enter 'exit' to end the conversation.")
    while True:
        user_input = input("User: ")
        if user_input.lower() == 'exit':
            print("Chatbot: Goodbye!")
            break
        elif user_input.lower() in ['hi', 'hello', 'hey']:
            print("Chatbot: Hi there!")
        elif '?' in user_input:
            print("Chatbot: I'm sorry, I don't have the answer to that question.")
        else:
            response = generate_response(model, tokenizer, user_input)
            print("Chatbot:", response)


In [5]:
chatbot_demo(model, tokenizer)


NameError: ignored

In [17]:
chatbot_demo(model, tokenizer, max_sequence_len)


Chatbot Demo
Enter 'exit' to end the conversation.
User: KKR
Chatbot: final
User: Hi
Chatbot: 134
User: CSK
Chatbot: v
User: Chennai
Chatbot: super
User: Kings
Chatbot: out
User: cricinfo
Chatbot: google
User: dhoni
Chatbot: back
User: rohit
Chatbot: needed
User: virat
Chatbot: today
User: ipl 2022
Chatbot: records
User: records
Chatbot: the


KeyboardInterrupt: ignored