### Telegram ChatBot with ML

In [None]:
# The code below uses the python-telegram-bot 13.x API (Updater(token), dispatcher, Filters);
# an unbounded --upgrade would pull in 20+, where that API no longer exists.
pip install "python-telegram-bot>=13,<14" --upgrade

In [223]:
import json
import logging
import random

import nltk
import numpy as np
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, LogisticRegressionCV, Perceptron, RidgeClassifier, RidgeClassifierCV, SGDClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from telegram import Update, ForceReply
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackContext

In [5]:
# Expected structure of BOT_CONFIG.json (inline example kept for reference):
# BOT_CONFIG = {
#     'intents': {
#         'hello': {
#             'examples': ['Хай', 'Конничива', 'Васап'],
#             'responses': ['Привет', 'Добрый вечер']
#         },
#         'bye': {
#             'examples': ['Пока', 'Увидимся!', 'Покеда'],
#             'responses': ['До свидания', 'Прощайте', 'Сайонара!']
#         }
#     }
# }

# Decode explicitly as UTF-8: the config is expected to hold Cyrillic text, and
# open() otherwise falls back to a platform-dependent default encoding.
with open('BOT_CONFIG.json', 'r', encoding='utf-8') as f:
    BOT_CONFIG = json.load(f)

In [3]:
def clean(text):
    """Normalise *text* for matching.

    The text is lower-cased and every character that is not a lower-case
    Russian letter or a space is dropped.

    Returns the filtered string (possibly empty).
    """
    allowed = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя '
    return ''.join(symbol for symbol in text.lower() if symbol in allowed)

def get_intent(text):
    """Find an intent by fuzzy-matching *text* against the configured examples.

    Both the input and every example are normalised with ``clean``. An example
    matches when the edit distance, divided by the length of the longer of the
    two strings, is below 0.4.

    Returns the name of the first matching intent, or the string
    'intent not found' when nothing matches.
    """
    text = clean(text)
    if not text:
        # Nothing usable is left after cleaning. Against an empty example the
        # ratio below would be 0 / 0, and against any other example it is 1.0,
        # so no intent can match.
        return 'intent not found'

    for intent, intent_data in BOT_CONFIG['intents'].items():
        # Intents without an 'examples' list are skipped instead of relying on
        # a blanket except to hide the KeyError.
        for example in intent_data.get('examples') or ():
            example = clean(example)
            distance = nltk.edit_distance(example, text)
            if distance / max(len(text), len(example)) < 0.4:
                return intent

    return 'intent not found'

In [6]:
# Flatten the config into parallel lists: X holds the example phrases and
# y the intent name each phrase belongs to.
X = []
y = []

for intent, intent_data in BOT_CONFIG['intents'].items():
    # Intents without examples contribute no training data; skip them
    # explicitly rather than swallowing every exception.
    for example in intent_data.get('examples') or []:
        X.append(example)
        y.append(intent)

len(X), len(y), len(set(y))

(841, 841, 243)

In [162]:
# Hold out 20% of the examples for evaluation; random_state pins the split
# so repeated runs give the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

(672, 169, 672, 169)

In [235]:
# TF-IDF over character 2- and 3-grams taken inside word boundaries; the
# examples are normalised with clean() before the n-grams are extracted.
vectorizer = TfidfVectorizer(preprocessor=clean, analyzer='char_wb', ngram_range=(2,3))
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)
# Number of learned features. vocabulary_ is used instead of
# get_feature_names(), which newer scikit-learn releases no longer provide.
len(vectorizer.vocabulary_)

# TODO: experiment with other vectorizers

2274

In [193]:
# Intent classifier trained on the vectorised training examples.
model = LogisticRegression(C=1, class_weight='balanced')
#model = SVC(C=1000, random_state=241, kernel='linear')

# The larger C is, the harder the ML model memorises the training answers
model.fit(X_train_vect, y_train)

# Mean accuracy on the training split and on the held-out split.
print(f'train {model.score(X_train_vect, y_train)}')
print(f'test {model.score(X_test_vect, y_test)}')

# TODO: experiment with other models

train 0.7291666666666666
test 0.27218934911242604


In [228]:
# just playing with grid search
# Searches C over the powers of ten 10**-5 ... 10**5 for a linear-kernel SVC,
# scored by accuracy with 5-fold shuffled cross-validation on the training split.
# NOTE: relies on numpy being available under the name `np`.

grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(n_splits=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
gs.fit(X_train_vect, y_train)

# Best estimator, its mean cross-validated accuracy, and the winning parameters.
gs.best_estimator_, gs.best_score_, gs.best_params_

(SVC(C=100.0, kernel='linear', random_state=241),
 0.15322277501381978,
 {'C': 100.0})

In [263]:
# just playing with tf \ idf calculation
# The demo uses its own names (demo_vectorizer, demo_X) so that it does not
# overwrite the fitted intent `vectorizer` or the training data `X`;
# get_intent_by_model() reads the global `vectorizer` and must keep seeing the
# one the model was trained with.

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
demo_vectorizer = TfidfVectorizer()
demo_X = demo_vectorizer.fit_transform(corpus)
# Feature names in column order, derived from vocabulary_ so the cell does not
# depend on get_feature_names(), which newer scikit-learn releases removed.
sorted(demo_vectorizer.vocabulary_, key=demo_vectorizer.vocabulary_.get), demo_vectorizer.vocabulary_, demo_vectorizer.idf_

# Manual check for a term that occurs in a single document (df = 1) of the
# N = 4 documents: idf = ln((1 + N) / (1 + df)) + 1
np.log(5/2) + 1

1.916290731874155

In [156]:
# TODO: find a decent dataset for the chatbot

def get_intent_by_model(text):
    """Predict the intent label for a single message.

    The message is vectorised with the module-level ``vectorizer`` and passed
    to the module-level ``model``; the first (and only) prediction is returned.
    """
    features = vectorizer.transform([text])
    predictions = model.predict(features)
    return predictions[0]

In [11]:
def bot(question):
    """Return a randomly chosen response for the intent predicted for *question*."""
    predicted_intent = get_intent_by_model(question)
    candidates = BOT_CONFIG['intents'][predicted_intent]['responses']
    return random.choice(candidates)

In [73]:
# Console chat: read questions from stdin and print the bot's reply to each
# one until the user types the stop word.
question = ''
while True:
    question = input()
    if question == 'стоп':
        break
    print(bot(question))

стоп


In [138]:
# Enable logging: records at INFO level and above, formatted as
# "<time> - <logger name> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO
)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# Define a few command handlers. These usually take the two arguments update and
# context.
def start(update: Update, context: CallbackContext) -> None:
    """Greet the user who issued /start, mentioning them in MarkdownV2."""
    mention = update.effective_user.mention_markdown_v2()
    greeting = fr'Hi {mention}\!'
    force_reply = ForceReply(selective=True)
    update.message.reply_markdown_v2(greeting, reply_markup=force_reply)


def help_command(update: Update, context: CallbackContext) -> None:
    """Send a message when the command /help is issued.

    Replies to the incoming message with the fixed text 'Help!';
    ``context`` is not used.
    """
    update.message.reply_text('Help!')


def answer(update: Update, context: CallbackContext) -> None:
    """Answer the user message.

    The message text is passed to ``bot``. If building the reply fails, the
    error is logged with its traceback and the user receives a fixed apology
    instead of no reply at all.
    """
    question = update.message.text
    try:
        reply = bot(question)
    except Exception:
        # Catch Exception rather than everything, so SystemExit and
        # KeyboardInterrupt still propagate, and keep the traceback in the log.
        logger.exception('Failed to build a reply')
        reply = 'Извините, что-то сломалось =('

    update.message.reply_text(reply)


def main() -> None:
    """Start the bot.

    The bot token is read from the TELEGRAM_BOT_TOKEN environment variable so
    that the credential is not stored in the source.

    Raises:
        RuntimeError: if TELEGRAM_BOT_TOKEN is unset or empty.
    """
    import os  # local import: only needed here to read the token

    token = os.environ.get('TELEGRAM_BOT_TOKEN')
    if not token:
        raise RuntimeError(
            'Set the TELEGRAM_BOT_TOKEN environment variable to the bot token'
        )
    updater = Updater(token)

    # Get the dispatcher to register handlers
    dispatcher = updater.dispatcher

    dispatcher.add_handler(CommandHandler("start", start))
    dispatcher.add_handler(CommandHandler("help", help_command))

    # on non command i.e message - answer the message on Telegram
    dispatcher.add_handler(MessageHandler(Filters.text & ~Filters.command, answer))

    updater.start_polling()

    # Run the bot until you press Ctrl-C or the process receives SIGINT,
    # SIGTERM or SIGABRT. This should be used most of the time, since
    # start_polling() is non-blocking and will stop the bot gracefully.
    updater.idle()