## Presentation (Score-Ratio Metric and TF-IDF)

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from scipy.sparse import load_npz
import gc

In [2]:
%load_ext memory_profiler

In [3]:
contraction_map = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "isn't": "is not",
    "mightn't": "might not",
    "mustn't": "must not",
    "needn't": "need not",
    "shan't": "shall not",
    "shouldn't": "should not",
    "wasn't": "was not",
    "weren't": "were not",
    "won't": "will not",
    "wouldn't": "would not",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "let's": "let us",
    "who's": "who is",
    "what's": "what is",
    "here's": "here is",
    "there's": "there is",
    "when's": "when is",
    "where's": "where is",
    "why's": "why is",
    "how's": "how is",
    "y'all": "you all",
    "o'clock": "of the clock",
    "ma'am": "madam",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "lemme": "let me",
    "gimme": "give me",
    "kinda": "kind of",
    "ain’t": "am not",
    "y’all": "you all",
    "could’ve": "could have",
    "should’ve": "should have",
    "would’ve": "would have",
    "might’ve": "might have",
    "must’ve": "must have",
    "shan’t": "shall not",
    "let’s": "let us"
}

In [4]:
def expand_contractions(text):
    for contraction, expanded in contraction_map.items():
        text = text.replace(contraction, expanded)
    return text

In [5]:
import ssl
import certifi
ssl._create_default_https_context = lambda: ssl.create_default_context(cafile=certifi.where())

In [6]:
%%memit 
def expand_contractions(text):
    for contraction, expanded in contraction_map.items():
        text = text.replace(contraction, expanded)
    return text
stop_words = set(stopwords.words('english')) - {"not", "no", "never"}
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    text = expand_contractions(text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

peak memory: 159.91 MiB, increment: 0.19 MiB


In [7]:
gc.collect()

7

In [8]:
%memit tf_idf_matrix = load_npz("Dataset/sparse_matrix.npz").astype(np.float32)

peak memory: 415.45 MiB, increment: 254.33 MiB


In [9]:
%%memit
word_index_df = pd.read_csv('Dataset/word_to_index.csv', keep_default_na=False)
word_index_df['index'] = word_index_df['index'].astype(np.int32)
unique_words = dict(zip(word_index_df['word'], word_index_df['index']))
del word_index_df
temp_idf = pd.read_csv('Dataset/idf.csv', keep_default_na=False)
idf = dict(zip(temp_idf['word'], temp_idf['idf_score']))
del temp_idf

peak memory: 157.95 MiB, increment: 0.56 MiB


In [13]:
%%memit
df = pd.read_csv("Dataset/best_ans.csv")

peak memory: 762.72 MiB, increment: 600.44 MiB


In [14]:
# We could go only best answers if needed (would cut down a lot)

In [21]:

def chatbot_reply(user_query):
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data,cols = [],[]
    for word,count in tf.items():
        if word in unique_words:
            data.append((count/length)*idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix((data, ([0]*len(cols), cols)), shape=(1, len(unique_words)))
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    idx = similarity.argmax()
    print(idx)
    Id = df.iloc[idx]['Id']
    return df.iloc[idx]['Body_answer']

In [22]:
df.iloc[469]['Id']

np.int64(126524)

In [23]:
chatbot_reply("how to reverse a list in Python?")

38364


'alright, i\'ll give it a try. "broken pipe" on the server side usually means that the client closes the connection while the server is still sending data. from your previous question, i assume your client is a browser (using the tag). that most probably means that the browser does not support playback of mpeg transport streams. actually i haven\'t heard of any browser that supports it. maybe you should try to stream an ogg theora video (mime type "video/theora") for testing - firefox 3.1+ supports this out of the box. if that works, your server implementation is correct.'

In [24]:
%%memit
user_query = input("Enter query: ")
chatbot_reply(user_query)

199621
peak memory: 829.38 MiB, increment: 557.28 MiB
