In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import re
import tqdm
from collections import defaultdict
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.sparse import save_npz, load_npz
import functions

In [2]:
tf_idf_matrix = load_npz("Dataset/sparse_matrix.npz")
df = pd.read_csv("Dataset/cleaned.csv")
word_index_df = pd.read_csv('Dataset/word_to_index.csv', keep_default_na=False)
unique_words = dict(zip(word_index_df['word'], word_index_df['index']))
idf = pd.read_csv('Dataset/idf.csv', keep_default_na=False)
idf = dict(zip(idf['word'], idf['idf_score']))
df = df.dropna()
unique_df = df[['question',"Id"]].drop_duplicates()

In [3]:
stop_words = set(stopwords.words('english')) - {"not", "no", "never"}
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    # Expand contractions
    text = functions.expand_contractions(text)
    
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercase and keep only alphabetic words
    tokens = [word for word in tokens if word.isalpha()]

    # Remove stopwords
    tokens = [w for w in tokens if w not in stop_words]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

In [4]:
def chatbot_reply(user_query):
    # Transform user query into TF-IDF vector using the same steps we did on the dataframe
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data,cols = [],[]
    for word,count in tf.items():
        if word in unique_words:
            data.append((count/length)*idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix((data, ([0]*len(cols), cols)), shape=(1, len(unique_words)))

    # Compute cosine similarity
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    
    # Find the most similar question
    idx = similarity.argmax()
    print(idx)
    Id = unique_df.iloc[idx]['Id']
    best_ans = df[df['Id'] == Id].sort_values('Score_answer',ascending=False).iloc[0]
    # Retrieve the best matching Q&A
    return best_ans["Body_answer"], best_ans["question"], similarity[idx]

In [5]:
def top_n_results(user_query,n=1,score_req = 10):
    # Transform user query into TF-IDF vector
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data,cols = [],[]
    for word,count in tf.items():
        if word in unique_words:
            data.append((count/length)*idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix((data, ([0]*len(cols), cols)), shape=(1, len(unique_words)))

    # Compute cosine similarity
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    # Find the most similar question
    idx = np.argsort(-similarity)[:n]
    values = -np.sort(-similarity)[:n]
    idx_values = pd.DataFrame({'Id':unique_df.iloc[idx]['Id'], 'Similarity':values})
    Id = unique_df.iloc[idx]['Id']
    best_ans = df[df['Id'].isin(Id)]
    best_ans = best_ans[best_ans['Score_answer']>=score_req]
    return best_ans.merge(idx_values, on='Id')

In [6]:
top_n_results("How to print a backslash in Python?",n=5,score_req=10)

Unnamed: 0.1,Unnamed: 0,Id,Tag,Score_question,question,Score_answer,Body_answer,Similarity
0,407390,17327202,"['python', 'string', 'replace', 'backslash']",18,python replace single backslash double backsla...,29,No need to use str.replace or string.replace h...,0.86108
1,447354,19095796,"['python', 'python-2.7']",25,print backslash python write print print print...,33,You need to escape your backslash by preceding...,0.885258
2,447356,19095796,"['python', 'python-2.7']",25,print backslash python write print print print...,19,"Another clue, if you're trying to accomplish s...",0.885258
3,586955,25047976,['python'],8,split string backslash python simple question ...,15,You have the right idea with escaping the back...,0.821077


In [7]:
user_query = input("Enter your question:" )
top_n_results(user_query,n=5,score_req=0)

Unnamed: 0.1,Unnamed: 0,Id,Tag,Score_question,question,Score_answer,Body_answer,Similarity
0,699240,29617176,"['python', 'function', 'for-loop']",-2,running function line loop python possible cal...,3,"You have the ordering wrong, but yes it is pos...",0.900708
1,699241,29617176,"['python', 'function', 'for-loop']",-2,running function line loop python possible cal...,1,"Yes:\n\nfor x in range(4): print(""Hello"")\n\n\...",0.900708
2,699242,29617176,"['python', 'function', 'for-loop']",-2,running function line loop python possible cal...,3,You could use join function also.\n\n>>> print...,0.900708
3,699243,29617176,"['python', 'function', 'for-loop']",-2,running function line loop python possible cal...,1,The thing with those constructs is that they c...,0.900708
4,699244,29617176,"['python', 'function', 'for-loop']",-2,running function line loop python possible cal...,1,"As a extension to Avinash's answer, you don't ...",0.900708
5,741738,31207287,"['python', 'encoding', 'utf-8', 'utf-16']",0,converting loading string file print string pr...,2,Your string appears to have been encoded using...,0.86194
6,834793,34627292,"['python', 'python-2.7', 'pandas', 'dataframe']",1,write csv dataframes using python panda csv fi...,1,You need add index=False to to_csv:\n\nprint d...,0.84038
7,834794,34627292,"['python', 'python-2.7', 'pandas', 'dataframe']",1,write csv dataframes using python panda csv fi...,2,Assuming you named your dataframe df. You nee...,0.84038
8,867316,35765082,"['python', 'asynchronous', 'logging', 'console']",3,console logger not synchronized print stumbled...,1,These are a couple of different issues raised ...,0.92452
9,925288,37889752,"['python', 'function', 'file-handling']",0,write output file multiple time called functio...,1,Try something like this:\n\ndef do():\n ret...,0.878828
