In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import re
import tqdm
from collections import defaultdict
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.sparse import save_npz, load_npz
import functions

In [13]:
tf_idf_matrix = load_npz("Dataset/sparse_matrix.npz")
df = pd.read_csv("Dataset/cleaned.csv",usecols = ['Id','Score_question','question','Score_answer','Body_answer'])
word_index_df = pd.read_csv('Dataset/word_to_index.csv', keep_default_na=False)
unique_words = dict(zip(word_index_df['word'], word_index_df['index']))
idf = pd.read_csv('Dataset/idf.csv', keep_default_na=False)
idf = dict(zip(idf['word'], idf['idf_score']))
df = df.dropna()
unique_df = df[['question',"Id"]].drop_duplicates()

In [15]:
stop_words = set(stopwords.words('english')) - {"not", "no", "never"}
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    # Expand contractions
    text = functions.expand_contractions(text)
    
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercase and keep only alphabetic words
    tokens = [word for word in tokens if word.isalpha()]

    # Remove stopwords
    tokens = [w for w in tokens if w not in stop_words]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

Returns the most similar question, and the highest rated answer for that quesiton.

In [16]:
def chatbot_reply(user_query):
    # Transform user query into TF-IDF vector using the same steps we did on the dataframe
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data,cols = [],[]
    for word,count in tf.items():
        if word in unique_words:
            data.append((count/length)*idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix((data, ([0]*len(cols), cols)), shape=(1, len(unique_words)))

    # Compute cosine similarity across whole df
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    
    # Find the most similar question
    idx = similarity.argmax()
    print(idx)
    Id = unique_df.iloc[idx]['Id']
    best_ans = df[df['Id'] == Id].sort_values('Score_answer',ascending=False).iloc[0]
    # Retrieve the best matching Q&A
    return best_ans["Body_answer"], best_ans["question"], similarity[idx]

In [22]:
def top_n_results(user_query,n=1,score_req = 10):
    # Transform user query into TF-IDF vector
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data,cols = [],[]
    for word,count in tf.items():
        if word in unique_words:
            data.append((count/length)*idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix((data, ([0]*len(cols), cols)), shape=(1, len(unique_words)))

    # Compute cosine similarity across whole df
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    # Find the most similar question(s)
    idx = np.argsort(-similarity)[:n]
    values = -np.sort(-similarity)[:n]
    idx_values = pd.DataFrame({'Id':unique_df.iloc[idx]['Id'], 'Similarity':values})
    Id = unique_df.iloc[idx]['Id']
    best_ans = df[df['Id'].isin(Id)]
    best_ans = best_ans[best_ans['Score_answer']>=score_req]
    return best_ans.merge(idx_values, on='Id')

In [23]:
top_n_results("How to print a backslash in Python?",n=5,score_req=10)

Unnamed: 0,Id,Score_question,question,Score_answer,Body_answer,Similarity
0,17327202,18,python replace single backslash double backsla...,29,No need to use str.replace or string.replace h...,0.86108
1,19095796,25,print backslash python write print print print...,33,You need to escape your backslash by preceding...,0.885258
2,19095796,25,print backslash python write print print print...,19,"Another clue, if you're trying to accomplish s...",0.885258
3,25047976,8,split string backslash python simple question ...,15,You have the right idea with escaping the back...,0.821077


In [24]:
user_query = input("Enter your question:" )
top_n_results(user_query,n=5,score_req=0)

Unnamed: 0,Id,Score_question,question,Score_answer,Body_answer,Similarity
0,29617176,-2,running function line loop python possible cal...,3,"You have the ordering wrong, but yes it is pos...",0.900708
1,29617176,-2,running function line loop python possible cal...,1,"Yes:\n\nfor x in range(4): print(""Hello"")\n\n\...",0.900708
2,29617176,-2,running function line loop python possible cal...,3,You could use join function also.\n\n>>> print...,0.900708
3,29617176,-2,running function line loop python possible cal...,1,The thing with those constructs is that they c...,0.900708
4,29617176,-2,running function line loop python possible cal...,1,"As a extension to Avinash's answer, you don't ...",0.900708
5,31207287,0,converting loading string file print string pr...,2,Your string appears to have been encoded using...,0.86194
6,34627292,1,write csv dataframes using python panda csv fi...,1,You need add index=False to to_csv:\n\nprint d...,0.84038
7,34627292,1,write csv dataframes using python panda csv fi...,2,Assuming you named your dataframe df. You nee...,0.84038
8,35765082,3,console logger not synchronized print stumbled...,1,These are a couple of different issues raised ...,0.92452
9,37889752,0,write output file multiple time called functio...,1,Try something like this:\n\ndef do():\n ret...,0.878828


In [20]:
def top_n_interval_filtered(user_query,n=1,score_req=10,lower_bound=0,upper_bound=1,lowest_question_score=0):
    #Transform user query into TF-IDF vector
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data, cols = [], []
    for word, count in tf.items():
        if word in unique_words:
            data.append((count / length) * idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix(
        (data, ([0] * len(cols), cols)),
        shape=(1, len(unique_words))
    )
     # Compute cosine similarity across whole df
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    # Find the most similar question(s)
    idx = np.argsort(-similarity)[:n]
    values = -np.sort(-similarity)[:n]
    idx_values = pd.DataFrame({
        'Id': unique_df.iloc[idx]['Id'],
        'Similarity': values
    })
    top_answers = df[df['Id'].isin(idx_values['Id'])]
    top_answers = top_answers[top_answers['Score_answer'] >= score_req] #Filter on score_req for answer
    merged = top_answers.merge(idx_values, on='Id')
    merged['Score_ratio'] = abs(merged["Score_question"]).div(merged["Score_answer"].dropna())
    #Filter on score ratio and question score
    filtered = merged[
        (merged["Score_ratio"] >= lower_bound) &
        (merged["Score_ratio"] <= upper_bound) &
        (merged["Score_question"] > lowest_question_score)
    ]
    return filtered.sort_values(by="Similarity", ascending=False)

In [21]:
top_n_interval_filtered("how to print escape character",n=10)

Unnamed: 0,Id,Score_question,question,Score_answer,Body_answer,Similarity,Score_ratio
3,18935754,5,escape special character string single backsla...,17,This is one way to do it (in Python 3.x):\n\ne...,0.756953,0.294118
0,3096948,4,escape html python string might contain escape...,11,If your value being escaped might contain quot...,0.701505,0.363636
1,4202538,63,python escape regex special character python f...,98,Use re.escape\n\nre.escape(string)\n>>> re.esc...,0.667122,0.642857
