In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import re
import tqdm
from collections import defaultdict
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from scipy.sparse import save_npz, load_npz

In [2]:
tf_idf_matrix = load_npz("Dataset/sparse_matrix.npz")
df = pd.read_csv("Dataset/cleaned.csv")
word_index_df = pd.read_csv('Dataset/word_to_index.csv', keep_default_na=False)
unique_words = dict(zip(word_index_df['word'], word_index_df['index']))
idf = pd.read_csv('Dataset/idf.csv', keep_default_na=False)
idf = dict(zip(idf['word'], idf['idf_score']))
df = df.dropna()
unique_df = df[['question',"Id"]].drop_duplicates()

In [3]:
contraction_map = {
    # Negative contractions
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "isn't": "is not",
    "mightn't": "might not",
    "mustn't": "must not",
    "needn't": "need not",
    "shan't": "shall not",
    "shouldn't": "should not",
    "wasn't": "was not",
    "weren't": "were not",
    "won't": "will not",
    "wouldn't": "would not",
    
    # Pronoun contractions
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    
    # Misc contractions
    "let's": "let us",
    "who's": "who is",
    "what's": "what is",
    "here's": "here is",
    "there's": "there is",
    "when's": "when is",
    "where's": "where is",
    "why's": "why is",
    "how's": "how is",
    "y'all": "you all",
    "o'clock": "of the clock",
    
    # Informal / common text contractions
    "ma'am": "madam",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "lemme": "let me",
    "gimme": "give me",
    "kinda": "kind of",
    "ain’t": "am not",
    "y’all": "you all",
    "could’ve": "could have",
    "should’ve": "should have",
    "would’ve": "would have",
    "might’ve": "might have",
    "must’ve": "must have",
    "shan’t": "shall not",
    "let’s": "let us"
}


In [4]:
def expand_contractions(text):
    for contraction, expanded in contraction_map.items():
        text = text.replace(contraction, expanded)
    return text

In [5]:
stop_words = set(stopwords.words('english')) - {"not", "no", "never"}
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    # Expand contractions
    text = expand_contractions(text)
    
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercase and keep only alphabetic words
    tokens = [word for word in tokens if word.isalpha()]

    # Remove stopwords
    tokens = [w for w in tokens if w not in stop_words]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

In [6]:
def chatbot_reply(user_query):
    # Transform user query into TF-IDF vector
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data,cols = [],[]
    for word,count in tf.items():
        if word in unique_words:
            data.append((count/length)*idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix((data, ([0]*len(cols), cols)), shape=(1, len(unique_words)))

    # Compute cosine similarity
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    
    # Find the most similar question
    idx = similarity.argmax()
    print(idx)
    Id = unique_df.iloc[idx]['Id']
    best_ans = df[df['Id'] == Id].sort_values('Score_answer',ascending=False).iloc[0]
    # Retrieve the best matching Q&A
    return best_ans["Body_answer"], best_ans["question"], similarity[idx]

In [7]:
chatbot_reply("How to print a backslash in Python?")

199621


('you need to escape your backslash by preceding it with, yes, another backslash: print "\\\\" the \\ character is called an escape character, which interprets the character following it differently. for example, n by itself is simply a letter, but when you precede it with a backslash, it becomes \\n, which is the newline character. as you can probably guess, \\ also needs to be escaped so it doesn\'t function like an escape character. you have to... escape the escape, essentially.',
 'print backslash python write print print print python not print backslash symbol get expected result',
 np.float64(0.8852584212584412))

In [8]:
def top_n_results(user_query,n=1,score_req = 10):
    # Transform user query into TF-IDF vector
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data,cols = [],[]
    for word,count in tf.items():
        if word in unique_words:
            data.append((count/length)*idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix((data, ([0]*len(cols), cols)), shape=(1, len(unique_words)))

    # Compute cosine similarity
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    # Find the most similar question
    idx = np.argsort(-similarity)[:n]
    values = -np.sort(-similarity)[:n]
    idx_values = pd.DataFrame({'Id':unique_df.iloc[idx]['Id'], 'Similarity':values})
    Id = unique_df.iloc[idx]['Id']
    best_ans = df[df['Id'].isin(Id)]
    best_ans = best_ans[best_ans['Score_answer']>=score_req]
    return best_ans.merge(idx_values, on='Id')

In [9]:
top_n_results("How to print a backslash in Python?",n=5,score_req=10)

Unnamed: 0,Id,Tag,Score_question,question,Score_answer,Body_answer,Similarity
0,17327202,"['python', 'string', 'replace', 'backslash']",18,python replace single backslash double backsla...,29,no need to use str.replace or string.replace h...,0.86108
1,19095796,"['python', 'python-2.7']",25,print backslash python write print print print...,33,you need to escape your backslash by preceding...,0.885258
2,19095796,"['python', 'python-2.7']",25,print backslash python write print print print...,19,"another clue, if you're trying to accomplish s...",0.885258
3,25047976,['python'],8,split string backslash python simple question ...,15,you have the right idea with escaping the back...,0.821077


In [10]:
#make a scratch version for sparse matrix/cosine similarity for presentation
#but use sklearn for final implementation for speed

In [12]:
user_query = input("Enter your question:" )
top_n_results(user_query,n=5,score_req=0)

Unnamed: 0,Id,Tag,Score_question,question,Score_answer,Body_answer,Similarity
0,7562832,"['python', 'regex']",1,python inserting backslashes regular expressio...,5,it's not inserting a double backslash. that is...,0.83492
1,7562832,"['python', 'regex']",1,python inserting backslashes regular expressio...,2,i suppose this isn't an answer (i second liqui...,0.83492
2,7562832,"['python', 'regex']",1,python inserting backslashes regular expressio...,0,"prefix the string with the letter ""r"". this in...",0.83492
3,17327202,"['python', 'string', 'replace', 'backslash']",18,python replace single backslash double backsla...,8,"use escape characters: ""full\\path\\here"", ""\\...",0.86108
4,17327202,"['python', 'string', 'replace', 'backslash']",18,python replace single backslash double backsla...,29,no need to use str.replace or string.replace h...,0.86108
5,17327202,"['python', 'string', 'replace', 'backslash']",18,python replace single backslash double backsla...,0,"maybe a syntax error in your case, you may cha...",0.86108
6,17327202,"['python', 'string', 'replace', 'backslash']",18,python replace single backslash double backsla...,0,"given the source string, manipulation with os....",0.86108
7,17327202,"['python', 'string', 'replace', 'backslash']",18,python replace single backslash double backsla...,0,the backslash indicates a special escape chara...,0.86108
8,17327202,"['python', 'string', 'replace', 'backslash']",18,python replace single backslash double backsla...,0,let me make it simple and clear. lets use the ...,0.86108
9,17327202,"['python', 'string', 'replace', 'backslash']",18,python replace single backslash double backsla...,1,in python \ (backslash) is used as an escape c...,0.86108
