## Presentation (Score-Ratio Metric and TF-IDF)

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import re
import tqdm
from collections import defaultdict
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from scipy.sparse import save_npz, load_npz

In [2]:
contraction_map = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "isn't": "is not",
    "mightn't": "might not",
    "mustn't": "must not",
    "needn't": "need not",
    "shan't": "shall not",
    "shouldn't": "should not",
    "wasn't": "was not",
    "weren't": "were not",
    "won't": "will not",
    "wouldn't": "would not",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "let's": "let us",
    "who's": "who is",
    "what's": "what is",
    "here's": "here is",
    "there's": "there is",
    "when's": "when is",
    "where's": "where is",
    "why's": "why is",
    "how's": "how is",
    "y'all": "you all",
    "o'clock": "of the clock",
    "ma'am": "madam",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "lemme": "let me",
    "gimme": "give me",
    "kinda": "kind of",
    "ain’t": "am not",
    "y’all": "you all",
    "could’ve": "could have",
    "should’ve": "should have",
    "would’ve": "would have",
    "might’ve": "might have",
    "must’ve": "must have",
    "shan’t": "shall not",
    "let’s": "let us"
}

In [3]:
def expand_contractions(text):
    for contraction, expanded in contraction_map.items():
        text = text.replace(contraction, expanded)
    return text

In [4]:
df = pd.read_csv('Dataset/cleaned.csv')
df = df.dropna()
df['Score_question'] = df['Score_question'].astype("string")
unique_df = df[['Score_question',"Id"]].drop_duplicates()
temp = unique_df['Score_question'].str.lower().str.split()

In [5]:
import ssl
import certifi
ssl._create_default_https_context = lambda: ssl.create_default_context(cafile=certifi.where())

In [6]:
def expand_contractions(text):
    for contraction, expanded in contraction_map.items():
        text = text.replace(contraction, expanded)
    return text
stop_words = set(stopwords.words('english')) - {"not", "no", "never"}
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    text = expand_contractions(text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

In [7]:
tf_idf_matrix = load_npz("Dataset/sparse_matrix.npz")
df = pd.read_csv("Dataset/cleaned.csv")
word_index_df = pd.read_csv('Dataset/word_to_index.csv', keep_default_na=False)
unique_words = dict(zip(word_index_df['word'], word_index_df['index']))
idf = pd.read_csv('Dataset/idf.csv', keep_default_na=False)
idf = dict(zip(idf['word'], idf['idf_score']))
df = df.dropna()
unique_df = df[['Score_question',"Id"]].drop_duplicates()

In [8]:
def chatbot_reply(user_query):
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data,cols = [],[]
    for word,count in tf.items():
        if word in unique_words:
            data.append((count/length)*idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix((data, ([0]*len(cols), cols)), shape=(1, len(unique_words)))
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    idx = similarity.argmax()
    print(idx)
    Id = unique_df.iloc[idx]['Id']
    best_ans = df[df['Id'] == Id].sort_values('Score_answer',ascending=False).iloc[0]
    return best_ans["Body_answer"], best_ans["question"], similarity[idx]

In [9]:
chatbot_reply("How to reverse a list in Python?")

38364


('i like this (more readable?) one: >> s = "yellow.green.red.orange.apple" >> \'.\'.join(reversed(s.split(\'.\'))) \'apple.orange.red.green.yellow\'',
 'python reverse token string following string reverse get following',
 np.float64(0.7529698925606766))

In [10]:
def top_n_results(user_query,n=1,score_req = 10):
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data,cols = [],[]
    for word,count in tf.items():
        if word in unique_words:
            data.append((count/length)*idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix((data, ([0]*len(cols), cols)), shape=(1, len(unique_words)))
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    idx = np.argsort(-similarity)[:n]
    values = -np.sort(-similarity)[:n]
    idx_values = pd.DataFrame({'Id':unique_df.iloc[idx]['Id'], 'Similarity':values})
    Id = unique_df.iloc[idx]['Id']
    best_ans = df[df['Id'].isin(Id)]
    best_ans = best_ans[best_ans['Score_answer']>=score_req]
    return best_ans.merge(idx_values, on='Id')

In [11]:
df["Score_ratio"] = abs(df["Score_question"]).div(df["Score_answer"].dropna())

In [12]:
df

Unnamed: 0,Id,Tag,Score_question,question,Score_answer,Body_answer,Score_ratio
0,469,"['python', 'osx', 'fonts', 'photoshop']",21,find full path font display name mac using pho...,4,open up a terminal (applications->utilities->t...,5.25
1,469,"['python', 'osx', 'fonts', 'photoshop']",21,find full path font display name mac using pho...,2,i haven't been able to find anything that does...,10.50
2,469,"['python', 'osx', 'fonts', 'photoshop']",21,find full path font display name mac using pho...,12,unfortunately the only api that isn't deprecat...,1.75
3,469,"['python', 'osx', 'fonts', 'photoshop']",21,find full path font display name mac using pho...,1,there must be a method in cocoa to get a list ...,21.00
4,502,"['python', 'windows', 'image', 'pdf']",27,get preview jpeg pdf window python application...,9,you can use imagemagick's convert utility for ...,3.00
...,...,...,...,...,...,...,...
987117,40142948,"['python', 'tuples']",-2,make function return tuple divided multiple li...,0,hereâs the quick and dirty way: def formatte...,inf
987118,40143133,"['python', 'beautifulsoup']",1,error handling beautifulsoup scraped url not r...,1,you may check the value of name_box variable -...,1.00
987119,40143166,"['python', 'python-3.x']",1,finding cubed root using delta epsilon python ...,2,"first thing, you should use if/elif instead of...",0.50
987120,40143190,"['python', 'bash', 'multiline']",1,execute multiline python code bash script need...,5,use a here-doc: result=$(python <,0.20


In [13]:
def bin_count(a,b):
    bin_counts = (df["Score_ratio"] >= a) & (df["Score_ratio"] < b)
    count = bin_counts.sum()
    return count

In [14]:
def Interval_Search(lower_bound, upper_bound, lowest_question_score):
    return df[(df["Score_ratio"] >= lower_bound) & (df["Score_ratio"] <= upper_bound) & (df["Score_question"] > lowest_question_score)].sort_values(by="Score_answer", ascending=False)

In [15]:
def plotting_searched_df(lower_bound, upper_bound, lowest_question_score):
    Searched_df = Interval_Search(lower_bound,upper_bound,lowest_question_score)
    plt.figure(figsize=(12,8))
    plt.scatter(Searched_df["Score_question"], Searched_df["Score_answer"], alpha=0.6)
    plt.xlabel("Question Score")
    plt.ylabel("Answer Score")
    plt.title("Question vs Answer Scores in Searched Data Frame")

    plt.figure(figsize=(12,8))
    plt.scatter(Searched_df["Score_ratio"], Searched_df["Score_question"], alpha=0.6)
    plt.xlabel("Ratio")
    plt.ylabel("Question Score")
    plt.title("Question vs Answer Scores in Searched Data Frame")

    return Searched_df

In [16]:
def top_n_interval_filtered(user_query,n=1,score_req=10,lower_bound=0,upper_bound=1,lowest_question_score=0):
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data, cols = [], []
    for word, count in tf.items():
        if word in unique_words:
            data.append((count / length) * idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix(
        (data, ([0] * len(cols), cols)),
        shape=(1, len(unique_words))
    )
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    idx = np.argsort(-similarity)[:n]
    values = -np.sort(-similarity)[:n]
    idx_values = pd.DataFrame({
        'Id': unique_df.iloc[idx]['Id'],
        'Similarity': values
    })
    top_answers = df[df['Id'].isin(idx_values['Id'])]
    top_answers = top_answers[top_answers['Score_answer'] >= score_req]
    merged = top_answers.merge(idx_values, on='Id')
    filtered = merged[
        (merged["Score_ratio"] >= lower_bound) &
        (merged["Score_ratio"] <= upper_bound) &
        (merged["Score_question"] > lowest_question_score)
    ]
    return filtered.sort_values(by="Similarity", ascending=False)

In [17]:
top_n_interval_filtered("How can I use newline characters in python?",10,10,0,3,0)

Unnamed: 0,Id,Tag,Score_question,question,Score_answer,Body_answer,Score_ratio,Similarity
3,14390123,"['python', 'newline']",4,removing newline csv file trying process csv f...,10,"note that, as the docs say: csvfile can be any...",0.4,0.740451
0,2657693,"['regex', 'python']",16,insert newline character every character using...,24,"same as in perl, but with a backslash instead ...",0.666667,0.700835
1,2657693,"['regex', 'python']",16,insert newline character every character using...,16,"without regexp: def insert_newlines(string, ev...",1.0,0.700835
2,2657693,"['regex', 'python']",16,insert newline character every character using...,10,"i'd go with: import textwrap s = ""0123456789""*...",1.6,0.700835
