In [24]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
#https://gist.github.com/koreyou/f3a8a0470d32aa56b32f198f49a9f2b8

class BM25(object):
    """Okapi BM25 scorer built on a scikit-learn ``TfidfVectorizer``.

    The vectorizer supplies tokenisation, the vocabulary and the IDF weights.
    ``fit`` also stores the raw term-count matrix of the corpus so that
    ``transform`` can score one query string against every fitted document.

    Parameters
    ----------
    b : float, default 0.75
        Weight of the relative document length ``len(d) / avdl`` in the
        denominator of the score (``0`` removes the length term).
    k1 : float, default 1.6
        Term-frequency scaling constant of the BM25 formula.
    """

    def __init__(self, b=0.75, k1=1.6):
        # norm=None keeps raw weights; smooth_idf=False makes sklearn use
        # idf(t) = log[n / df(t)] + 1, which transform() converts back below.
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """Fit the vocabulary/IDF on documents ``X`` and store their term counts.

        Sets ``self.transformed`` (raw count matrix of ``X``) and ``self.avdl``
        (average document length, i.e. mean row sum of that matrix).
        """
        print("fitting vectorizer")
        self.vectorizer.fit(X)
        print("done fitting vectorizer...now transforming doc")
        # Apply CountVectorizer.transform (the parent class) to get raw term
        # counts instead of TF-IDF weights.
        self.transformed = super(TfidfVectorizer, self.vectorizer).transform(X)
        print("done transforming doc")
        # Build the per-corpus cache once here so queries do not pay for it.
        _, doc_len = self._corpus_stats()
        self.avdl = doc_len.mean()

    def _corpus_stats(self):
        """Return ``(counts_csc, doc_len)`` for the current ``self.transformed``.

        Both values depend only on the fitted corpus, so they are cached and
        rebuilt only when ``self.transformed`` is replaced by another object.
        """
        cache = getattr(self, "_bm25_cache", None)
        if cache is None or cache[0] is not self.transformed:
            counts = self.transformed
            cache = (
                counts,
                counts.tocsc(),  # CSC makes column (term) slicing cheap
                np.asarray(counts.sum(axis=1)).ravel(),  # tokens per document
            )
            self._bm25_cache = cache
        return cache[1], cache[2]

    def transform(self, q, X=None):
        """Calculate BM25 between query ``q`` and the documents given to ``fit``.

        Parameters
        ----------
        q : str
            Raw query text; it is vectorised with the fitted vectorizer.
        X : object, optional
            Ignored. The documents scored are always the ones stored by
            ``fit``; the parameter is kept so existing two-argument calls work.

        Returns
        -------
        numpy.ndarray
            1-D float array with one score per fitted document, in the row
            order of ``self.transformed``. All zeros when no query term is in
            the vocabulary.
        """
        b, k1, avdl = self.b, self.k1, self.avdl
        counts_csc, doc_len = self._corpus_stats()

        print("transforming query")
        q_vec = self.vectorizer.transform([q]).tocsr()
        print("done transforming query")

        # Column ids of the vocabulary terms that occur in the query.
        term_ids = q_vec.indices
        if term_ids.size == 0:
            return np.zeros(doc_len.shape[0])

        # Term frequencies of the query terms only: (n_docs, n_query_terms).
        tf = counts_csc[:, term_ids].toarray().astype(float)
        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it needs to be converted
        # to idf(t) = log [ n / df(t) ] by subtracting 1.
        idf = np.asarray(self.vectorizer.idf_)[term_ids] - 1.
        length_norm = k1 * (1 - b + b * doc_len / avdl)

        numer = tf * idf * (k1 + 1)
        denom = tf + length_norm[:, None]
        return (numer / denom).sum(axis=1)

In [1]:
#from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import pickle as pickle
def query_text(query):
    """Normalise a raw query string into a space-separated token string.

    Steps: split on runs of word characters, drop English stop words,
    lemmatize each remaining token, lower-case it, and join with spaces.

    Relies on ``RegexpTokenizer``, ``stopwords`` and ``lemmatizer`` being
    defined in the notebook namespace (presumably from nltk - they are not
    imported in this cell).

    Parameters
    ----------
    query : str
        Free-text query.

    Returns
    -------
    str
        The processed tokens joined by single spaces ("" for no tokens).
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(query)
    stop_words = set(stopwords.words('english'))

    # Compare the lower-cased token as well: tokens are still in their
    # original case here, so "In" / "The" / "I" previously slipped past a
    # lower-case stop list.
    kept = [
        tok for tok in tokens
        if tok not in stop_words and tok.lower() not in stop_words
    ]

    # NOTE(review): lemmatization deliberately still runs before lower-casing,
    # as in the original; the stored corpus looks like it was built with the
    # same order - confirm before swapping the two steps.
    return ' '.join(lemmatizer.lemmatize(tok).lower() for tok in kept)

In [25]:
# Build the BM25 index over the preprocessed sentences.
bm25 = BM25()
print(":loading data")
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles produced by this project. The `with` block closes the file handle,
# which the previous bare open() call never did.
with open("proc_sentences.pkl", 'rb') as pickle_file:
    df = pickle.load(pickle_file)
# One preprocessed document per row, in the "Combine" column.
texts = df["Combine"]
print("fitting data")
bm25.fit(texts)

:loading data
fitting data
fitting vectorizer
done fitting vectorizer...now transforming doc
done transforming doc


In [3]:
# Keep only the columns needed to display search hits and persist that
# slimmed-down frame next to the full one.
data = pd.read_pickle("processed_data_2.pkl").filter(items=["id", "title"])
data.to_pickle("processed_data_small.pkl")

In [54]:
# Rank the rows of `data` by the BM25 scores in `order`.
# NOTE(review): `order` is not defined in any cell shown here - it must
# already exist in the kernel from an earlier run.
# `assign` returns a new frame, so the shared `data` frame is no longer
# modified in place (previously `d = data` aliased it and "Sim" was written
# straight into `data`).
# order[1:] skips the first score - presumably row 0 of the scored texts has
# no counterpart in `data`; TODO confirm.
d = data.assign(Sim=order[1:]).sort_values(by=["Sim"], ascending=False)
d[['id', 'title']]

Unnamed: 0,id,title
6827,63728809,Discord.JS TypeError: ping is not a function
14316,63697165,Discord ping command
79234,63431087,I am having an issue with Discord.js
43917,63575936,Getting API latency inside of a commandhandler
82519,63417565,Error whilst trying to use client.latency in a...
...,...,...
87345,63398028,Python3 How to print a statement that takes va...
87344,63398029,Changing the image of long clicked Sprite
87343,63398031,Change google maps street view position dynami...
87341,63398035,Firebase Relatime Database: Retrieving data fr...


In [55]:
# Dump the preprocessed corpus frame that was loaded from proc_sentences.pkl
# (pandas abbreviates the middle rows in the printed output).
print(df)

                                                  Combine
0       in c define variable term variable already def...
1       typeerror ping function im trying make discord...
2       getting nonetype recursion python this questio...
3       in c define variable term variable already def...
4       panda dataframe update based date value 2 data...
...                                                   ...
199996  qt desiger wizard add new page i trying create...
199997  how install apache solr mac os i need create i...
199998  image slider work using react router on front ...
199999  error undefined local variable method action_n...
200000  aws cli loop rds instance i run aws rds descri...

[200001 rows x 1 columns]


In [58]:
# Sample query kept in the notebook namespace; query_input() takes its own
# `query` argument and does not read this variable.
query = "discord type error ping"


def query_input(query, qt = 150, start = 1):
    """Rank every row of the global ``data`` frame against ``query``.

    Scores come from the global, already fitted ``bm25`` object. The global
    ``data`` frame is left untouched: the scores are attached to a copy.

    Parameters
    ----------
    query : str
        Query text, passed to ``bm25.transform`` as is.
    qt : int, optional
        Currently unused; kept so existing calls keep working.
    start : int, optional
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The ``id`` and ``title`` columns of ``data``, ordered from highest to
        lowest score.
    """
    print("Query is : " , query)

    order = bm25.transform(query, texts)
    # order[1:] skips the first score - presumably row 0 of `texts` has no
    # counterpart in `data`; TODO confirm.
    # `assign` builds a new frame instead of writing a "Sim" column into the
    # shared `data` frame on every call.
    ranked = data.assign(Sim=order[1:]).sort_values(by=["Sim"], ascending=False)
    return ranked[['id', 'title']]
import time

# Time five representative queries back to back; the returned rankings are
# discarded, only the total wall-clock duration is reported.
start = time.time()
for benchmark_query in (
    "discord type error ping",
    "array out of bounds",
    "null pointer exception",
    "C++ segmentation fault",
    "objects cant be called as function",
):
    query_input(benchmark_query)
end = time.time()
print(end - start)


Query is :  discord type error ping
transforming query
done transforming query
0.3158385753631592


In [33]:
# argsort is ascending, so the last two positions are the best-scoring
# documents; print the runner-up first, then the top hit, separated by "####".
sorted_order = np.argsort(order)
print(
    *(texts[pos] for pos in (sorted_order[-2], sorted_order[-1])),
    sep="\n####\n",
)

####
discord js typeerror ping function i trying make command bot ping minecraft server hypixel whenever i run bot type command i get ping mc hypixel net 25565 error response gt typeerror ping function this code client message message gt let args message content slice prefix length split switch args 0 case mc ping mc hypixel net 25565 error response gt error throw error console log response break discord j support command package just install minecraft server util npm minecraft server util put code const ping require minecraft server util use ping command ping lt name server gt port 25565 response gt sending response channel command initiated return message channel send response catching eventual error catch error gt throw error discord j typeerror
