In [22]:
import pandas as pd
import numpy as np
import spacy
import string
import gensim
import operator
import re
from spacy.lang.en.stop_words import STOP_WORDS
from gensim.similarities import MatrixSimilarity
from operator import itemgetter
from gensim import corpora
import warnings
warnings.filterwarnings('ignore')


In [23]:
# !pip install -U pip setuptools wheel
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

In [24]:
def spacy_tokenizer(sentence):

    punctuations = string.punctuation
    stop_words = STOP_WORDS
    #remove distracting single quotes
    sentence = re.sub('\'','',sentence)

    #remove digits and words containing digits
    sentence = re.sub('\w*\d\w*','',sentence)

    #replace extra spaces with single space
    sentence = re.sub(' +',' ',sentence)
    
    #remove non-breaking new line characters
    sentence = re.sub(r'\n',' ',sentence)
    
    #creating token object
    tokens = spacy_nlp(sentence)
    
    #lower, strip and lemmatize
    tokens = [word.lemma_.lower().strip() for word in tokens]
    
    #remove stopwords, and exclude words less than 2 characters
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations and len(word) > 2]
    
    n= len(tokens)
    ngrams=[]
    p=0        

    while(p<=(n-2)):   
        g= [tokens[p],tokens[p+1]]
        ngrams.append(" ".join(g))
        p+=1

    tokens=ngrams+tokens
    #return tokens
    return tokens

In [25]:
def gensim_bow_corpus(tokenized_dataframe):

    dictionary = corpora.Dictionary(tokenized_dataframe)

    stoplist = set('hello and if this can would should could tell ask stop come go')
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
    dictionary.filter_tokens(stop_ids)

    corpus = [dictionary.doc2bow(desc) for desc in tokenized_dataframe]

    return dictionary, corpus 

In [26]:
def create_model(corpus,dictionary):
    problem_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
    problem_lsi_model = gensim.models.LsiModel(problem_tfidf_model[corpus], id2word=dictionary, num_topics=350)
    
    # Serialize and Store the corpus locally in Matrix market format
    gensim.corpora.MmCorpus.serialize('problem_tfidf_model_mm', problem_tfidf_model[corpus])
    gensim.corpora.MmCorpus.serialize('problem_lsi_model_mm',problem_lsi_model[problem_tfidf_model[corpus]])
    
    #Load the indexed corpus and create similarlity object
    problem_tfidf_corpus = gensim.corpora.MmCorpus('problem_tfidf_model_mm')
    problem_lsi_corpus = gensim.corpora.MmCorpus('Problem_lsi_model_mm')
    
    return problem_tfidf_model, problem_lsi_model,problem_tfidf_corpus, problem_lsi_corpus

In [27]:
def search_similar_problem(search_term):

    query_bow = dictionary.doc2bow(spacy_tokenizer(search_term))
    query_tfidf = problem_tfidf_model[query_bow]
    query_lsi = problem_lsi_model[query_tfidf]

    problem_index.num_best = 4

    problem_list = problem_index[query_lsi]

    problem_list.sort(key=itemgetter(1), reverse=True)
    problem_names = []
    

    for j, problem in enumerate(problem_list):
        
        index_value=df_problem1.index[problem[0]] #done to fix the issue of problem_index index
        

        problem_names.append (
            {
                
                'Relevance': round((problem[1] * 100),2),
                'S.NO': df_problem1['S.NO'][index_value],
                'Problem statement': df_problem1['Problem statement'][index_value],
                'Tag': df_problem1['Tag'][index_value]
            }

        )
        if j == (problem_index.num_best-1):
            break

    return pd.DataFrame(problem_names, columns=['Relevance','S.NO','Problem statement','Tag'])

In [28]:
def sel_lang(df_problem):

    case = int(input("""Select the language you work in:
                          Input 1 for Python
                          Input 2 for Java
                          Input 3 for C#
                          Input 4 for Javascript\n"""))
    
    if case==1:
        df_problem1= df_problem[df_problem['Tag']=='python']
    elif case==2:
        df_problem1= df_problem[df_problem['Tag']=='java']
    elif case==3:
        df_problem1= df_problem[df_problem['Tag']=='C#']
    elif case==4:
        df_problem1= df_problem[df_problem['Tag']=='javascript']
    else:
        df_problem1= df_problem

    return df_problem1

In [29]:
df_problem=pd.read_excel('Pj1_v1.xlsx')

df_problem1=sel_lang(df_problem)    #using function 5

spacy_nlp = spacy.load('en_core_web_sm')

Problem_statement_1 = df_problem1['Problem statement'].map(lambda x: spacy_tokenizer(x)) #using function 1

dictionary, corpus= gensim_bow_corpus(Problem_statement_1)   #using function 2

problem_tfidf_model, problem_lsi_model,problem_tfidf_corpus,problem_lsi_corpus = create_model(corpus,dictionary) #using function 3

problem_index = MatrixSimilarity(problem_lsi_corpus, num_features = problem_lsi_corpus.num_terms)

search_similar_problem(input())   # using function 4


Select the language you work in:
                          Input 1 for Python
                          Input 2 for Java
                          Input 3 for C#
                          Input 4 for Javascript
2
inner class and statis nested


Unnamed: 0,Relevance,S.NO,Problem statement,Tag
0,73.62,338,Java inner class and static nested class,java
1,67.22,320,How do I test a class that has private methods...,java
2,43.02,339,How do I break out of nested loops in Java?,java
3,20.8,385,"What is the difference between canonical name,...",java


In [30]:
# search_similar_problem(input())   # using function 4

In [31]:
search_similar_problem(input()) 

two list with unique entries


Unnamed: 0,Relevance,S.NO,Problem statement,Tag
0,68.6,310,How do I efficiently iterate over each entry i...,java
1,44.61,461,How can I turn a List of Lists into a List in ...,java
2,41.84,590,Convert Set to List without creating new List,java
3,39.53,440,How to make a new List in Java,java
