In [118]:
import pandas as pd
import numpy as np
import ast
import nltk

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

In [198]:
file_path = "../dataGeneration/problemData.csv"                                                                                                                                                                                            
data = pd.read_csv(file_path)
NON_HINT_TAGS = {'array', 'string', 'math', 'tree', 'graph', 'design', 'brainteaser', 'linked-list', 'geometry', 'random'}

In [199]:
problemDescs = []
problemTags = []

for i, d in data.iterrows():
    tags = [t for t in ast.literal_eval(d['tags']) if t not in NON_HINT_TAGS]
    
    if tags:
        problemDescs.append(d['description'])
        problemTags.append(tags)
        
print(len(problemDescs), len(problemTags))
print((problemDescs[0]), (problemTags[0]))

947 947
You are given n​​​​​​ tasks labeled from 0 to n - 1 represented by a 2D integer array tasks, where tasks[i] = [enqueueTimei, processingTimei] means that the i​​​​​​th​​​​ task will be available to process at enqueueTimei and will take processingTimei to finish processing.

You have a single-threaded CPU that can process at most one task at a time and will act in the following way:


	If the CPU is idle and there are no available tasks to process, the CPU remains idle.
	If the CPU is idle and there are available tasks, the CPU will choose the one with the shortest processing time. If multiple tasks have the same shortest processing time, it will choose the task with the smallest index.
	Once a task is started, the CPU will process the entire task without stopping.
	The CPU can finish a task then start a new one instantly.


Return the order in which the CPU will process the tasks. ['heap']


In [204]:
regexpTokenizer = RegexpTokenizer("[a-zA-Z]{2,}")
wnl = WordNetLemmatizer()

def tokenize(doc):
    return [wnl.lemmatize(t) for t in regexpTokenizer.tokenize(doc)]

vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_df=0.7, min_df=5)
X = vectorizer.fit_transform(problemDescs)

In [214]:
hintTags = set()

for tags in problemTags:
    for t in tags:
        hintTags.add(t)
        
hintTags = np.array(list(hintTags))
tagToIndex = {tag: i for i, tag in enumerate(hintTags)}

In [205]:
Y = []

for tags in problemTags:
    y = [0] * len(hintTags)
    
    for t in tags:
        y[tagToIndex[t]] = 1
        
    Y.append(y)
    
Y = np.array(Y)
Y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [207]:
print(X.shape, Y.shape)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)


(947, 649) (947, 32)
(710, 649) (237, 649) (710, 32) (237, 32)


In [114]:
# Neural network. Not used.
regr = MLPRegressor((500, 200, 500), max_iter=100).fit(X_train, Y_train)

In [168]:
NN:
100:
['dynamic-programming',
 'depth-first-search',
 'breadth-first-search',
 'segment-tree',
 'divide-and-conquer']
500:
['depth-first-search',
 'binary-search',
 'segment-tree',
 'breadth-first-search',
 'binary-indexed-tree'] 

NB:
"Count the number of islands"
Fit_prior:
    True:
        ['depth-first-search' 'dynamic-programming' 'breadth-first-search'
 'greedy' 'binary-search']
    False:
        ['depth-first-search' 'breadth-first-search' 'dynamic-programming'
         'binary-search' 'backtracking']

SyntaxError: invalid syntax (<ipython-input-168-dfbc9d0144db>, line 1)

In [219]:
# Mutinomial Naive Bayes. Used in app.
clf = MultiOutputClassifier(MultinomialNB(fit_prior=False), n_jobs=1).fit(X_train, Y_train)

In [220]:
def classify(classifier, text):
    x = vectorizer.transform([text])
    class_probs = np.array([p[0][1] for p in classifier.predict_proba(x)])
    print(sorted(-class_probs)[:5])
    return hintTags[(-class_probs).argsort()[:5]]

print(classify(clf, "Find the number in a sorted array that equal to the index the element is at, and requires log(n) complexity (find any such element is ok)"))
#print(classify(clf, "randomly choosing k samples from a list of n items, where n is either a very large or unknown number. Typically n is large enough that the list doesn’t fit into main memory. For example, a list of search queries in Google and Facebook"))

[-0.6492617909729264, -0.44858443001798043, -0.36961440800303363, -0.3558985033686547, -0.3374208078543204]
['binary-search' 'hash-table' 'sort' 'bit-manipulation' 'greedy']


In [175]:
hintTags

array(['hash-table', 'trie', 'sliding-window', 'binary-search-tree',
       'union-find', 'greedy', 'rolling-hash', 'recursion',
       'divide-and-conquer', 'minimax', 'binary-indexed-tree',
       'rejection-sampling', 'dequeue', 'queue', 'bit-manipulation',
       'line-sweep', 'suffix-array', 'backtracking', 'heap',
       'segment-tree', 'binary-search', 'sort', 'two-pointers',
       'ordered-map', 'memoization', 'dynamic-programming',
       'meet-in-the-middle', 'breadth-first-search', 'reservoir-sampling',
       'topological-sort', 'stack', 'depth-first-search'], dtype='<U20')