In [6]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
import re
import os.path
import math
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from scipy import sparse
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


import nltk
from wordcloud import WordCloud
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import unicodedata

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


#plt.style.use('fivethirtyeight')
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=0.8)

from helper_functions import *

In [7]:
df = pd.read_csv("leetcode_questions.csv", encoding = "ISO-8859-1")

In [8]:
df = df.drop(['Question Title', 'Question Description', 'Examples','Similar Questions Titles','Similar Questions Links'], axis=1)

In [9]:
df = df.drop(['Unnamed: 40'], axis=1)
df = df.drop(['Unnamed: 41'], axis=1)

In [10]:
train, test = train_test_split(df, random_state=42, test_size=0.10, shuffle=True)

In [13]:
train = train.rename(columns={'Question Title Plus Description': 'Question_Title_Plus_Description'})
test = test.rename(columns={'Question Title Plus Description': 'Question_Title_Plus_Description'})

In [18]:
train_X, train_y = train['Question_Title_Plus_Description'], train.drop(['Question_Title_Plus_Description'], axis=1)
test_X, test_y = test['Question_Title_Plus_Description'], test.drop(['Question_Title_Plus_Description'], axis=1)

In [44]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [162]:
start_pointer = 0
end_pointer = 300
category_columns = train_y.columns
mean_accuracy = []
mean_precision = []
mean_f1 = []
mean_recall = []
report_df_final = pd.DataFrame([])
f1Score = {}
accuracyScore = {}
precisionScore={}


for i in range(10):
    train = pd.concat([df.iloc[0:start_pointer], df.iloc[end_pointer:3006]])
    test = df.iloc[start_pointer:end_pointer]
    start_pointer = end_pointer
    end_pointer = end_pointer + 300
    train_X, train_y = train['Question Title Plus Description'], train.drop(['Question Title Plus Description'], axis=1)
    test_X, test_y = test['Question Title Plus Description'], test.drop(['Question Title Plus Description'], axis=1)
    train_y_cluster_labels= train_y.groupby(list(category_columns)).ngroup()
    cluster_center = train_y.copy(deep=True) 
    cluster_center['Labels']=train_y_cluster_labels
    cluster_center = cluster_center.drop_duplicates()
    cluster_center = cluster_center.reset_index().set_index(['Labels']).sort_index().drop('index', axis=1)

    category_columns = train_y.columns


    pipeline = Pipeline([
                    ('tfidf', TfidfVectorizer()),
                    ('clf', LinearSVC(class_weight='balanced'))
                ])
    # sorted(pipeline.get_params().keys()) # -- to obtain the GridSearchCV parameter names
    parameters = {
                    'tfidf__max_df': [0.5],
                    'tfidf__ngram_range': [(1, 2)],
                    'tfidf__min_df': [2],
                    'clf__C': [5, 10, 20, 50, 100]
                }
    overall_f1_score_v2_cv = make_scorer(overall_f1_score_v2, greater_is_better=True, class_to_genre_map = cluster_center)
    grid_search_cv = GridSearchCV(pipeline, parameters, cv=2, verbose=10, scoring=overall_f1_score_v2_cv)
    grid_search_cv.fit(train_X, train_y_cluster_labels)
    print("Best parameters set:")
    print (grid_search_cv.best_estimator_.steps)
    # measuring performance on test set
    print ("Applying best classifier on test data:")
    best_clf = grid_search_cv.best_estimator_
    predictions = multi_class_predict(best_clf, test_X, cluster_center) 
    report = classification_report(test_y, predictions, output_dict = True)
    for key, value in report.items():
    
        if(key in ['weighted avg']):
            if(key in weightedavgScore):
                weightedavgScore['f1-score'] += float(value['f1-score'])
                weightedavgScore['precision'] += float(value['precision'])
                weightedavgScore['recall'] += float(value['recall'])
                continue
            else:
                weightedavgScore['f1-score'] = float(value['f1-score'])
                weightedavgScore['precision'] = float(value['precision'])
                weightedavgScore['recall'] = float(value['recall'])
                continue
        if(key not in ['macro avg', 'micro avg', 'macro avg', 'samples avg']):
            print(f1Score.keys())
            if(category_columns[int(key)] in f1Score.keys()):
                print("*****************")
                f1Score[category_columns[int(key)]] += float(value['f1-score'])
                accuracyScore[category_columns[int(key)]] += float(value['precision'])
                precisionScore[category_columns[int(key)]] += float(value['recall'])
            else:
                print("--------")
                f1Score[category_columns[int(key)]] = float(value['f1-score'])
                accuracyScore[category_columns[int(key)]] = float(value['precision'])
                precisionScore[category_columns[int(key)]] = float(value['recall'])

for key in f1Score:
    f1Score[key] = f1Score[key]/10
    accuracyScore[key] = accuracyScore[key]/10
    precisionScore[key] = precisionScore[key]/10
for key in weightedavgScore:
    weightedavgScore[key] = weightedavgScore[key] / 10

print(f1Score)
print(accuracyScore)
print(precisionScore)
print(weightedavgScore)


Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.5s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.3s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.5s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.8s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.8s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   10.0s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   11.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.8s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   13.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   15.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   2.0s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   17.5s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=5, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[]
--------
[u'Segment Tree']
--------
[u'Segment Tree', u'Binary Search Tree']
--------
[u'Segment Tree', u'Binary Search Tree', u'Recursion']
--------
[u'Segment Tree', u'Memoization', u'Binary Search Tree', u'Recursion']
--------
[u'Segment Tree', u'Memoization', u'Design', u'Binary Search Tree', u'Recursion']
--------
[u'Topological Sort', u'Recursion', u'Memoization', u'Binary Search Tree', u'Design', u'Segment Tree']
--------
[u'Topological Sort', u'Trie', u'Recursion', u'Memoization', u'Binary Search Tree', u'Design', u'Segment Tree']
--------
[u'Topological Sort', u'Trie', u'Recursion', u'Memoization', u'Binary Search Tree', u'Design', u'Segment Tree', u'Binary Indexed Tree']
--------
[u'Topological Sort', u'Trie', u'Recursion', u'Memoization', u'Binary Search Tree', u'Queue', u'Design', u'Segment Tree', u'Binary Indexed Tree']
--------
[u'Topological Sort', u'Trie', u'Recursion', u'Minimax', u'Memoization', u'Binary Search Tree', u'Queue', u'Design', u'Segment Tree', u'Binary 

[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.0s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.7s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.2s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.7s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   10.7s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.3s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   12.3s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   14.2s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.3s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   15.8s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.7s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.0s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=5, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
Applying best classifier on test data:
[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 
[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.5s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.3s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.7s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.7s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.2s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   10.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   11.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.8s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   13.8s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.3s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   15.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   17.5s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=5, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array', u'Depth-first Search', u'Backtracking']
*****************
[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array'

[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.5s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.3s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.5s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.2s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    9.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   10.9s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.7s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   13.0s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   14.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.9s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   16.7s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=5, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array', u'Depth-first Search', u'Backtracking']
*****************
[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array'

[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.4s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.7s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.9s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   10.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   11.9s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.9s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   14.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   15.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   2.1s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.0s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=5, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array', u'Depth-first Search', u'Backtracking']
*****************
[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array'

[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.8s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.0s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.8s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.2s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.9s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   10.9s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   12.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.9s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   14.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   16.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   2.0s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.5s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=5, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array', u'Depth-first Search', u'Backtracking']
*****************
[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array'

[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.5s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.2s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.5s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.9s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.7s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   10.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   11.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.9s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   14.0s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   15.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.9s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   17.9s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=5, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array', u'Depth-first Search', u'Backtracking']
*****************
[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array'

[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.5s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.2s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.7s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.7s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   10.2s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   11.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.7s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   13.7s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   15.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.8s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   17.3s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=5, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array', u'Depth-first Search', u'Backtracking']
*****************
[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array'

[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.8s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.8s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.8s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   10.3s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   11.8s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.9s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   14.0s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   15.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   2.0s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   17.9s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=5, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array', u'Depth-first Search', u'Backtracking']
*****************
[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'Memoization', u'Binary Search Tree', u'Queue', u'Linked List', u'Dynamic Programming', u'Topological Sort', u'Breadth-first Search', u'Trie', u'Random', u'Searching', u'Design', u'Bit Manipulation', u'Segment Tree', u'Array'

[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.7s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.7s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.8s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   2.1s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   10.7s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.4s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   12.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   2.3s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   15.3s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.3s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   16.8s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.8s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   19.0s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=5, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
Applying best classifier on test data:
[u'Two Pointers', u'Recursion', u'Minimax', u'Sliding Window', u'Math', u'Sort', u'Map', u'String', u'Greedy', u'Union Find', u'Heap', u'Graph', u'Binary Search', u'Stack', u'Binary Indexed Tree', u'Hash Table', u'Divide and Conquer', u'Tree', u'

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [178]:
import json
data = {'f1Score' : f1Score, 'accuracyScore' : accuracyScore, 'precisionScore' : precisionScore, 'weightedavgScore' : weightedavgScore}
with open('NaiveBayesReport.json', 'w') as outfile:  
    json.dump(data, outfile)

In [177]:
start_pointer = 0
end_pointer = 300
category_columns = train_y.columns
mean_accuracy = []
mean_precision = []
mean_f1 = []
mean_recall = []
report_df_final = pd.DataFrame([])
f1Score = {}
accuracyScore = {}
precisionScore={}
ks = [75]
f1_score = []
thresh = 0.85
# TF-IDF + Naive Bayes
train_y_cluster_labels= train_y.groupby(list(category_columns)).ngroup()
cluster_center = train_y.copy(deep=True) #pd.DataFrame(train_y)
cluster_center['Labels']=train_y_cluster_labels
cluster_center = cluster_center.drop_duplicates()
cluster_center = cluster_center.reset_index().set_index(['Labels']).sort_index().drop('index', axis=1)
for i in range(10):
    train = pd.concat([df.iloc[0:start_pointer], df.iloc[end_pointer:3006]])
    test = df.iloc[start_pointer:end_pointer]
    start_pointer = end_pointer
    end_pointer = end_pointer + 300
    train_X, train_y = train['Question Title Plus Description'], train.drop(['Question Title Plus Description'], axis=1)
    test_X, test_y = test['Question Title Plus Description'], test.drop(['Question Title Plus Description'], axis=1)
    train_y_cluster_labels= train_y.groupby(list(category_columns)).ngroup()
    cluster_center = train_y.copy(deep=True) 
    cluster_center['Labels']=train_y_cluster_labels
    cluster_center = cluster_center.drop_duplicates()
    cluster_center = cluster_center.reset_index().set_index(['Labels']).sort_index().drop('index', axis=1)

    category_columns = train_y.columns


    pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', MultinomialNB(fit_prior=True, class_prior=None))
            ])
    # sorted(pipeline.get_params().keys()) # -- to obtain the GridSearchCV parameter names
    parameters = {
                    'tfidf__max_df': [0.25, 0.5, 0.75],
                    'tfidf__ngram_range': [(1, 1)],
                    'tfidf__min_df': [1, 2, 5, 10],
                    'clf__alpha': [0.001, 0.01, 0.1, 1]
                }

    overall_f1_score_v2_cv = make_scorer(overall_f1_score_v2, greater_is_better=True, class_to_genre_map = cluster_center)
    grid_search_cv = GridSearchCV(pipeline, parameters, cv=2, scoring=overall_f1_score_v2_cv)
    grid_search_cv.fit(train_X, train_y_cluster_labels)

    print()
    print("Best parameters set:")
    print (grid_search_cv.best_estimator_.steps)
    print()

    # measuring performance on test set
    print ("Applying best classifier on test data:")
    best_clf = grid_search_cv.best_estimator_
    predictions = multi_class_predict(best_clf, test_X, cluster_center)  
    report = classification_report(test_y, predictions, output_dict = True)
    for key, value in report.items():
    
        if(key in ['weighted avg']):
            if(key in weightedavgScore):
                weightedavgScore['f1-score'] += float(value['f1-score'])
                weightedavgScore['precision'] += float(value['precision'])
                weightedavgScore['recall'] += float(value['recall'])
                continue
            else:
                weightedavgScore['f1-score'] = float(value['f1-score'])
                weightedavgScore['precision'] = float(value['precision'])
                weightedavgScore['recall'] = float(value['recall'])
                continue
        if(key not in ['macro avg', 'micro avg', 'macro avg', 'samples avg']):
            if(category_columns[int(key)] in f1Score.keys()):
                
                f1Score[category_columns[int(key)]] += float(value['f1-score'])
                accuracyScore[category_columns[int(key)]] += float(value['precision'])
                precisionScore[category_columns[int(key)]] += float(value['recall'])
            else:
                
                f1Score[category_columns[int(key)]] = float(value['f1-score'])
                accuracyScore[category_columns[int(key)]] = float(value['precision'])
                precisionScore[category_columns[int(key)]] = float(value['recall'])

for key in f1Score:
    f1Score[key] = f1Score[key]/10
    accuracyScore[key] = accuracyScore[key]/10
    precisionScore[key] = precisionScore[key]/10
for key in weightedavgScore:
    weightedavgScore[key] = weightedavgScore[key] / 10

print(f1Score)
print(accuracyScore)
print(precisionScore)
print(weightedavgScore)


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))]
()
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))]
()
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))]
()
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))]
()
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))]
()
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))]
()
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))]
()
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))]
()
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))]
()
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))]
()
Applying best classifier on test data:
{u'Two Pointers': 0.19717171717171716, u'Recursion': 0.05833333333333333, u'Minimax': 0.19, u'Sliding Window': 0.12333333333333334, u'Math': 0.6278302638852808, u'Sort': 0.21015499348619557, u'Map': 0.163015873015873, u'String': 0.6510082065036044, u'Greedy': 0.07651515151515151, u'Union Find': 0.29333333333333333, u'Heap': 0.06222222222222222, u'Graph': 0.607331573655103, u'Binary Search': 0.1

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
