In [39]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
import re
import os.path
import math
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from scipy import sparse
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


import nltk
from wordcloud import WordCloud
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import unicodedata

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


#plt.style.use('fivethirtyeight')
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=0.8)

from helper_functions import *


mydata_train = pd.read_excel('shuffled_leetcode_train.xlsx')
mydata_test = pd.read_excel('shuffle_leetcode_test.xlsx')

train_X, train_y = mydata_train['Question Description'], mydata_train.drop(['Question Title', 'Question Description', 'plot_lang'], axis=1)
test_X, test_y = mydata_test['Question Description'], mydata_test.drop(['Question Title', 'plot_lang'], axis=1)



category_columns = train_y.columns


test_y = test_y[pd.notnull(test_y['Question Description'])]
test_X = test_X[pd.notnull(test_X)]

train_y_cluster_labels= train_y.groupby(list(category_columns)).ngroup()
cluster_center = train_y.copy(deep=True) 
cluster_center['Labels']=train_y_cluster_labels
cluster_center = cluster_center.drop_duplicates()
cluster_center = cluster_center.reset_index().set_index(['Labels']).sort_index().drop('index', axis=1)

category_columns = train_y.columns


pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', LinearSVC(class_weight='balanced'))
            ])
# sorted(pipeline.get_params().keys()) # -- to obtain the GridSearchCV parameter names
parameters = {
                'tfidf__max_df': [0.5],
                'tfidf__ngram_range': [(1, 2)],
                'tfidf__min_df': [2],
                'clf__C': [5, 10, 20, 50, 100]
            }
overall_f1_score_v2_cv = make_scorer(overall_f1_score_v2, greater_is_better=True, class_to_genre_map = cluster_center)
grid_search_cv = GridSearchCV(pipeline, parameters, cv=2, verbose=10, scoring=overall_f1_score_v2_cv)
grid_search_cv.fit(train_X, train_y_cluster_labels)


print()
print("Best parameters set:")
print (grid_search_cv.best_estimator_.steps)
print()

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_cv.best_estimator_
predictions = multi_class_predict(best_clf, test_X, cluster_center) 

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.4s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.4s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.2s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.5s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    9.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   11.0s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   13.0s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.3s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   14.5s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   16.7s finished


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=5, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
()
Applying best classifier on test data:


In [59]:
# mydata_test

# new_test_X = mydata_test['Question Title','Question Description']
# predictions[0]
UIdata = {}
for j, row in predictions.iterrows():
# for j in range(len(predictions)):
    categories = ''
#     print(predictions[j])
    for i in range(len(row)):
#         print(row[i])
        if(str(row[i])=='1'):
            categories += category_columns[i] + ','
    UIdata[mydata_test.iloc[j]['Question Title']] = [mydata_test.iloc[j]['Question Description'],categories[:len(categories)-1] ]

# accuracy(test_y, predictions)

UIdata
import json
with open('JsonNaiveBayes.json', 'w') as outfile:  
    json.dump(UIdata, outfile)

In [3]:
mydata_train = pd.read_excel('shuffled_leetcode_train.xlsx')


# [i:i+100]
# [0:i]+[i+100:]
# mydata_test = mydata_train[0:50]
mydata_test = pd.read_excel('shuffle_leetcode_test.xlsx')

train_X, train_y = mydata_train['Question Description'], mydata_train.drop(['Question Title', 'Question Description', 'plot_lang'], axis=1)
test_X, test_y = mydata_test['Question Description'], mydata_test.drop(['Question Title', 'plot_lang'], axis=1)

category_columns = train_y.columns

In [4]:
test_X = test_X[pd.notnull(test_X)]

In [5]:
test_y = test_y[pd.notnull(test_y['Question Description'])]

In [6]:
test_y = test_y.drop(['Question Description'], axis=1)

In [8]:
train_y_cluster_labels= train_y.groupby(list(category_columns)).ngroup()
cluster_center = train_y.copy(deep=True) 
cluster_center['Labels']=train_y_cluster_labels
cluster_center = cluster_center.drop_duplicates()
cluster_center = cluster_center.reset_index().set_index(['Labels']).sort_index().drop('index', axis=1)

category_columns = train_y.columns


pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', LinearSVC(class_weight='balanced'))
            ])
# sorted(pipeline.get_params().keys()) # -- to obtain the GridSearchCV parameter names
parameters = {
                'tfidf__max_df': [0.5],
                'tfidf__ngram_range': [(1, 2)],
                'tfidf__min_df': [2],
                'clf__C': [5, 10, 20, 50, 100]
            }
overall_f1_score_v2_cv = make_scorer(overall_f1_score_v2, greater_is_better=True, class_to_genre_map = cluster_center)
grid_search_cv = GridSearchCV(pipeline, parameters, cv=2, verbose=10, scoring=overall_f1_score_v2_cv)
grid_search_cv.fit(train_X, train_y_cluster_labels)


print()
print("Best parameters set:")
print (grid_search_cv.best_estimator_.steps)
print()

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_cv.best_estimator_
predictions = multi_class_predict(best_clf, test_X, cluster_center) 
print(classification_report(test_y, predictions))

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=5, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.4s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.1s
[CV] tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.7s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=10, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.5s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.6s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.2s
[CV] tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=20, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   10.2s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.5s
[CV] tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   11.9s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=50, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.8s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   14.1s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.6s
[CV] tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   16.0s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=100, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.8s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.2s finished


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=5, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
()
Applying best classifier on test data:
              precision    recall  f1-score   support

           0       1.00      0.88      0.93        16
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         3
           3       1.00

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline,X_train,y_train,cv=3,
    scoring='f1_micro')

In [41]:
ks = [75]
f1_score = []
thresh = 0.85
for k in ks:
    kmeans = KMeans(n_clusters=k, random_state=2)
    labels = kmeans.fit_predict(train_y)
    train_y_cluster_labels = pd.Series(labels, index=train_y)
    cluster_center = pd.DataFrame(columns=train_y.columns)
    for cluster_id in range(k):
        cluster_center.loc[cluster_id] = (kmeans.cluster_centers_[cluster_id]>=thresh)*1

    for idx, col in enumerate(train_y.columns):
        max_idx = kmeans.cluster_centers_[:,idx].argmax()
        max_value = kmeans.cluster_centers_[:,idx].max()
        if max_value<thresh:
            cluster_center.loc[max_idx, col] = 1
    
    train_y_genre_labels = pd.DataFrame(columns=train_y.columns, index=train_y.index)
    for idx in range(k):
        train_y_genre_labels.loc[labels==idx,:] = cluster_center.loc[idx,:].values

pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', LinearSVC(class_weight='balanced'))
            ])
# sorted(pipeline.get_params().keys()) # -- to obtain the GridSearchCV parameter names
parameters = {
                'tfidf__max_df': [0.25, 0.5, 0.75],
                'tfidf__ngram_range': [(1, 2)],
                'tfidf__min_df': [1, 2, 5],
                'clf__C': [1, 10, 100]
            }
overall_f1_score_v2_cv = make_scorer(overall_f1_score_v2, greater_is_better=True, class_to_genre_map = cluster_center)
grid_search_cv = GridSearchCV(pipeline, parameters, cv=2, verbose=10, scoring=overall_f1_score_v2_cv)
grid_search_cv.fit(train_X, train_y_cluster_labels)

print()
print("Best parameters set:")
print (grid_search_cv.best_estimator_.steps)
print()

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_cv.best_estimator_
predictions = multi_class_predict(best_clf, test_X, cluster_center) 

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV] tfidf__max_df=0.25, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=1 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tfidf__max_df=0.25, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=1, score=0, total=   1.0s
[CV] tfidf__max_df=0.25, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV]  tfidf__max_df=0.25, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=1, score=0, total=   0.9s
[CV] tfidf__max_df=0.25, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.4s remaining:    0.0s


[CV]  tfidf__max_df=0.25, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   0.7s
[CV] tfidf__max_df=0.25, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.4s remaining:    0.0s


[CV]  tfidf__max_df=0.25, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   0.7s
[CV] tfidf__max_df=0.25, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=5 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.3s remaining:    0.0s


[CV]  tfidf__max_df=0.25, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=5, score=0, total=   0.7s
[CV] tfidf__max_df=0.25, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=5 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.3s remaining:    0.0s


[CV]  tfidf__max_df=0.25, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=5, score=0, total=   0.8s
[CV] tfidf__max_df=0.5, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=1 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    6.3s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=1, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=1 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    7.7s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=1, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    9.0s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   0.8s
[CV] tfidf__max_df=0.5, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=2 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   10.0s remaining:    0.0s


[CV]  tfidf__max_df=0.5, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=   1.0s
[CV] tfidf__max_df=0.5, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=5 
[CV]  tfidf__max_df=0.5, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=5, score=0, total=   0.8s
[CV] tfidf__max_df=0.5, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=5 
[CV]  tfidf__max_df=0.5, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=5, score=0, total=   0.8s
[CV] tfidf__max_df=0.75, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=1 
[CV]  tfidf__max_df=0.75, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=1, score=0, total=   1.1s
[CV] tfidf__max_df=0.75, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=1 
[CV]  tfidf__max_df=0.75, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=1, score=0, total=   0.9s
[CV] tfidf__max_df=0.75, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=2 
[CV]  tfidf__max_df=0.75, clf__C=1, tfidf__ngram_range=(1, 2), tfidf__min_df=2, score=0, total=

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  1.2min finished


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LinearSVC(C=1, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
()
Applying best classifier on test data:


In [42]:
predictions

Unnamed: 0,Array,Hash Table,Linked List,Math,Two Pointers,String,Binary Search,Divide and Conquer,Dynamic Programming,Backtracking,...,Segment Tree,Binary Search Tree,Recursion,Memoization,Queue,Minimax,Map,Random,Sliding Window,Searching
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
ks = [75]
f1_score = []
thresh = 0.85
for k in ks:
    kmeans = KMeans(n_clusters=k, random_state=2)
    labels = kmeans.fit_predict(train_y)
    train_y_cluster_labels = pd.Series(labels, index=train_y)
    cluster_center = pd.DataFrame(columns=train_y.columns)
    for cluster_id in range(k):
        cluster_center.loc[cluster_id] = (kmeans.cluster_centers_[cluster_id]>=thresh)*1

    for idx, col in enumerate(train_y.columns):
        max_idx = kmeans.cluster_centers_[:,idx].argmax()
        max_value = kmeans.cluster_centers_[:,idx].max()
        if max_value<thresh:
            cluster_center.loc[max_idx, col] = 1
    
    train_y_genre_labels = pd.DataFrame(columns=train_y.columns, index=train_y.index)
    for idx in range(k):
        train_y_genre_labels.loc[labels==idx,:] = cluster_center.loc[idx,:].values
print(classification_report(train_y, train_y_genre_labels))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       668
           1       1.00      0.84      0.91        98
           2       1.00      0.90      0.95        77
           3       1.00      1.00      1.00       636
           4       1.00      0.80      0.89        64
           5       1.00      0.98      0.99       412
           6       1.00      0.81      0.89        78
           7       0.33      0.07      0.11        29
           8       1.00      0.97      0.98       324
           9       1.00      0.65      0.79        46
          10       1.00      0.93      0.96        81
          11       1.00      0.61      0.76        51
          12       1.00      0.68      0.81        66
          13       1.00      0.92      0.96       120
          14       1.00      0.95      0.97       142
          15       1.00      0.98      0.99       238
          16       1.00      0.85      0.92       116
          17       1.00    

In [46]:
ks = [75]
f1_score = []
thresh = 0.85
for k in ks:
    kmeans = KMeans(n_clusters=k, random_state=2)
    labels = kmeans.fit_predict(train_y)
    train_y_cluster_labels = pd.Series(labels, index=train_y)
    cluster_center = pd.DataFrame(columns=train_y.columns)
    for cluster_id in range(k):
        cluster_center.loc[cluster_id] = (kmeans.cluster_centers_[cluster_id]>=thresh)*1

    for idx, col in enumerate(train_y.columns):
        max_idx = kmeans.cluster_centers_[:,idx].argmax()
        max_value = kmeans.cluster_centers_[:,idx].max()
        if max_value<thresh:
            cluster_center.loc[max_idx, col] = 1
    
    train_y_genre_labels = pd.DataFrame(columns=train_y.columns, index=train_y.index)
    for idx in range(k):
        train_y_genre_labels.loc[labels==idx,:] = cluster_center.loc[idx,:].values
    
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 1))),
                ('clf', LinearSVC(C=1, class_weight='balanced'))
            ])
pipeline.fit(train_X, train_y_cluster_labels)
predictions = multi_class_predict(pipeline, test_X, cluster_center) 


In [40]:
from sklearn.metrics import classification_report

def classification_report_csv(report):
    report_data = []
    lines = report.split('\n')
    for line in lines[2:-3]:
        row = {}
        row_data = line.split(' ') 
        row_data = list(filter(None, row_data))
        row['precision'] = float(row_data[1])
        row['recall'] = float(row_data[2])
        row['f1_score'] = float(row_data[3])
        row['support'] = float(row_data[4])
        report_data.append(row)
    dataframe = pd.DataFrame.from_dict(report_data)
    dataframe.to_csv('/Users/yashmehta/Desktop/classification_report.csv', index = False)

report = classification_report(test_y, predictions)


classification_report_csv(report)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


IndexError: list index out of range

In [None]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 1))),
                ('clf', LinearSVC(C=1, class_weight='balanced'))
            ])
pipeline.fit(train_X, train_y_cluster_labels)
predictions = multi_class_predict(pipeline, test_X, cluster_center) 
print(classification_report(test_y, predictions))

In [48]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 2))),
                ('clf', LinearSVC(C=10, class_weight='balanced'))
            ])
pipeline.fit(train_X, train_y_cluster_labels)
predictions = multi_class_predict(pipeline, test_X, cluster_center) 

In [8]:
import pandas as pd
train_y_cluster_labels = pd.DataFrame(columns=['Labels'], index=train_y.index)
train_y_cluster_labels['Labels'] = train_y.groupby(list(category_columns)).ngroup()
cluster_center = pd.DataFrame(train_y)
cluster_center['Labels']=train_y_cluster_labels
cluster_center = cluster_center.drop_duplicates()
cluster_center = cluster_center.reset_index().set_index(['Labels']).sort_index().drop('index', axis=1)

In [51]:
# TF-IDF + RandomForest Classifier
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 1))),
                ('clf', RandomForestClassifier(n_estimators=500, max_depth=70, max_features='sqrt', n_jobs=4))
            ])
pipeline.fit(train_X, train_y_cluster_labels)
predictions = multi_class_predict(pipeline, test_X, cluster_center) 
print(classification_report(test_y, predictions))

ValueError: Classification metrics can't handle a mix of multiclass-multioutput and multilabel-indicator targets

In [52]:
# TF-IDF + RandomForest Classifier
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 2))),
                ('clf', RandomForestClassifier(n_estimators=500, max_depth=70, max_features='sqrt', n_jobs=4))
            ])
pipeline.fit(train_X, train_y_cluster_labels)
predictions = multi_class_predict(pipeline, test_X, cluster_center) 


In [54]:
# Count Vectorizer + Linear SVC
train_y_cluster_labels= train_y.groupby(list(category_columns)).ngroup()
cluster_center = train_y.copy(deep=True) #pd.DataFrame(train_y)
cluster_center['Labels']=train_y_cluster_labels
cluster_center = cluster_center.drop_duplicates()
cluster_center = cluster_center.reset_index().set_index(['Labels']).sort_index().drop('index', axis=1)

pipeline = Pipeline([
                ('cvec', CountVectorizer()),
                ('clf', LinearSVC(class_weight='balanced'))
            ])
# sorted(pipeline.get_params().keys()) # -- to obtain the GridSearchCV parameter names
parameters = {
                'cvec__max_df': [0.25, 0.5],
                'cvec__ngram_range': [(1, 1)],
                'cvec__min_df': [1, 2],
                'clf__C': [1, 10, 50, 100]
            }
overall_f1_score_v2_cv = make_scorer(overall_f1_score_v2, greater_is_better=True, class_to_genre_map = cluster_center)
grid_search_cv = GridSearchCV(pipeline, parameters, cv=2, verbose=10, scoring=overall_f1_score_v2_cv)
grid_search_cv.fit(train_X, train_y_cluster_labels)

print()
print("Best parameters set:")
print (grid_search_cv.best_estimator_.steps)
print()

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_cv.best_estimator_
predictions = multi_class_predict(best_clf, test_X, cluster_center) 


Fitting 2 folds for each of 16 candidates, totalling 32 fits
[CV] cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.25 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.25, score=0, total=   0.6s
[CV] cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.25 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV]  cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.25, score=0, total=   0.7s
[CV] cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.25 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.7s remaining:    0.0s


[CV]  cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.25, score=0, total=   0.5s
[CV] cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.25 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.4s remaining:    0.0s


[CV]  cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.25, score=0, total=   0.7s
[CV] cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.5 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.3s remaining:    0.0s


[CV]  cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.5, score=0, total=   0.5s
[CV] cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.5 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.0s remaining:    0.0s


[CV]  cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.5, score=0, total=   0.7s
[CV] cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.5 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.0s remaining:    0.0s


[CV]  cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.5, score=0, total=   0.5s
[CV] cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.5 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    5.6s remaining:    0.0s


[CV]  cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=1, cvec__max_df=0.5, score=0, total=   0.6s
[CV] cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.25 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    6.6s remaining:    0.0s


[CV]  cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.25, score=0, total=   0.7s
[CV] cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.25 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    7.4s remaining:    0.0s


[CV]  cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.25, score=0, total=   0.9s
[CV] cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.25 
[CV]  cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.25, score=0, total=   0.7s
[CV] cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.25 
[CV]  cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.25, score=0, total=   0.8s
[CV] cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.5 
[CV]  cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.5, score=0, total=   0.7s
[CV] cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.5 
[CV]  cvec__min_df=1, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.5, score=0, total=   0.8s
[CV] cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.5 
[CV]  cvec__min_df=2, cvec__ngram_range=(1, 1), clf__C=10, cvec__max_df=0.5, score=0, total=   0.6s
[CV] cvec__min_

[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:   29.5s finished


()
Best parameters set:
[('cvec', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('clf', LinearSVC(C=1, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]
()
Applying best classifier on test data:


In [14]:
# # Count Vectorizer + Linear SVC
pipeline = Pipeline([
                ('cvec', CountVectorizer(max_df=0.75, min_df=1, ngram_range=(1, 1))),
                ('clf', LinearSVC(C=1, class_weight='balanced'))
            ])
pipeline.fit(train_X, train_y_cluster_labels)
predictions = multi_class_predict(pipeline, test_X, cluster_center) 
print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

           0       0.48      0.45      0.46        56
           1       0.00      0.00      0.00         4
           2       0.50      0.50      0.50         6
           3       0.37      0.38      0.38        50
           4       0.00      0.00      0.00         5
           5       0.44      0.42      0.43        38
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           8       0.31      0.43      0.36        23
           9       0.33      0.40      0.36         5
          10       0.50      0.33      0.40         6
          11       0.00      0.00      0.00         1
          12       0.33      0.17      0.22         6
          13       0.15      0.22      0.18         9
          14       0.20      0.25      0.22         8
          15       0.90      0.64      0.75        14
          16       0.43      0.38      0.40         8
          17       0.00    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [56]:
# TF-IDF + Naive Bayes
train_y_cluster_labels= train_y.groupby(list(category_columns)).ngroup()
cluster_center = train_y.copy(deep=True) #pd.DataFrame(train_y)
cluster_center['Labels']=train_y_cluster_labels
cluster_center = cluster_center.drop_duplicates()
cluster_center = cluster_center.reset_index().set_index(['Labels']).sort_index().drop('index', axis=1)

In [58]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', MultinomialNB(fit_prior=True, class_prior=None))
            ])
# sorted(pipeline.get_params().keys()) # -- to obtain the GridSearchCV parameter names
parameters = {
                'tfidf__max_df': [0.25, 0.5, 0.75],
                'tfidf__ngram_range': [(1, 1)],
                'tfidf__min_df': [1, 2, 5, 10],
                'clf__alpha': [0.001, 0.01, 0.1, 1]
            }

overall_f1_score_v2_cv = make_scorer(overall_f1_score_v2, greater_is_better=True, class_to_genre_map = cluster_center)
grid_search_cv = GridSearchCV(pipeline, parameters, cv=2, scoring=overall_f1_score_v2_cv)
grid_search_cv.fit(train_X, train_y_cluster_labels)

print()
print("Best parameters set:")
print (grid_search_cv.best_estimator_.steps)
print()

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_cv.best_estimator_
predictions = multi_class_predict(best_clf, test_X, cluster_center)


()
Best parameters set:
[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))]
()
Applying best classifier on test data:


In [18]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 1))),
                ('clf', MultinomialNB(alpha=0.001, fit_prior=True, class_prior=None))
            ])
pipeline.fit(train_X, train_y_cluster_labels)
predictions = multi_class_predict(pipeline, test_X, cluster_center) 
print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

           0       0.55      0.50      0.52        56
           1       0.00      0.00      0.00         4
           2       0.40      0.33      0.36         6
           3       0.48      0.60      0.53        50
           4       0.00      0.00      0.00         5
           5       0.71      0.63      0.67        38
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           8       0.43      0.39      0.41        23
           9       0.40      0.40      0.40         5
          10       1.00      0.33      0.50         6
          11       0.00      0.00      0.00         1
          12       0.50      0.33      0.40         6
          13       0.00      0.00      0.00         9
          14       1.00      0.12      0.22         8
          15       0.67      0.71      0.69        14
          16       0.67      0.25      0.36         8
          17       0.00    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [20]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 1))),
                ('clf', MultinomialNB(alpha=0.001, fit_prior=True, class_prior=None))
            ])
pipeline.fit(train_X, train_y_cluster_labels)
predictions = multi_class_predict(pipeline, test_X, cluster_center) 
print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

           0       0.55      0.50      0.52        56
           1       0.00      0.00      0.00         4
           2       0.40      0.33      0.36         6
           3       0.48      0.60      0.53        50
           4       0.00      0.00      0.00         5
           5       0.71      0.63      0.67        38
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           8       0.43      0.39      0.41        23
           9       0.40      0.40      0.40         5
          10       1.00      0.33      0.50         6
          11       0.00      0.00      0.00         1
          12       0.50      0.33      0.40         6
          13       0.00      0.00      0.00         9
          14       1.00      0.12      0.22         8
          15       0.67      0.71      0.69        14
          16       0.67      0.25      0.36         8
          17       0.00    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
