In [None]:
"""
This is the Code for predicting the category of joke from the following categories:
1. Animals
2. Technology
3. Doctor
4. Man
5. Politics
6. Relationship
7. Religion
8. School
9. Food
10. Others


We have pre-labelled the jokes data (150 jokes) into the above-mentioned classes. The following is the distribution of the categories:

   8 animal
   7 doctor
   8 food
   1 joke_category_reduced
  11 man
  58 others
  15 politics
  13 relationship
   1 religion
  10 school
  19 technology


Approach:
=========

Features for Jokes:
1. Obtain key words for each joke
2. Obtain GloVe vectors (similar to Word2Vec) for each word from a 300-dimensional model pre-trained on Wikipedia data, then average them over the entire joke.
3. This averaged 300-dimensional GloVe vector for the joke is used as the feature vector for classification

Classes:
10 categories mentioned above.


Training:
==========
The problem is posed as a multi-class classification problem with 10 classes. Since the data is of small size, only 10% of the jokes are used for testing while
remaining 90% jokes are used for training. The following models were used:
1. Gradient Boosting
2. Naive Bayes
3. SVM
4. Logistic Regression
5. Nearest Neighbors using Centroid
6. K-nearest Neighbors
7. Ensemble of all above classifiers

Results:
========
Following are the results of various models:  


Name: Gradient Boosting
Accuracy score:  0.4

Name: Naive Bayes
Accuracy score:  0.466666666667

Name: SVM
Accuracy score:  0.533333333333

Name: Logistic Regression
Accuracy score:  0.533333333333

Name: Nearest Neighbors using Centroid
Accuracy score:  0.533333333333

Name: K-nearest Neighbors
Accuracy score:  0.466666666667

Name: Ensemble
Accuracy score:  0.466666666667


Even though Nearest Neighbors and LR give the best performance, they are often biased towards the "others" class.
On qualitative analysis, Naive Bayes and Gradient Boosting generally do better. KNN does better when there are enough samples for a category; however, it is biased towards the "others" class, which is a popular class, and this makes its accuracy look high.


"""

In [27]:
import graphlab as gl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier



In [28]:
def extract_key_words(text):
    """Return the distinct named entities found in ``text``, in first-seen order.

    The text is tokenized, POS-tagged and chunked with NLTK's ``ne_chunk``.
    Every subtree in the chunker output is treated as (part of) a named
    entity; directly adjacent subtrees are joined with a single space into
    one entity string.

    Args:
        text: The raw text to analyse.

    Returns:
        list of str: Unique entity strings, ordered by first occurrence.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    # The trailing ``None`` sentinel is not a Tree, so it forces a final flush
    # and an entity sitting at the very end of the text is not lost.
    for node in list(chunked) + [None]:
        if isinstance(node, Tree):
            current_chunk.append(" ".join(token for token, pos in node.leaves()))
            continue

        if current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            # Always start afresh, even when the entity was a repeat; otherwise
            # the stale tokens would be glued onto the next entity.
            current_chunk = []

    return continuous_chunk
    

def clean_joke(joke):
    """Strip punctuation from a joke and reduce it to its key words.

    Every run of characters that is not a period, whitespace or a word
    character is deleted (underscores are deleted as well), a space is
    inserted after each period, and the result is replaced by the
    space-joined output of ``extract_key_words``.

    Args:
        joke: The raw joke text.

    Returns:
        str: The extracted key words joined by single spaces.
    """
    without_punctuation = re.sub(r'([^\.\s\w]|_)+', '', joke)
    spaced_sentences = without_punctuation.replace(".", ". ")
    key_words = extract_key_words(spaced_sentences)
    return " ".join(key_words)

def load_joke_classes_and_text():
    """Load the labelled jokes as cleaned key-word text plus one-hot categories.

    Reads the tab-separated file ``../data/Jokes_labelling.txt``, replaces each
    entry of its ``Jokes`` column with the output of ``clean_joke`` and one-hot
    encodes its ``joke_category_reduced`` column (column prefix ``cat``).

    Returns:
        tuple: ``(X, Y)`` where ``X`` is the Series of cleaned joke texts and
        ``Y`` is the DataFrame of one-hot encoded categories. The original
        built both values but never returned them.
    """
    data = pd.read_csv("../data/Jokes_labelling.txt", delimiter="\t")

    X = data['Jokes'].map(clean_joke)
    Y = pd.get_dummies(data['joke_category_reduced'], prefix='cat')

    return X, Y
    
def load_joke_classes_text_and_glove_vectors(
        vectors_path="../data/Jokes_id_with_vectors.txt",
        labels_path="../data/Jokes_labelling.txt"):
    """Load per-joke vectors together with the joke text and category label.

    Both files are read as tab-separated tables. The ``Unnamed: 301`` column
    of the vectors table is dropped, the ``Jokes`` and
    ``joke_category_reduced`` columns of the labels table are attached, and
    the columns are reordered so that the result reads: first vectors column,
    ``jokes``, ``joke_category_reduced``, then the remaining vectors columns.

    Args:
        vectors_path: Path of the tab-separated vectors file. Defaults to the
            location the notebook has always used.
        labels_path: Path of the tab-separated labels file. Defaults to the
            location the notebook has always used.

    Returns:
        tuple: ``(X, Y)`` where ``X`` is the combined DataFrame described
        above and ``Y`` is the ``joke_category_reduced`` Series of the labels
        table.

    Raises:
        KeyError: If ``Unnamed: 301`` is missing from the vectors table or a
            required column is missing from the labels table.
    """
    vectors = pd.read_csv(vectors_path, delimiter="\t")
    # Presumably an empty column produced by a trailing tab on every row of
    # the vectors file -- TODO confirm against the data.
    vectors = vectors.drop("Unnamed: 301", axis=1)
    labels = pd.read_csv(labels_path, delimiter="\t")

    # NOTE(review): the two tables are combined by row index, not by joke id;
    # this assumes both files list the jokes in the same order -- confirm.
    X = vectors
    X['jokes'] = labels['Jokes']
    X['joke_category_reduced'] = labels['joke_category_reduced']

    # Move the two descriptive columns right behind the first column so that
    # everything from position 3 onwards is a vector component.
    cols = X.columns.tolist()
    for position, column in enumerate(('jokes', 'joke_category_reduced'), 1):
        cols.insert(position, cols.pop(cols.index(column)))
    X = X.reindex(columns=cols)

    Y = labels['joke_category_reduced']

    return X, Y

def evaluting_and_predicting_joke_category_by_words(X,Y):
    """Train several classifiers on the joke features and print a report.

    The rows are split 90/10 into train and test sets with a fixed random
    seed. Every model is fitted on the training rows and its accuracy on the
    held-out rows is printed. Afterwards each held-out joke is printed with
    its true category and the category predicted by every model.

    Held-out jokes are tracked by their row position in the test split rather
    than by their text, so two jokes with identical text are reported
    separately and the report follows the order of the test split.

    Args:
        X: pandas DataFrame whose first three columns are read by position as
            joke id, joke text and true category; all remaining columns are
            passed to the models as features.
        Y: pandas Series of category labels, one per row of ``X``.

    Returns:
        None. All results are written to stdout.
    """
    le = preprocessing.LabelEncoder()
    Y_c = le.fit_transform(Y.values)

    X_train, X_test, y_train, y_test = train_test_split(
        X.values, Y_c, test_size=0.10, random_state=99)

    # Remember text and true label of every held-out row for the final report.
    test_jokes = X_test[:, 1]
    test_jokes_categ = X_test[:, 2]

    # Everything from column 3 onwards is a feature column.
    X_train = X_train[:, 3:]
    X_test = X_test[:, 3:]

    model_xgb = xgb.XGBClassifier()
    model_nb = GaussianNB()
    model_svm = svm.SVC()
    model_lr = LogisticRegression()
    model_knn_centroid = NearestCentroid()
    model_knn = KNeighborsClassifier()

    eclf = VotingClassifier(estimators=[
        ('lr', model_lr), ('knn_centroid', model_knn_centroid), ('gnb', model_nb), ('svc', model_svm), ('knn', model_knn), ('xgb', model_xgb)],
        voting='hard', weights=[1,1,1,1,1,1])

    # Display name and estimator are kept together so they cannot drift apart.
    named_models = [
        ("Gradient Boosting", model_xgb),
        ("Naive Bayes", model_nb),
        ("SVM", model_svm),
        ("Logistic Regression", model_lr),
        ("Nearest Neighbors using Centroid", model_knn_centroid),
        ("K-nearest Neighbors", model_knn),
        ("Ensemble", eclf),
    ]

    # predictions_per_joke[i] collects, in model order, the category that each
    # model predicted for the i-th held-out row.
    predictions_per_joke = [[] for _ in test_jokes]

    # Single-argument print(...) calls emit the same text under Python 2 and
    # also parse under Python 3.
    print("Performance of models")
    print("======================")
    for name, model in named_models:
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        print("Name: %s" % (name,))
        print("Accuracy score:  %s" % (accuracy_score(y_test, preds),))
        print("")

        pred_categories = le.inverse_transform(preds)
        for joke_predictions, categ in zip(predictions_per_joke, pred_categories):
            joke_predictions.append(categ)

    print("Predicting the category predicted by each model for each joke in test data: ")
    print("============================================================================\n")
    for joke, true_categ, joke_predictions in zip(test_jokes, test_jokes_categ, predictions_per_joke):
        print("Joke: \n")
        print("%s \n" % (joke,))
        print("True category --  %s" % (true_categ,))
        print("Predicted categories: ")
        for (name, _), pred in zip(named_models, joke_predictions):
            print("%s -- %s" % (name, pred))
        print("\n\n")
        
                    
# Entry point: load the joke vectors and labels, then fit every model and
# print the accuracy and per-joke prediction report.
if __name__ == "__main__":
    X, Y = load_joke_classes_text_and_glove_vectors()
    evaluting_and_predicting_joke_category_by_words(X,Y)




Performance of models
Name: Gradient Boosting
Accuracy score:  0.4

Name: Naive Bayes
Accuracy score:  0.466666666667

Name: SVM
Accuracy score:  0.533333333333

Name: Logistic Regression
Accuracy score:  0.533333333333

Name: Nearest Neighbors using Centroid
Accuracy score:  0.533333333333

Name: K-nearest Neighbors
Accuracy score:  0.466666666667

Name: Ensemble
Accuracy score:  0.466666666667

Predicting the category predicted by each model for each joke in test data: 

Joke: 

Two kindergarten girls were talking outside: one said, "You won't believe what I saw on the patio yesterday--a condom!" The second girl asked, "What's a patio?" 

True category --  others
Predicted categories: 
Gradient Boosting -- man
Naive Bayes -- others
SVM -- others
Logistic Regression -- others
Nearest Neighbors using Centroid -- school
K-nearest Neighbors -- relationship
Ensemble -- others



Joke: 

  An American tourist goes into a restaurant in Spain and orders the specialty of the house. When his d