In [79]:
import graphlab as gl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier



In [91]:
def extract_key_words(text):
    """Return the distinct named-entity strings found in ``text``.

    The text is tokenized, POS-tagged and passed to ``ne_chunk``. Every
    ``Tree`` node in the chunked result is treated as (part of) an entity;
    consecutive ``Tree`` nodes are joined with a single space into one
    entity string. Entities are returned in first-seen order, without
    duplicates.

    Fixes relative to the previous implementation:
      * the pending buffer is cleared after every run of entity nodes, not
        only when the entity was new -- before, a repeated entity stayed in
        the buffer and was prepended to the next entity found;
      * an entity that is the last thing in the text is now flushed instead
        of being silently dropped.

    :param text: the text to scan.
    :return: list of unique entity strings (possibly empty).
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    entities = []
    pending = []  # leaf text of the consecutive Tree nodes seen so far

    for node in chunked:
        if type(node) == Tree:
            pending.append(" ".join([token for token, pos in node.leaves()]))
            continue

        # A non-entity token ends the current run of entity nodes.
        if pending:
            named_entity = " ".join(pending)
            if named_entity not in entities:
                entities.append(named_entity)
            # Always reset, even for an already-seen entity.
            pending = []

    # Flush an entity run that reaches the end of the text.
    if pending:
        named_entity = " ".join(pending)
        if named_entity not in entities:
            entities.append(named_entity)

    return entities
    

def clean_joke(joke):
    """Strip markup and punctuation from ``joke`` and reduce it to its named entities.

    Steps:
      1. HTML line/paragraph tags and raw line breaks become a single space.
      2. The ``&quot;`` and ``&#039;`` entities are removed.
      3. Every character that is not a word character, whitespace or a dot
         is removed (underscores are removed too), and each dot is followed
         by a space.
      4. The result is passed to ``extract_key_words`` and the entities are
         joined with spaces.

    The markup handling has to happen BEFORE step 3: the punctuation filter
    deletes ``<``, ``>``, ``/``, ``&``, ``;`` and ``#``, so running it first
    (as the previous version did) meant the tag/entity replacements could
    never match and left residue such as ``br``, ``p``, ``quot`` and ``039``
    in the text. Line breaks and tags are replaced by a space rather than an
    empty string so the words on either side are not fused together.

    :param joke: raw joke text.
    :return: space-separated string of the named entities found in the joke.
    """
    # Structural separators -> space, so neighbouring words stay separate.
    for separator in ('<br />', '<p>', '\r', '\n'):
        joke = joke.replace(separator, ' ')

    # Quote/apostrophe entities are dropped outright, matching how the
    # punctuation filter below treats literal quote characters.
    for entity in ('&quot;', '&#039;'):
        joke = joke.replace(entity, '')

    joke = re.sub(r'([^\.\s\w]|_)+', '', joke).replace(".", ". ")
    return " ".join(extract_key_words(joke))

def load_joke_classes_and_text():
    """Load the labelled jokes and return cleaned text with one-hot categories.

    Reads the tab-delimited file ``../data/Jokes_labelling.txt``, runs
    ``clean_joke`` over the ``Jokes`` column and one-hot encodes the
    ``joke_category_reduced`` column (dummy columns are prefixed ``cat``).

    The previous version computed ``X`` and ``Y`` but never returned them,
    so callers always received ``None``. It also dropped and re-concatenated
    columns whose result was never used; that dead work is removed.

    :return: tuple ``(X, Y)`` where ``X`` is the Series of cleaned joke
        strings and ``Y`` is the DataFrame of one-hot category columns.
    """
    data = pd.read_csv("../data/Jokes_labelling.txt", delimiter="\t")

    X = data['Jokes'].map(clean_joke)
    Y = pd.get_dummies(data['joke_category_reduced'], prefix='cat')

    return X, Y
    
def load_joke_classes_text_and_glove_vectors():
    """Load per-joke vectors, attach the raw joke text, and return them with labels.

    Reads two tab-delimited files:
      * ``../data/Jokes_id_with_vectors.txt`` -- the vector table; its
        ``Unnamed: 301`` column is dropped (presumably an empty column
        produced by a trailing tab -- TODO confirm against the file).
      * ``../data/Jokes_labelling.txt`` -- supplies the ``Jokes`` text and
        the ``joke_category_reduced`` label.

    The joke text is added as a ``jokes`` column and moved to position 1,
    so the returned frame is laid out as: first vector-file column, ``jokes``,
    then the remaining vector-file columns.

    NOTE(review): the text is attached by row index, so both files are
    assumed to list the jokes in the same order -- verify.

    The previous version also built a one-hot encoding of the labels and
    assigned it to ``Y`` before overwriting ``Y`` with the raw label column;
    that unused computation is removed.

    :return: tuple ``(X, Y)`` -- ``X`` is the DataFrame described above,
        ``Y`` is the ``joke_category_reduced`` Series.
    """
    id_vectors = pd.read_csv("../data/Jokes_id_with_vectors.txt", delimiter="\t")
    id_vectors = id_vectors.drop("Unnamed: 301", axis=1)
    data = pd.read_csv("../data/Jokes_labelling.txt", delimiter="\t")

    X = id_vectors
    X['jokes'] = data['Jokes']

    # Move the text column to index 1, keeping every other column in order.
    cols = [col for col in X.columns if col != 'jokes']
    cols.insert(1, 'jokes')
    X = X.reindex(columns=cols)

    Y = data['joke_category_reduced']

    return X, Y

def evaluting_and_predicting_joke_category_by_words(X,Y):
    Y = Y.values
    X = X.values
    
    le = preprocessing.LabelEncoder()
    Y_c = le.fit_transform(Y)
    
    
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y_c, test_size=0.10, random_state=11)
    
    train_ids = X_train[:, 0]
    train_jokes = X_train[:, 1]
    test_ids = X_test[:, 0]
    test_jokes = X_test[:, 1]
    
    X_train = X_train[:, 2:]
    X_test = X_test[:, 2:]
    
    model = xgb.XGBClassifier()
    model = GaussianNB()
    #model = svm.SVC()
    #model = LogisticRegression()
    #model = NearestCentroid()
    #model = KNeighborsClassifier()
    
    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    pred_categories = le.inverse_transform(preds)
    #print pred_categories
    for id, categ, joke in zip(test_ids, pred_categories, test_jokes):
        print categ, "\t", joke
        print
    
    print accuracy_score(y_test, preds)
    

# Driver: load the vector table with joke text and labels, then train the
# classifier and print its test-set predictions followed by the accuracy.
X, Y = load_joke_classes_text_and_glove_vectors()
evaluting_and_predicting_joke_category_by_words(X,Y)




others 	   The new employee stood before the paper shredder looking confused.     "Need some help? " a secretary asked.     "Yes, " he replied.  "How does this thing work? "     "Simple, " she said, taking the fat report from his hand and feeding it into the shredder.     "Thanks, but where do the copies come out? "  

others 	  America:    8:00 - Welcome to work!  12:00 - Lunch break  17:00 - The work day is over    Japan:    8:00 - Are you already at work?  12:00 - Continue your work  17:00 - The work day is over  20:00 - Please finish your work    Romania:    8:00 - Has anyone come to work?  12:00 - Did someone start working?  16:00 - Is anyone at work?    

others 	  An artist asked the gallery owner if there had been any interest in his paintings currently on display.  "I've got good news and bad news, " the owner replied.  "The good news is that a gentleman inquired about your work and wondered if it would appreciate in value after your death. When I told him it would, he bought 