In [32]:
import json
from pathlib import Path
import math, random
import nltk
import re
import word_category_counter

#Get our data
#Label data based on rating
#Extract features
#split between train, dev, and test

#Choose classifier 
#Train classifier (on train data)

#Test classifier (on dev data)


In [2]:
#Get our data

#standard preprocessing
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag
EOJ = 'xeoj'  # end of joke tag

#get jokes:
PATH=Path('data')

files = list(PATH.iterdir())

for fname in files:
    if "eddit" in str(fname):
        reddit_dataset = str(fname)
    if "upid" in str(fname):
        stupid_dataset = str(fname)
reddit_jokes = json.load(open(reddit_dataset))
stupid_jokes = json.load(open(stupid_dataset))

#discard reddit jokes with 0 score:
rated_jokes = [joke for joke in reddit_jokes if joke['score'] > 0]

#regularize to match stupid_jokes:
title_body = [joke['title']+' '+joke['body'] for joke in rated_jokes]

all_jokes = []
for i in range(len(reddit_jokes)):
    r_joke = reddit_jokes[i]
    #|print(r_joke)
    r_joke['rating']=round(math.log(r_joke['score']+random.randrange(1,10))/math.log(10)*5/2, 2)
    if r_joke['rating']>5:
        r_joke['rating']=5
    del r_joke['score'] 
    r_joke['body'] = r_joke['title']+" "+r_joke['body']
    del r_joke['title']
for s_joke in stupid_jokes:
    del s_joke['category']

#combine joke sets:
combined_jokes = reddit_jokes + stupid_jokes

In [3]:
#Group into funny and notFunny sets:

funny_joke_list = []
not_funny_joke_list = []
for joke in combined_jokes:
    if joke["rating"] >= 2.5:
        funny_joke_list.append(joke)
    else:
        not_funny_joke_list.append(joke)
        
(len(funny_joke_list), len(not_funny_joke_list) )

(91745, 106581)

# Feature Extraction:

In [35]:
# get unigrams and bigrams:
stopwords = nltk.corpus.stopwords.words("english")

def normalize(text):
    tokenized_text = []
    tags = []
    for sent in nltk.sent_tokenize(text):
        intermediate = [word for word in nltk.word_tokenize(sent) if (word not in stopwords) and re.search(r"\w", word)]
        for word, pos in nltk.pos_tag(intermediate):
            tokenized_text.append(word.lower())
            tags.append(pos)
    return tokenized_text, tags

def get_ngrams(tokens):
    unigrams = nltk.FreqDist(tokens)
    bigrams = nltk.FreqDist(nltk.bigrams(tokens))
    
    feature_vector = {}
    for token, freq in unigrams.items():
        feature_vector["UNI_%s" %(token)] = float(freq)/unigrams.N()
    for (token1, token2), freq in bigrams.items():
        feature_vector["BI_(%s,%s)" %(token1,token2)] = float(freq)/bigrams.N()        
    return feature_vector
        #"%s ahhhhh! %s" %("sdflks", "sdff")
    
def get_pos(tags):
    unigrams = nltk.FreqDist(tags)
    bigrams = nltk.FreqDist(nltk.bigrams(tags))
    
    feature_vector = {}
    for token, freq in unigrams.items():
        feature_vector["UNIPOS_%s" %(token)] = float(freq)/unigrams.N()
    for (token1, token2), freq in bigrams.items():
        feature_vector["BIPOS_(%s,%s)" %(token1,token2)] = float(freq)/bigrams.N()        
    return feature_vector

def get_liwc_features(tokens):
    """
    Adds all possible LIWC derived feature

    :param words:
    :return:
    """
    text = u" ".join(tokens)
    liwc_cat = ['Total Function Words',
                'Total Pronouns',
                'Personal Pronouns',
                'First Person Singular',
                'First Person Plural',
                'Second Person',
                'Third Person Singular',
                'Third Person Plural',
                ' Impersonal Pronouns',
                'Articles',
                'Common Verbs',
                'Auxiliary Verbs',
                'Past Tense',
                'Present Tense',
                'Future Tense',
                'Adverbs',
                'Prepositions',
                'Conjunctions',
                'Negations',
                'Quantifiers',
                'Number',
                'Swear Words',
                'Social Processes',
                'Family',
                'Friends',
                'Humans',
                'Affective Processes',
                'Positive Emotion',
                'Negative Emotion',
                'Anxiety',
                'Anger',
                'Sadness',
                'Cognitive Processes',
                'Insight',
                'Causation',
                'Discrepancy',
                'Tentative',
                'Certainty',
                'Inhibition',
                'Inclusive',
                'Exclusive',
                'Perceptual Processes',
                'See',
                'Hear',
                'Feel',
                'Biological Processes',
                'Body',
                'Health',
                'Sexual',
                'Ingestion',
                'Relativity',
                'Motion',
                'Space',
                'Time',
                'Work',
                'Achievement',
                'Leisure',
                'Home',
                'Money',
                'Religion',
                'Death',
                'Assent',
                'Nonfluencies',
                'Fillers',
                'Total first person',
                'Total third person',
                'Positive feelings',
                'Optimism and energy',
                'Communication',
                'Other references to people',
                'Up',
                'Down',
                'Occupation',
                'School',
                'Sports',
                'TV',
                'Music',
                'Metaphysical issues',
                'Physical states and functions',
                'Sleeping',
                'Grooming']

    feature_vectors = {}
    liwc_scores = word_category_counter.score_text(text)

    negative_score = liwc_scores["Negative Emotion"]
    positive_score = liwc_scores["Positive Emotion"]

    if positive_score > negative_score:
        feature_vectors["liwc:positive"] = 1
    else:
        feature_vectors["liwc:negative"] = 1

    for cat in liwc_cat:
        if cat in liwc_scores:
            label = cat.lower().replace(" ", "_")
            feature_vectors["liwc_%s" %label] = liwc_scores[cat]

    return feature_vectors

In [45]:
funny_feature_tuples = []
set_size= 8000
for joke in funny_joke_list[:set_size]:
    tokens, tags = normalize(joke["body"])
    funny_feature_tuples.append(({**get_ngrams(tokens), **get_pos(tags), **get_liwc_features(tokens)},"funny"))
    
unfunny_feature_tuples = []
for joke in not_funny_joke_list[:set_size]:
    tokens, tags = normalize(joke["body"])
    unfunny_feature_tuples.append(({**get_ngrams(tokens), **get_pos(tags), **get_liwc_features(tokens)},"unfunny"))

In [49]:
funny_feature_tuples[0]

({'BIPOS_(DT,NN)': 0.03278688524590164,
  'BIPOS_(DT,NNP)': 0.01639344262295082,
  'BIPOS_(JJ,NNP)': 0.04918032786885246,
  'BIPOS_(JJ,NNS)': 0.01639344262295082,
  'BIPOS_(JJR,JJ)': 0.01639344262295082,
  'BIPOS_(MD,VB)': 0.01639344262295082,
  'BIPOS_(NN,JJ)': 0.01639344262295082,
  'BIPOS_(NN,NN)': 0.03278688524590164,
  'BIPOS_(NN,NNP)': 0.04918032786885246,
  'BIPOS_(NN,NNS)': 0.01639344262295082,
  'BIPOS_(NN,RB)': 0.01639344262295082,
  'BIPOS_(NN,VBD)': 0.01639344262295082,
  'BIPOS_(NN,VBZ)': 0.03278688524590164,
  'BIPOS_(NNP,NN)': 0.03278688524590164,
  'BIPOS_(NNP,NNP)': 0.08196721311475409,
  'BIPOS_(NNP,NNS)': 0.04918032786885246,
  'BIPOS_(NNP,RB)': 0.01639344262295082,
  'BIPOS_(NNP,VBZ)': 0.08196721311475409,
  'BIPOS_(NNS,MD)': 0.01639344262295082,
  'BIPOS_(NNS,NN)': 0.03278688524590164,
  'BIPOS_(NNS,NNP)': 0.01639344262295082,
  'BIPOS_(NNS,PRP)': 0.01639344262295082,
  'BIPOS_(NNS,RB)': 0.01639344262295082,
  'BIPOS_(PRP,NNP)': 0.01639344262295082,
  'BIPOS_(PRP,V

### Partitioning

In [46]:
division_size = int(set_size*4/5)
train = funny_feature_tuples[:division_size]+unfunny_feature_tuples[:division_size]
dev = funny_feature_tuples[division_size:]+unfunny_feature_tuples[division_size:]

# Training:

In [47]:
classifier = nltk.classify.NaiveBayesClassifier.train(train)

In [48]:
accuracy = nltk.classify.accuracy(classifier, dev)
accuracy

0.554375

In [50]:
features_only = []
labels_only = []
for vector, label in dev:
    features_only.append(vector)
    labels_only.append(label)
    

predicted_labels = classifier.classify_many(features_only)

confusion_matrix = nltk.ConfusionMatrix(labels_only, predicted_labels)
print(confusion_matrix)

        |       u |
        |       n |
        |   f   f |
        |   u   u |
        |   n   n |
        |   n   n |
        |   y   y |
--------+---------+
  funny |<823>777 |
unfunny | 649<951>|
--------+---------+
(row = reference; col = test)



# Things to try:

Word embeddings

Binning