In [1]:
import json
from pathlib import Path
import math, random
import nltk

#Get our data
#Label data based on rating
#Extract features
#split between train, dev, and test

#Choose classifier 
#Train classifier (on train data)

#Test classifier (on dev data)


In [2]:
#Get our data

#standard preprocessing
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag
EOJ = 'xeoj'  # end of joke tag

#get jokes:
PATH=Path('data')

files = list(PATH.iterdir())

for fname in files:
    if "eddit" in str(fname):
        reddit_dataset = str(fname)
    if "upid" in str(fname):
        stupid_dataset = str(fname)
reddit_jokes = json.load(open(reddit_dataset))
stupid_jokes = json.load(open(stupid_dataset))

#discard reddit jokes with 0 score:
rated_jokes = [joke for joke in reddit_jokes if joke['score'] > 0]

#regularize to match stupid_jokes:
title_body = [joke['title']+' '+joke['body'] for joke in rated_jokes]

all_jokes = []
for i in range(len(reddit_jokes)):
    r_joke = reddit_jokes[i]
    #|print(r_joke)
    r_joke['rating']=round(math.log(r_joke['score']+random.randrange(1,10))/math.log(10)*5/2, 2)
    if r_joke['rating']>5:
        r_joke['rating']=5
    del r_joke['score'] 
    r_joke['body'] = r_joke['title']+" "+r_joke['body']
    del r_joke['title']
for s_joke in stupid_jokes:
    del s_joke['category']

#combine joke sets:
combined_jokes = reddit_jokes + stupid_jokes

In [3]:
#Group into funny and notFunny sets:

funny_joke_list = []
not_funny_joke_list = []
for joke in combined_jokes:
    if joke["rating"] >= 2.5:
        funny_joke_list.append(joke)
    else:
        not_funny_joke_list.append(joke)
        
(len(funny_joke_list), len(not_funny_joke_list) )

(91745, 106581)

# Feature Extraction:

In [21]:
# get unigrams and bigrams:
stopwords = nltk.corpus.stopwords.words("english")

def normalize(text):
    tokenized_text = []
    tags = []
    for sent in nltk.sent_tokenize(text):
        intermediate = [word for word in nltk.word_tokenize(sent) if (word not in stopwords) and ]
        for word, pos in nltk.pos_tag(intermediate):
            tokenized_text.append(word.lower())
            tags.append(pos)
    return tokenized_text, tags

def get_ngrams(tokens):
    unigrams = nltk.FreqDist(tokens)
    bigrams = nltk.FreqDist(nltk.bigrams(tokens))
    
    feature_vector = {}
    for token, freq in unigrams.items():
        feature_vector["UNI_%s" %(token)] = float(freq)/unigrams.N()
    for (token1, token2), freq in bigrams.items():
        feature_vector["BI_(%s,%s)" %(token1,token2)] = float(freq)/bigrams.N()        
    return feature_vector
        #"%s ahhhhh! %s" %("sdflks", "sdff")
    
def get_pos(tags):
    unigrams = nltk.FreqDist(tags)
    bigrams = nltk.FreqDist(nltk.bigrams(tags))
    
    feature_vector = {}
    for token, freq in unigrams.items():
        feature_vector["UNIPOS_%s" %(token)] = float(freq)/unigrams.N()
    for (token1, token2), freq in bigrams.items():
        feature_vector["BIPOS_(%s,%s)" %(token1,token2)] = float(freq)/bigrams.N()        
    return feature_vector
    

In [14]:
funny_feature_tuples = []
for joke in funny_joke_list[:2000]:
    tokens, tags = normalize(joke["body"])
    funny_feature_tuples.append(({**get_ngrams(tokens), **get_pos(tags)},"funny"))
    
unfunny_feature_tuples = []
for joke in not_funny_joke_list[:2000]:
    tokens, tags = normalize(joke["body"])
    unfunny_feature_tuples.append(({**get_ngrams(tokens), **get_pos(tags)},"unfunny"))

In [16]:
funny_feature_tuples[0]

({'BIPOS_(,,IN)': 0.014388489208633094,
  'BIPOS_(,,JJ)': 0.007194244604316547,
  'BIPOS_(,,NN)': 0.007194244604316547,
  'BIPOS_(,,NNP)': 0.014388489208633094,
  'BIPOS_(,,NNS)': 0.007194244604316547,
  'BIPOS_(,,PRP$)': 0.007194244604316547,
  'BIPOS_(,,VBP)': 0.007194244604316547,
  'BIPOS_(,,VBZ)': 0.014388489208633094,
  'BIPOS_(.,JJ)': 0.007194244604316547,
  'BIPOS_(.,POS)': 0.007194244604316547,
  'BIPOS_(.,RB)': 0.007194244604316547,
  'BIPOS_(.,VBZ)': 0.007194244604316547,
  'BIPOS_(CC,NNS)': 0.007194244604316547,
  'BIPOS_(CC,VBZ)': 0.02877697841726619,
  'BIPOS_(DT,.)': 0.007194244604316547,
  'BIPOS_(DT,JJ)': 0.007194244604316547,
  'BIPOS_(DT,NN)': 0.02877697841726619,
  'BIPOS_(DT,NNP)': 0.007194244604316547,
  'BIPOS_(IN,DT)': 0.014388489208633094,
  'BIPOS_(IN,NNP)': 0.02158273381294964,
  'BIPOS_(IN,PRP$)': 0.02158273381294964,
  'BIPOS_(IN,PRP)': 0.014388489208633094,
  'BIPOS_(IN,RB)': 0.007194244604316547,
  'BIPOS_(JJ,DT)': 0.007194244604316547,
  'BIPOS_(JJ,IN)':

### Partitioning

In [17]:
train = funny_feature_tuples[:1600]+unfunny_feature_tuples[:1600]
dev = funny_feature_tuples[1600:]+unfunny_feature_tuples[1600:]

# Training:

In [18]:
classifier = nltk.classify.NaiveBayesClassifier.train(train)

In [19]:
accuracy = nltk.classify.accuracy(classifier, dev)
accuracy

0.50875

In [20]:
features_only = []
labels_only = []
for vector, label in dev:
    features_only.append(vector)
    labels_only.append(label)
    

predicted_labels = classifier.classify_many(features_only)

confusion_matrix = nltk.ConfusionMatrix(labels_only, predicted_labels)
print(confusion_matrix)

        |       u |
        |       n |
        |   f   f |
        |   u   u |
        |   n   n |
        |   n   n |
        |   y   y |
--------+---------+
  funny |<157>243 |
unfunny | 150<250>|
--------+---------+
(row = reference; col = test)

