In [1]:
import nltk
# Fetch the NLTK resources used by this notebook. nltk.download returns True,
# and the value of the last call is what the cell displays.
nltk.download('stopwords')  # read via nltk.corpus.stopwords.words('english')
nltk.download('perluniprops')  # presumably needed by MosesTokenizer - TODO confirm
nltk.download('nonbreaking_prefixes')  # presumably needed by MosesTokenizer - TODO confirm
nltk.download('punkt')  # presumably needed by nltk.word_tokenize - TODO confirm

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jfuentes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /Users/jfuentes/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /Users/jfuentes/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jfuentes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
##Needed to get access to mappening.utils.database since this is under mappening.ml
import sys
sys.path.insert(0,'./../..')

from mappening.utils.database import ucla_events_collection, events_ml_collection
import re
import nltk
import string
from tqdm import tqdm
import numpy as np
from nltk.tokenize.moses import MosesTokenizer
from collections import Counter


from gensim.models import Phrases
from nltk.corpus import wordnet as wn

In [22]:
class PreprocessText:
    """Tokenize, clean and phrase-merge the text of categorized events.

    Attributes set by __init__:
        X: list of token lists (one per event) after cleaning and phrase merging
        Y: list of category labels, aligned with X
        phraseMl: the gensim Phrases model fitted on the cleaned tokens; it can be
            trained further and reused
        phrases: set of every token in X that contains '_'
    """

    # Stop list shared by all instances; built on first use so that it is not
    # recomputed for every event.
    _stoplist = None

    def __init__(self, categorizedEvents):
        """Build X, Y, phraseMl and phrases from the given events.

        categorizedEvents should be a list of dictionaries, each corresponding to
        an event and providing the keys 'name', 'description' and 'category'.
        """
        self.X = []
        self.Y = []
        tokenizer = MosesTokenizer()  # tokenizers are basically an advanced split
        for e in categorizedEvents:
            text = e[u'name'] + " " + e[u'description']
            self.X.append(self.preprocess(tokenizer.tokenize(text)))
            self.Y.append(e[u'category'])
        self.phraseMl = Phrases(self.X, min_count=3)  # learn ml model for phrases
        self.X = list(self.phraseMl[self.X])  # use ml model for phrases
        # Applying self.phraseMl to self.X a second time would yield triples.
        self.phrases = set(w for doc in self.X for w in doc if '_' in w)

    def matchNotX(self, strg, search=re.compile(r'[^!#$%&()*+,-./:;<=>?@\\^_}|{~0123456789]').search):
        """Return True when strg has at least one character that is neither a digit
        nor one of the punctuation characters listed in the pattern.

        Quotes, square brackets and the backtick are not listed in the pattern, so
        a token made only of those still counts as a match.
        """
        return bool(search(strg))

    def _getStoplist(self):
        """Return the shared stop list, building it on the first call."""
        if PreprocessText._stoplist is None:
            extra = set('for a of the and to in . / '.split())
            PreprocessText._stoplist = frozenset(
                set(nltk.corpus.stopwords.words('english')) | extra | set(string.punctuation)
            )
        return PreprocessText._stoplist

    def preprocess(self, text):
        """Remove all useless words and punctuation, make lowercase.

        text is an iterable of tokens. Returns a list holding, for every token
        that is kept, the token stripped of surrounding punctuation and lowercased.
        A token is dropped when it is a stop word (compared case-insensitively),
        when matchNotX rejects it, or when nothing is left after stripping.
        """
        stoplist = self._getStoplist()
        cleaned = []
        for word in text:
            if word in stoplist or not self.matchNotX(word):
                continue
            token = word.strip(string.punctuation).lower()
            # The stop list is lowercase, so compare the normalized token as well;
            # otherwise capitalized stop words such as "The" slip through.
            if token and token not in stoplist:
                cleaned.append(token)
        return cleaned

    def topBigrams(self, texts, n, tri=False, minFreq=7):
        """Other method of getting phrases, currently unused because the Phrases
        model can be further trained (online) and saved.

        texts: iterable of token lists; they are flattened into one word sequence
        n: number of collocations to return
        tri: rank trigrams instead of bigrams when True
        minFreq: value passed to the finder's apply_freq_filter
        Returns the finder's n best collocations ranked by PMI.
        """
        flatTexts = [word for text in texts for word in text]
        if tri:
            measures = nltk.collocations.TrigramAssocMeasures()
            finder = nltk.collocations.TrigramCollocationFinder.from_words(flatTexts)
        else:
            measures = nltk.collocations.BigramAssocMeasures()
            finder = nltk.collocations.BigramCollocationFinder.from_words(flatTexts)
        finder.apply_freq_filter(minFreq)
        return finder.nbest(measures.pmi, n)

In [23]:
def gatherCategorizedEvents():
    """Load the labelled events used for training.

    Reads every document from events_ml_collection (projected to category,
    description and name), keeps those that carry all three fields, and passes
    them through reduceCategories. Prints how many documents were read and how
    many remain, then returns the remaining events as a list of dicts.
    """
    projection = {"category": 1, "description": 1, "name": 1, "_id": 0}
    requiredFields = ('category', 'description', 'name')
    allCategorizedEvents = []
    count = 0
    for e in events_ml_collection.find({}, projection):
        count += 1
        if all(field in e for field in requiredFields):
            allCategorizedEvents.append(e)
    modernEvents = reduceCategories(allCategorizedEvents)
    # Single formatted argument: prints the same text under Python 2 and Python 3.
    print("%d total events, learning from the %d well categorized events" % (count, len(modernEvents)))
    return modernEvents

## Event Categorization

In [24]:
def someCurrentCategories():
    """Print the sorted, de-duplicated categories found in ucla_events_collection.

    Only events that have a category, a description and a name are considered.
    Intended for refreshing the categories list if Facebook changes its events
    in the future.
    """
    fields = {"category": 1, "description": 1, "name": 1, "_id": 0}
    seenCategories = set()
    for event in ucla_events_collection.find({}, fields):
        if 'category' in event and 'description' in event and 'name' in event:
            seenCategories.add(event['category'])
    print(sorted(seenCategories))
    
# Hand-maintained reference list of category labels, in alphabetical order.
# NOTE(review): presumably a snapshot of what someCurrentCategories() printed at some
# point; nothing in the visible cells reads this list - confirm it is still needed.
curListOfCategories = [u'ART', u'CAUSE', u'COMEDY_PERFORMANCE', u'DANCE', u'DRINKS', u'FILM', u'FITNESS', u'FOOD',
                       u'GAMES', u'GARDENING', u'HEALTH', u'LITERATURE', u'MEETUP', u'MUSIC', u'NETWORKING', u'PARTY',
                       u'RELIGION', u'SHOPPING', u'SPORTS', u'THEATER', u'WELLNESS']

In [25]:
def reduceCategories(events):
    """Rename mapped categories and drop every event that ends up as OTHER.

    events is a list of dicts with a 'category' key. Categories listed in the
    mapping are rewritten in place on the given dicts; all other categories are
    left untouched. Returns a new list holding the same dict objects, minus
    those whose category is OTHER, which are thereby discarded from the
    training data.
    """
    renamed = {
        # merged into another category
        u'BOOK': u'LITERATURE',
        u'COMEDY': u'COMEDY_PERFORMANCE',
        u'DINING': u'FOOD',
        u'FOOD_TASTING': u'FOOD',
        u'FESTIVAL': u'PARTY',
        u'FUNDRAISER': u'CAUSE',
        u'VOLUNTEERING': u'CAUSE',
        u'MOVIE': u'FILM',
        u'RELIGIOUS': u'RELIGION',
        # folded into OTHER, which is filtered out below
        u'CLASS': u'OTHER',
        u'FAMILY': u'OTHER',
        u'LECTURE': u'OTHER',
        u'NEIGHBORHOOD': u'OTHER',
        u'NIGHTLIFE': u'OTHER',
        u'WORKSHOP': u'OTHER',
    }

    for event in events:
        replacement = renamed.get(event['category'])
        if replacement is not None:
            event['category'] = replacement

    kept = []
    for event in events:
        if event['category'] != u'OTHER':
            kept.append(event)
    return kept
    

In [26]:
# Load the labelled events, then build two parallel lists:
#   skText   - "<name> <description>" for every event
#   skTarget - the category of the same event
X = gatherCategorizedEvents()
skText = []
skTarget = []
for e in X:
    skText.append(e['name'] + ' ' + e['description'])
    skTarget.append(e['category'])

6936 total events, learning from the 2828 well categorized events


## ML Helper Functions

In [27]:
try:
    # Current home of train_test_split.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the original module.
    from sklearn.cross_validation import train_test_split

def train(classifier, X, y, trails=25):
    """Estimate accuracy over repeated random splits, then fit on all the data.

    classifier: object exposing fit(X, y) and score(X, y)
    X, y: samples and their labels, in a form train_test_split accepts
    trails: number of random 75/25 train/test splits to evaluate (the parameter
        name is kept as-is for existing callers)

    Each split uses its loop index as random_state, so runs are repeatable.
    Prints the mean test accuracy and returns the classifier refitted on all
    of X and y.
    """
    scores = np.zeros(trails)
    for i in tqdm(range(trails)):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=i)

        classifier.fit(X_train, y_train)
        scores[i] = classifier.score(X_test, y_test)
    # Single formatted argument: prints the same text under Python 2 and Python 3.
    print("Average Accuracy over %d trials: %s" % (trails, np.mean(scores)))
    classifier.fit(X, y)
    return classifier
 



In [28]:
def predictList(vectorizer, classifier, x):
    """Print the raw inputs, then the classifier's predictions for them.

    vectorizer: object exposing transform(x)
    classifier: object exposing predict(features)
    x: the raw samples handed to the vectorizer
    Nothing is returned.
    """
    print(x)
    features = vectorizer.transform(x)
    print(classifier.predict(features))


In [29]:
def giveProbPerCategory(vectorizer, classifier, x, threshold=.15):
    """Return the categories whose predicted probability exceeds threshold.

    vectorizer: object exposing transform(x)
    classifier: object exposing predict_proba(features) and classes_
    x: the raw samples handed to the vectorizer; only the first row of the
        predicted probabilities is inspected
    threshold: a category is reported when its probability is strictly greater

    Prints x, then returns the list of qualifying categories in classes_ order.
    When none qualifies, returns a one-element list with the most probable
    category (the first one on ties).
    """
    print(x)
    probabilities = classifier.predict_proba(vectorizer.transform(x))

    bestLabel, bestScore = '', 0
    qualifying = []
    for idx, label in enumerate(classifier.classes_):
        score = probabilities[0][idx]
        if score > bestScore:
            bestLabel, bestScore = label, score
        if score > threshold:
            qualifying.append(label)

    return qualifying if qualifying else [bestLabel]

# Model Making

In [30]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
 
# Baseline: TF-IDF features into a multinomial Naive Bayes classifier, both with
# their default settings.
trial1 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
])
 
# train() prints the averaged accuracy and returns the pipeline fitted on all the data.
nbModel = train(trial1, skText, skTarget)

100%|██████████| 25/25 [00:13<00:00,  1.84it/s]


Average Accuracy over 25 trials: 0.5043847241867043


In [205]:
# Same pipeline as the baseline except for alpha=0.05 on the classifier.
trial2 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB(alpha=0.05)),
])

# Rebinds nbModel to this pipeline, fitted on all the data.
nbModel = train(trial2, skText, skTarget)

100%|██████████| 25/25 [00:11<00:00,  2.23it/s]


Average Accuracy over 25 trials: 0.7001047120418847


0.7022687609075043 without stoplist 

0.7209773123909248 with my stoplist

In [206]:
# Custom stop list: NLTK's English stop words, a few extra tokens (including
# "ucla"), and every punctuation character as its own entry.
stoplist = 'for a of the and to in ucla . / '.split()
# list(...) splits the punctuation string into single characters; .split() would
# return the whole string as one entry because it contains no whitespace.
stoplist = nltk.corpus.stopwords.words('english') + stoplist + list(string.punctuation)
# NOTE(review): stoplist is not passed to the vectorizer below, which uses
# scikit-learn's built-in English list; use stop_words=stoplist to try this one.

# create the transform
vectorizer = TfidfVectorizer(stop_words='english')

# tokenize and build vocab
X = vectorizer.fit_transform(skText)

# print(vectorizer.vocabulary_)
# print(vectorizer.idf_)

# Train directly on the TF-IDF matrix; nbModel ends up fitted on all of X.
nbModel = MultinomialNB(alpha=0.05)
nbModel = train(nbModel, X, skTarget)

100%|██████████| 25/25 [00:00<00:00, 49.76it/s]

Average Accuracy over 25 trials: 0.7225130890052357





0.7177661431064574 with the Porter stemmer and matchNotX

0.7091099476439789 just snowball stemmer

In [31]:
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize

def matchNotX(strg, search=re.compile(r'[^!#$%&()*+,-./:;<=>?@\\^_}|{~0123456789]').search):
    """Tell whether strg holds anything besides digits and the listed punctuation.

    Returns True when at least one character of strg falls outside the excluded
    set in the pattern, and False otherwise (including for an empty string).
    """
    return search(strg) is not None

def stemming_tokenizer(text):
    """Split text with word_tokenize and return the English Snowball stem of each token."""
    stem = SnowballStemmer('english').stem  # stemming folds plural forms onto one token
    return [stem(token) for token in word_tokenize(text)]

# TF-IDF over Snowball-stemmed tokens, still with scikit-learn's built-in English stop list.
# NOTE(review): the tokens are stemmed but the stop list is not - confirm the two still line up.
vectorizer = TfidfVectorizer(tokenizer=stemming_tokenizer, stop_words='english') 
X = vectorizer.fit_transform(skText)
# Rebinds nbModel to a classifier fitted on all of X.
nbModel = MultinomialNB(alpha=0.05)
nbModel = train(nbModel, X, skTarget)

100%|██████████| 25/25 [00:00<00:00, 45.23it/s]

Average Accuracy over 25 trials: 0.7057425742574257





In [17]:
# Signature, for reference: giveProbPerCategory(vectorizer, classifier, x, threshold=.15)
# Probe a single event; the [5:6] slice keeps the input a one-element list.
giveProbPerCategory(vectorizer, nbModel, skText[5:6])

[u'A Funny Thing Happened on the Way to the Gynecologic Oncology A Funny Thing Happened on the Way to the Gynecologic Oncology Unit at Memorial Sloan Kettering Cancer Center of New York City\nSeptember 5 - October 8, 2017 \nGil Cates Theater\nWritten by Halley Feiffer\nDirected by Trip Cullman\nFeaturing Halley Feiffer, Jason Butler Harner, Eileen T\'Kaye & JoBeth Williams\n\n"Under Trip Cullman\'s perceptive direction, Halley Feiffer\'s distinct voice is on fine display throughout, in all its uniquely unsettling glory."  \u2013 TheaterMania \n\nSitting bedside at Memorial Sloan Kettering has never been so entertaining. While their ailing mothers share a hospital room, Karla and Don discover truth in the old clich\xe9 that opposites attract\u2026and repel\u2026and attract.']


IndexError: invalid index to scalar variable.

TESTING

In [188]:
# NOTE(review): tokenizer is assigned here but not used in this cell.
tokenizer = vectorizer.build_tokenizer()

# Print every vocabulary entry that matchNotX rejects, i.e. entries made up only
# of digits and the punctuation characters listed in its pattern.
for w in vectorizer.vocabulary_:
    if not matchNotX(w):
        print(w)

04
03
05
09
11
12
19
18
66
68
89
98
91
90
94
96
02
92


In [20]:
# Distinct training labels in sorted order; skTarget must already be defined.
sorted(set(skTarget))

NameError: name 'skTarget' is not defined

In [21]:
# Print the sorted categories currently present in ucla_events_collection.
someCurrentCategories()

[u'ART', u'CAUSE', u'COMEDY_PERFORMANCE', u'CONFERENCE', u'DANCE', u'FILM', u'FOOD', u'GARDENING', u'HEALTH', u'LITERATURE', u'MUSIC', u'NEIGHBORHOOD', u'NETWORKING', u'PARTY', u'SPORTS', u'THEATER', u'WELLNESS']
