In [1]:
import nltk
nltk.download('stopwords')
nltk.download('perluniprops')
nltk.download('nonbreaking_prefixes')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jfuentes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /Users/jfuentes/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /Users/jfuentes/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


True

In [2]:
from ourDb import events_collection, total_events_collection
import re
import nltk
import string
from nltk.tokenize.moses import MosesTokenizer

from gensim.models import Phrases
from nltk.corpus import wordnet as wn

In [5]:
class PreprocessText:
    def __init__(self, categorizedEvents):
        """categorizedEvents should be a list of dictionaries each corresponding to an event
            X is the tokenized preprocessed text
            Y is the corresponding categories
            phraseMl is the phrase model that can further trained and used
            phrases is a list of all the phrases identified"""
        self.X = []
        self.Y = []
        tokenizer = MosesTokenizer() #tokenizers are basically an advanced split
        for e in categorizedEvents:
            text = e[u'name'] + " " + e[u'description']
            text = tokenizer.tokenize(text)
            text = self.preprocess(text)
            self.X.append(text)
            self.Y.append(e[u'category'])
        self.phraseMl = Phrases(self.X, min_count=3) #learn ml model for phrase
        self.X = list(self.phraseMl[self.X]) #use ml model for phrases
        self.X = list(self.phraseMl[self.X]) #get triples
        self.phrases = phrases = set([w for doc in self.X for w in doc if '_' in w])
        
    def matchNotX(self, strg, search=re.compile(r'[^!#$%&()*+,-./:;<=>?@\\^_}|{~0123456789]').search):
        """make sure word has something than punctuation"""
        return bool(search(strg)) #make sure word has something other than punctuation

    def preprocess(self, text):
        """Remove all useless words and punct, make lowercase"""
        stoplist = set('for a of the and to in . / '.split())
        stoplist = set(nltk.corpus.stopwords.words('english')) | stoplist | set(string.punctuation)
        return [word.strip(string.punctuation).lower() for word in text if word not in stoplist and self.matchNotX(word)]    
        
    def topBigrams(self, texts, n, tri=False):
        """Other method of getting phrases, currently unused because phrases can be further trained(online) and saved"""
        flatTexts = []
        for text in texts:
            for word in text:
                flatTexts.append(word)
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        trigram_measures = nltk.collocations.TrigramAssocMeasures()
        topAnswers = []
        if tri:
            finder = nltk.collocations.TrigramCollocationFinder.from_words(flatTexts)
            finder.apply_freq_filter(7)
            return finder.nbest(trigram_measures.pmi, n)
        else:
            finder = nltk.collocations.BigramCollocationFinder.from_words(flatTexts)
            finder.apply_freq_filter(7)
            return finder.nbest(bigram_measures.pmi, n)

In [6]:
def gatherCategorizedEvents():
    allCategorizedEvents = []
    allEvents = total_events_collection.find({}, {"category": 1, "description": 1, "name": 1, "_id": 0})
    count = 0
    for e in allEvents:
        count += 1
        if 'category' in e and 'description' in e and 'name' in e:
            allCategorizedEvents.append(e)
    print count, "events and using the", len(allCategorizedEvents), "categorized events"
    return allCategorizedEvents

X = gatherCategorizedEvents()
p = PreprocessText(X)

p.phrases

505 events and using the 251 categorized events


{u'2nd_3rd',
 u'35mm_b',
 u'35mm_color',
 u'35mm_technicolor',
 u'a_life',
 u'able_attend',
 u'about_exhibition',
 u'academies_\u2022',
 u'according_ucla',
 u'afro-brazilian_culture',
 u'afro-brazilian_metropolis',
 u'ages_ages',
 u'alan_k',
 u'all_day',
 u'alpha_kappa',
 u'alumni_friends',
 u'ambitious_exploration',
 u'america_apos;s',
 u'american_film',
 u'amp_b',
 u'angeles_ca',
 u'angeles_more',
 u'another_actor',
 u'apos;s_career',
 u'approx_min',
 u'armenian_studies',
 u'art_gallery',
 u'art_history',
 u'articles_en',
 u'ashe_center',
 u'assistant_professor',
 u'associate_professor',
 u'astronomy_building',
 u'attend_event',
 u'attend_presentations',
 u'attendees_rsvp',
 u'available_first',
 u'available_ucla',
 u'away_pauley',
 u'ax\xe9_bahia',
 u'b.a_\u2019',
 u'b_w',
 u'basis_if',
 u'basketball_pregame',
 u'basketball_sale',
 u'basketball_vs',
 u'bear_\u201d',
 u'beautiful_shabbat',
 u'beginning_p.m',
 u'big_band',
 u'billy_wilder',
 u'black_violin',
 u'blvd_day',
 u'book_signi

In [9]:
X = gatherCategorizedEvents()
skText = [e['name']+' '+e['description'] for e in X]
skTarget = [e['category'] for e in X]

505 events and using the 251 categorized events


In [10]:
from sklearn.cross_validation import train_test_split
 
def train(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
 
    classifier.fit(X_train, y_train)
    print "Accuracy: %s" % classifier.score(X_test, y_test)
    return classifier
 



In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
 
trial1 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
])
 
nbModel = train(trial1, skText, skTarget)

Accuracy: 0.42857142857142855


In [13]:
nbModel.predict_proba(p.X[0])

array([[0.11257104, 0.0508018 , 0.00856987, 0.00912853, 0.0043049 ,
        0.00449522, 0.22853011, 0.00894059, 0.09242311, 0.00430119,
        0.00858153, 0.00467696, 0.00896002, 0.15703394, 0.06009538,
        0.01287158, 0.00430042, 0.14144164, 0.04858232, 0.02938985],
       [0.09464797, 0.05874922, 0.01081222, 0.01083033, 0.00542392,
        0.00542015, 0.1892959 , 0.01083639, 0.09081922, 0.00542061,
        0.01082262, 0.00542208, 0.01082094, 0.14675995, 0.05872703,
        0.0162333 , 0.00541993, 0.16858409, 0.05903041, 0.03592372],
       [0.09753098, 0.06274225, 0.01073853, 0.01075651, 0.00538695,
        0.00538321, 0.18800566, 0.01076253, 0.09020019, 0.00538366,
        0.01074886, 0.00538512, 0.01074718, 0.14575964, 0.05832675,
        0.01612265, 0.00538298, 0.16743502, 0.05862806, 0.03457327],
       [0.0918303 , 0.04706191, 0.0078939 , 0.00790712, 0.00395995,
        0.0039572 , 0.26401308, 0.0082978 , 0.07393313, 0.00395754,
        0.00836401, 0.00395861, 0.00880683, 0

In [15]:
nbModel.classes_


array([u'ART', u'CAUSE', u'COMEDY_PERFORMANCE', u'CRAFTS', u'DANCE',
       u'DRINK', u'FILM', u'FITNESS', u'FOOD', u'GAMES', u'HEALTH',
       u'LITERATURE', u'MEETUP', u'MUSIC', u'NETWORKING', u'PARTY',
       u'RELIGION', u'SPORTS', u'THEATER', u'WELLNESS'], dtype='<U18')