In [8]:
import numpy as np
import pandas as pd
import json
from dateutil import parser
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB

In [9]:
# Load the events file; `data` is indexed by campus key in the cells below.
with open('../data/Final.json') as source:
    data = json.load(source)

In [10]:
# Flatten the tag lists of every record, campus by campus, into one list.
old_tags = [
    tag
    for campus in ['CALSTATE', 'CALTECH', 'USC', 'UCLA']
    for record in data[campus]
    for tag in record['tags']
]

In [11]:
# Lookup table from each raw event tag string to its consolidated category.
# Keys are looked up verbatim elsewhere in this notebook (new_tags[x]), so
# leading/trailing spaces and letter case are significant; that is why
# variants such as ' Music' and 'music' each have their own entry.
new_tags = {u' Careers / Career Center': 'Career',
 u' Commencement': 'Social',
 u' Commencement / East Asian Studies Center (EASC)': 'Social',
 u' Commencement / LGBTQ Resource Center': 'Social',
 u' Conference/Symposia': 'Lecture_Talk_Workshop',
 u' Conference/Symposia / AMP SoCal': 'Lecture_Talk_Workshop',
 u' Conference/Symposia / Early Modern Studies Institute (EMSI) ': 'Lecture_Talk_Workshop',
 u' Conference/Symposia / History': 'Lecture_Talk_Workshop',
 u' Dance/Theater': 'Dance_Theatre_Film',
 u' Dance/Theater / School of Dramatic Arts': 'Dance_Theatre_Film',
 u' Event Highlights': 'Lecture_Talk_Workshop',
 u' Event Highlights / Annenberg Special Events': 'Lecture_Talk_Workshop',
 u' Event Highlights / Dornsife Office of Communication': 'Lecture_Talk_Workshop',
 u' Event Highlights / Institute for Armenian Studies': 'Lecture_Talk_Workshop',
 u' Event Highlights / Mathematics': 'Lecture_Talk_Workshop',
 u' Event Highlights / Neuroscience Graduate Program / Viterbi School of Engineering': 'Lecture_Talk_Workshop',
 u' Event Highlights / Rossier School of Education': 'Lecture_Talk_Workshop',
 u' Event Highlights / Thornton School of Music': 'Lecture_Talk_Workshop',
 u' Event Highlights / USC Libraries / Writing Center': 'Lecture_Talk_Workshop',
 u' Exhibit': 'Social',
 u' Exhibit / USC Libraries': 'Social',
 u' Film Screening': 'Dance_Theatre_Film',
 u' Film Screening / Comparative Literature (COLT) / Latin American and Iberian Cultures': 'Dance_Theatre_Film',
 u' Film Screening / German Studies Program': 'Dance_Theatre_Film',
 u' Film Screening / Jesse M. Unruh Institute of Politics': 'Dance_Theatre_Film',
 u' Free Food': 'Social',
 u' Free Food / English ': 'Social',
 u' Free Food / Office of International Services': 'Social',
 u' Free Food / Office of Religious Life': 'Social',
 u' Free Food / Rossier School of Education': 'Social',
 u' Free Food / School of Pharmacy': 'Social',
 u' Free Food / Thematic Option': 'Social',
 u' Lecture / Talk / Workshop': 'Lecture_Talk_Workshop',
 u' Lecture / Talk / Workshop / Anthropology': 'Lecture_Talk_Workshop',
 u' Lecture / Talk / Workshop / Creative Writing & Literature': 'Lecture_Talk_Workshop',
 u' Lecture / Talk / Workshop / Pacific Asia Museum': 'Lecture_Talk_Workshop',
 u' Music': 'Music',
 u' Music / Early Modern Studies Institute (EMSI) ': 'Music',
 u' Music / History': 'Music',
 u' Music / Thornton School of Music': 'Music',
 u' Music / Thornton Student Recitals': 'Music',
 u' Reception': 'Student',
 u' Reception / East Asian Studies Center (EASC)': 'Student',
 u' Student Life': 'Student',
 u' Student Life / East Asian Studies Center (EASC)': 'Student',
 u' Student Life / Engemann Student Health Center' : 'Wellness',
 u' Student Life / Visions and Voices: The Arts and Humanities Initiative': 'Student',
 u'App - ResX': 'Social',
 u'Arts': 'Art',
 u'Arts / Pacific Asia Museum': 'Art',
 u'Biokinesiology and Physical Therapy': 'Lecture_Talk_Workshop',
 u'Building Wide Event': 'Lecture_Talk_Workshop',
 u'Career': 'Career',
 u'Career and Professional': 'Career',
 u'Careers': 'Career',
 u'Careers / Career Center': 'Career',
 u'Commencement': 'Social',
 u'Commencement / Biokinesiology and Physical Therapy': 'Social',
 u'Commencement / Mrs. T.H. Chan Division of Occupational Science and Occupational Therapy': 'Social',
 u'Commencement / School of Pharmacy': 'Social',
 u'Community Council': 'Social',
 u'Conference/Symposia': 'Lecture_Talk_Workshop',
 u'Conference/Symposia / AMP SoCal': 'Lecture_Talk_Workshop',
 u'Conference/Symposia / Biokinesiology and Physical Therapy': 'Lecture_Talk_Workshop',
 u'Conference/Symposia / Department of Economics': 'Lecture_Talk_Workshop',
 u'Conference/Symposia / History': 'Lecture_Talk_Workshop',
 u'Conference/Symposia / Keck School of Medicine': 'Lecture_Talk_Workshop',
 u'Conference/Symposia / Korean Studies Institute (KSI)': 'Lecture_Talk_Workshop',
 u'Conference/Symposia / Marshall School of Business': 'Lecture_Talk_Workshop',
 u'Conference/Symposia / Preventive Medicine': 'Lecture_Talk_Workshop',
 u'Conference/Symposia / Price School of Public Policy': 'Lecture_Talk_Workshop',
 u'Conference/Symposia / Stem Cell at USC': 'Lecture_Talk_Workshop',
 u'Conference/Symposia / Stevens Center for Innovation': 'Lecture_Talk_Workshop',
 u'Conference/Symposia / Thematic Option': 'Lecture_Talk_Workshop',
 u'Conferences, Lectures, & Seminars': 'Lecture_Talk_Workshop',
 u'Free Food': 'Social',
 u'Free Food / Office of Religious Life': 'Social',
 u'Language Center': 'Student',
 u"Late Night 'SC": 'Social',
 u'Lecture / Talk / Workshop': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / AMP SoCal': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Anthropology': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Biokinesiology and Physical Therapy': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Campus Activities': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Center for Craniofacial Molecular Biology': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Department of Chemistry': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Emeriti Center': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Gould School of Law': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / History': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / History / Institute on California and the West (ICW) ': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Keck School of Medicine': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Korean Studies Institute (KSI)': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / LGBTQ Resource Center': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Levan Institute for Humanities and Ethics': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Mathematics': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Molecular and Computational Biology (MCB)': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Neuroscience Graduate Program': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Office of Religious Life': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Office of Research': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Pacific Asia Museum': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Preventive Medicine': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Price School of Public Policy': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Psychology': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Rossier School of Education': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Shinso Ito Center for Japanese Religions and Culture': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Stem Cell at USC': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / USC Dornsife Institute for New Economic Thinking (INET)': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Visions and Voices: The Arts and Humanities Initiative / Jesse M. Unruh Institute of Politics / Political Science': 'Lecture_Talk_Workshop',
 u'Lecture / Talk / Workshop / Visual Studies Research Institute (VSRI)': 'Lecture_Talk_Workshop',
 u'Mrs. T.H. Chan Division of Occupational Science and Occupational Therapy': 'Student',
 u'Norris Medical Library': 'Wellness',
 u'Off-Campus': 'Social',
 u'Office of Religious Life': 'Social',
 u'RA Floor Program': 'Student',
 u'Reception': 'Student',
 u'Reception / East Asian Studies Center (EASC)': 'Student',
 u'Reception / Korean Studies Institute (KSI)': 'Student',
 u'Reception / Price School of Public Policy': 'Student',
 u'Reception / School of Pharmacy': 'Student',
 u'Reception / USC Libraries': 'Student',
 u'Receptions & Special Events': 'Student',
 u'Rossier School of Education': 'Student',
 u'Service': 'Student',
 u'Social': 'Social',
 u'Student Activity': 'Student',
 u'Student Life': 'Student',
 u'Student Life / Dentistry': 'Student',
 u'Student Life / Engemann Student Health Center': 'Wellness',
 u'Student Life / Eric Cohen Health Center': 'Wellness',
 u'Student Life / Office of International Services': 'Student',
 u'Student Life / Office of Religious Life': 'Student',
 u'Student Life / School of Pharmacy': 'Student',
 u'Thornton Student Recitals': 'Music',
 u'University Calendar': 'Social',
 u'Visions and Voices: The Arts and Humanities Initiative': 'Dance_Theatre_Film',
 u'Wellness': 'Wellness',
 u'Workshops & Infosessions': 'Lecture_Talk_Workshop',
 u'alumni': 'Alumni',
 u'art': 'Art',
 u'lectures, readings, symposia': 'Lecture_Talk_Workshop',
 u'music': 'Music',
 u'special event': 'Social',
 u'student': 'Social',
 u'training and workshops': 'Lecture_Talk_Workshop',
 u'video and film': 'Dance_Theatre_Film'}

In [12]:
# Replace every record's raw tags with their consolidated categories.
#
# * Empty-string placeholders (e.g. a tag list of ['']) and missing/empty tag
#   values are normalised to an empty list.
# * Duplicates are removed, and the result is sorted so that its order is
#   deterministic: a later cell keeps only the first element as the label,
#   which must not depend on set iteration order.
# * A raw tag that is absent from the new_tags table raises KeyError.
for key in ['CALSTATE', 'CALTECH', 'USC', 'UCLA']:
    for record in data[key]:
        raw_tags = [tag for tag in (record['tags'] or []) if tag != '']
        record['tags'] = sorted(set(new_tags[tag] for tag in raw_tags))

In [13]:
# Gather the tags of every record so the resulting label set can be inspected.
# NOTE: this rebinds `new_tags`, which held the raw-tag -> category dict up to
# this point, to a plain list.
new_tags = [
    tag
    for campus in ['CALSTATE', 'CALTECH', 'USC', 'UCLA']
    for record in data[campus]
    for tag in record['tags']
]

In [14]:
# Display the distinct values currently held in new_tags.
set(new_tags)

{'Alumni',
 'Art',
 'Career',
 'Dance_Theatre_Film',
 'Lecture_Talk_Workshop',
 'Music',
 'Social',
 'Student',
 'Wellness'}

In [15]:
# Preprocessing DATE and TIME (and collapsing tags to a single label).
#
# For every record this cell:
#   * rewrites date_time['time'] as a space-free, lower-case string, turning
#     24-hour 'HH:MM:SS' values into 12-hour 'H:MMam' / 'H:MMpm', and uses
#     'NA' when no usable time is present;
#   * rewrites date_time['date'] as the str() of the parsed date (YYYY-MM-DD);
#   * replaces the list of tags with its first element, or 'NA' if it is empty.
#
# Re-running the cell no longer corrupts already-processed values: a time of
# 'NA' stays 'NA', and tags already collapsed to a string are left untouched
# (previously 'Social' would have become 'S').
#
# NOTE(review): dateutil fills in date fields missing from the input (such as
# the year) from the current date, so results for year-less dates depend on
# when the notebook is run -- confirm the source dates always carry a year.

# Matches the ',<month><day>' part of a time value once spaces are removed and
# the text is lower-cased, e.g. 'urday,april13'.
_saturday_date = re.compile(',([a-z]+)([0-9]+)')

for key in ['CALSTATE', 'CALTECH', 'USC', 'UCLA']:
    for record in data[key]:
        date_time = record['date_time']

        #time
        time_text = date_time.get('time')
        if time_text and time_text != 'NA':
            time_text = time_text.replace(' ', '').lower()
            if time_text.count(':') == 2:
                # Two colons: treat as 24-hour HH:MM:SS and drop the seconds.
                clock = time_text.split(':')
                hour = int(clock[0])
                minute = clock[1]
                if hour == 0:
                    time_text = '{}:{}{}'.format('12', minute, 'am')
                elif hour == 12:
                    time_text = '{}:{}{}'.format('12', minute, 'pm')
                elif hour > 12:
                    time_text = '{}:{}{}'.format(hour % 12, minute, 'pm')
                else:
                    time_text = '{}:{}{}'.format(hour, minute, 'am')
            if time_text == 'allday':
                time_text = 'NA'
            if time_text.startswith('0'):
                # Drop a single leading zero, e.g. '09:00am' -> '9:00am'.
                time_text = time_text[1:]
            if time_text.startswith('urday'):
                # The time field holds the tail of a date instead of a time
                # (presumably a 'Saturday, <month> <day>' value that was split
                # badly upstream): rebuild the date and mark the time missing.
                found = _saturday_date.search(time_text)
                month = found.group(1).capitalize()
                _date = found.group(2)
                date_time['date'] = 'Saturday, {} {}'.format(month, _date)
                time_text = 'NA'
            date_time['time'] = time_text
        else:
            date_time['time'] = 'NA'

        #date
        date_time['date'] = str(parser.parse(date_time['date']).date())

        #tags
        tags = record['tags']
        if isinstance(tags, list):
            record['tags'] = tags[0] if len(tags) > 0 else 'NA'

In [16]:
# Stamp each record with the campus key it is stored under.
for campus in ['CALSTATE', 'CALTECH', 'USC', 'UCLA']:
    for record in data[campus]:
        record[u'univ'] = campus

In [17]:
# Merge the per-campus record lists into one flat list, keeping campus order.
# The list holds the same record objects as `data`, not copies.
final_data = [
    record
    for campus in ['CALSTATE', 'CALTECH', 'USC', 'UCLA']
    for record in data[campus]
]

In [18]:
# Give every record a sequential id starting at 1.
for position, entry in enumerate(final_data, 1):
    entry[u'id'] = position

In [19]:
# Build one (id, text, tag) row per record, where text is "<title>. <description>"
# stripped of surrounding whitespace, then split the rows by whether the record
# already has a tag ('NA' marks records without one).
#
# The loop variable is deliberately not called `data`: in Python 2 a
# list-comprehension variable leaks into the cell's namespace and would
# overwrite the loaded `data` dict, breaking any re-run of earlier cells.
rows = [
    (record['id'], (record['title'] + '. ' + record['description']).strip(), record['tags'])
    for record in final_data
]
train = [row for row in rows if row[2] != 'NA']
pred = [row for row in rows if row[2] == 'NA']

In [20]:
# Row counts: records that already have a tag (train) vs. records tagged 'NA' (pred).
len(train), len(pred)

(771, 502)

In [21]:
# Turn both row collections into DataFrames sharing one column layout.
frame_columns = ['id', 'desc', 'tags']
train = pd.DataFrame(data=train, columns=frame_columns)
pred = pd.DataFrame(data=pred, columns=frame_columns)

### Training and Validation on TFIDF vectors

In [22]:
# Stack the train and pred descriptions into one Series with a fresh
# 0..n-1 index; this is the corpus the vectorizer is fitted on.
text = pd.concat([train['desc'], pred['desc']], ignore_index=True)

In [23]:
# Vectorizer settings, gathered in one place so they are easy to tune.
tfidf_options = dict(
    analyzer='word',
    strip_accents='unicode',
    token_pattern=r'[a-z]{3,}',   # tokens are runs of at least three lower-case letters
    stop_words='english',
    ngram_range=(1, 2),           # unigrams and bigrams
    sublinear_tf=True,
    max_df=500,                   # integer, i.e. an absolute document count
    max_features=1000,
)
tfidf = TfidfVectorizer(**tfidf_options)

# The vocabulary is fitted on all descriptions (train and pred together);
# only the train rows are transformed here.
tfidf.fit(text)
train_features = tfidf.transform(train['desc'])

In [24]:
# Hold out 20% of the tagged rows for validation; the seed is fixed so the
# split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train_features.toarray(),
    train['tags'],
    random_state=0,
    test_size=0.2,
)

In [25]:
# Fit a multinomial Naive Bayes classifier on the training split, then
# predict the validation split for scoring.
model = MultinomialNB().fit(X_train, y_train)
actual = y_val
prediction = model.predict(X_val)

In [27]:
# Weighted-average F1 of the validation predictions.
weighted_f1 = f1_score(actual, prediction, average='weighted')
print("Weighted F1 score = {}".format(weighted_f1))

Weighted F1 score = 0.783747665877


### Predictions

In [28]:
# Vectorize the untagged descriptions with the already-fitted vectorizer.
pred_features = tfidf.transform(pred['desc'])

In [29]:
# Predict a tag for every untagged row (features are densified first).
predictions = model.predict(pred_features.toarray())

In [30]:
# Overwrite the 'NA' placeholders in pred with the predicted tags (mutates pred in place).
pred['tags'] = predictions

In [31]:
# Lookup from record id (first column of pred) to predicted tag (last column).
d = {row[0]: row[-1] for row in pred.values}

In [32]:
# Fill in the predicted tag for every record still marked 'NA' and flag it:
# pred == 1 means the tag came from the model, pred == 0 means it was already
# present. The loop variable is named `record` rather than `data` so the
# loaded `data` dict is not overwritten by the last record.
for record in final_data:
    if record['tags'] == 'NA':
        record['tags'] = d[record['id']]
        record['pred'] = 1
    else:
        record['pred'] = 0

In [34]:
# Append the title to each record's description ("<description>. <title>",
# stripped of surrounding whitespace). The loop variable is named `record`
# rather than `data` so the loaded `data` dict is not overwritten.
# NOTE: this cell is not idempotent -- every extra run appends the title again.
for record in final_data:
    record['description'] = (record['description'] + '. ' + record['title']).strip()

In [36]:
# Write two JSON lines per record: an {'index': {'_id': <id>}} action line
# followed by the record itself.
# NOTE(review): this looks like the Elasticsearch bulk-index layout -- confirm.
with open('final_data.json', 'w') as out:
    for record in final_data:
        action = {'index': {'_id': record[u'id']}}
        out.write(json.dumps(action) + "\n")
        out.write(json.dumps(record) + "\n")

In [39]:
# Scratch data: a few YYYY-MM-DD date strings in non-chronological order.
s = ['2019-04-13', '2018-04-13', '2019-12-13', '2019-04-03']

In [40]:
# Scratch check: plain string sorting puts YYYY-MM-DD dates in chronological order.
sorted(s)

['2018-04-13', '2019-04-03', '2019-04-13', '2019-12-13']