In [31]:
import articledata
from highchartsplotter import *
import fitmodel
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.naive_bayes import BernoulliNB
from sklearn.grid_search import GridSearchCV

from sklearn.feature_selection import SelectKBest, SelectFdr, chi2
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from nltk.corpus import names

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator

from mlxtend.preprocessing import DenseTransformer

In [2]:
data = pd.read_pickle('/Users/teresaborcuch/capstone_project/notebooks/final_data_test_set.pkl')

In [14]:
test_data = data[:5]

In [12]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y = None):
        return self
    
    def transform(self,X):
        return(X[self.key])
    
class BodyTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
       
        ss = [compute_score(x) for x in X['body']]
        persons, places, males, females = count_entities(X['body'])
        
        body_array = map(list,(zip(persons, places, males, females,ss)))
        return pd.DataFrame(body_array, columns = ['body_persons', 'body_places',
                                                   'body_men', 'body_women', 'body_ss'])

class TitleTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        persons, places, males, females = count_entities(X['title'])
        ss = np.asarray([compute_score(x) for x in X['title']])
        
        title_array = map(list,(zip(persons, places, males, females, ss))) #ss
        
        return pd.DataFrame(title_array, columns = ['title_persons', 'title_places', 
                                                    'title_men', 'title_women', 'title_ss'])
    
def count_entities(col):
    # set up tagger
    os.environ['CLASSPATH'] = "/Users/teresaborcuch/stanford-ner-2013-11-12/stanford-ner.jar"
    os.environ['STANFORD_MODELS'] = '/Users/teresaborcuch/stanford-ner-2013-11-12/classifiers'
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

    tagged_titles = []
    persons = []
    places = []
    male_counts = []
    female_counts = []
    male_names = names.words("male.txt")
    female_names = names.words("female.txt")

    for x in col:
        tokens = word_tokenize(x)
        tags = st.tag(tokens)
        tagged_titles.append(tags)

    for pair_list in tagged_titles:
        person_count = 0
        place_count = 0

        for pair in pair_list:
            m_count = 0
            f_count = 0
            if pair[1] == 'PERSON':
                person_count +=1
                if pair[0] in male_names:
                    m_count +=1
                elif pair[0] in female_names:
                    f_count +=1
                else:
                    continue


            elif pair[1] == 'LOCATION':
                place_count +=1
            else:
                continue

        persons.append(person_count)
        places.append(place_count)
        male_counts.append(m_count)
        female_counts.append(f_count)
        
    return persons, places, male_counts, female_counts

class DummyMaker(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        dummies = pd.get_dummies(X)
        return dummies
    
def compute_score(sentence):
    tagger = PerceptronTagger()
    taggedsentence = []
    sent_score = []
    taggedsentence.append(tagger.tag(sentence.split()))
    wnl = nltk.WordNetLemmatizer()
    for idx, words in enumerate(taggedsentence):
        for idx2, t in enumerate(words):
            newtag = ''
            lemmatizedsent = wnl.lemmatize(t[0])
            if t[1].startswith('NN'):
                newtag = 'n'
            elif t[1].startswith('JJ'):
                newtag = 'a'
            elif t[1].startswith('V'):
                newtag = 'v'
            elif t[1].startswith('R'):
                newtag = 'r'
            else:
                newtag = ''
            if (newtag != ''):
                synsets = list(swn.senti_synsets(lemmatizedsent, newtag))
                score = 0.0
                if (len(synsets) > 0):
                    for syn in synsets:
                        score += syn.pos_score() - syn.neg_score()
                    sent_score.append(score / len(synsets))
        if (len(sent_score)==0 or len(sent_score)==1):
            return (float(0.0))
        else:
            return (sum([word_score for word_score in sent_score]) / (len(sent_score)))

class TfidfDF(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary = None):
        self.vocabulary = vocabulary
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        tv = TfidfVectorizer(preprocessor = fitmodel.tokenize, ngram_range = (1,2), vocabulary = self.vocabulary)
        tv.fit(X)
        body_feat = tv.transform(X).todense()
        df = pd.DataFrame(body_feat, columns = tv.get_feature_names())
        return df

In [120]:
tf = TfidfDF().transform(test_data['body'])

In [121]:
tf

Unnamed: 0,000,000 cancer,000 case,000 death,000 people,000 year,10,10 percent,100,100 billion,...,young gay,youre,youre discussion,youtube,youtube celebrate,youtube different,youtube see,youtube twitter,zamora,zamora mtv
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.024513,0.024513,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014487,0.0,...,0.0,0.0,0.0,0.071826,0.017956,0.017956,0.017956,0.017956,0.0,0.0
2,0.025926,0.0,0.0,0.0,0.0,0.032134,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.096577,0.014963,0.044889,0.044889,0.014963,0.0,0.014963,0.014963,0.012072,0.014963,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.026908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026908,0.026908


In [68]:
# pipe to train the model and establish the vocabulay
training_pipe = Pipeline([('features', FeatureUnion([
                                    ('titles', TitleTransformer()),
                                    ('bodies', BodyTransformer()),
                                    ('sources', Pipeline([('itemselector', ItemSelector(key = 'source')),
                                                         ('dummy', DummyMaker())])),
                                   ('tfidf-pipe', Pipeline([('itemselector', ItemSelector(key = 'body')),
                                    ('tv', TfidfVectorizer(preprocessor = fitmodel.tokenize, 
                                                                           ngram_range = (1,2))),
                                    ('dense', DenseTransformer()),
                                    ('selector', SelectKBest(score_func = chi2, k = 5))]))
                                  ])),
                         ('scale', MinMaxScaler())
                         ])


In [13]:
# transformation pipe to transform new data
transform_pipe = FeatureUnion([('titles', TitleTransformer()),
                                    ('bodies', BodyTransformer()),
                                    ('sources', Pipeline([('itemselector', ItemSelector(key = 'source')),
                                                         ('dummy', DummyMaker())])),
                                   ('tfidf-pipe', Pipeline([('itemselector', ItemSelector(key = 'body')),
                                                              ('tfidf', TfidfDF())                     ]))
                                  ])
                          

In [74]:
y = test_data['condensed_section']

In [16]:
body_feat2 = transform_pipe.fit_transform(test_data)

NameError: global name 'fitmodel' is not defined

In [136]:
body_feat2.head()

Unnamed: 0,000,000 2015,000 american,000 euro,000 job,000 member,000 mile,000 new,000 people,000 per,...,zero,zhang,zimbabwe,zionist,zip,zombie,zone,zoo,zoom,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.023754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.072619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015795,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# get vocabulary to fit the permanent model

In [3]:
tv = TfidfVectorizer(preprocessor = fitmodel.tokenize, ngram_range = (1,2), min_df = 10)
tv.fit(data['body'])
body_feat = pd.DataFrame(tv.transform(data['body']).todense(),
                         columns = tv.get_feature_names())

In [42]:
y = data['condensed_section']
selector = SelectKBest(score_func = chi2, k = 5000)
selector.fit(body_feat, y)

SelectKBest(k=5000, score_func=<function chi2 at 0x11a7ccaa0>)

In [46]:
vocab = body_feat.columns[selector.get_support()]

In [47]:
vocab

Index([u'000', u'000 people', u'000 student', u'000 year', u'05', u'06', u'10',
       u'10 point', u'10 rebound', u'106',
       ...
       u'young people', u'young player', u'young son', u'youre', u'youtube',
       u'zakaria', u'zealand', u'zimbabwe', u'zoo', u'zuckerberg'],
      dtype='object', length=5000)

# Try from import

In [35]:
#!/usr/bin/env python

from sqlalchemy import create_engine
from datetime import datetime, timedelta
import psycopg2
import string
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.sentiment.util import *
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import names
from nltk.tag.perceptron import PerceptronTagger
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
import os
import re

from nltk import WordNetLemmatizer, wordpunct_tokenize, pos_tag
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn

from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from mlxtend.preprocessing import DenseTransformer


class ArticleData():

    '''
    Usage:
    >>> from articledata import *
    >>> data = ArticleData().call()
    >>> data = get_sent_scores(data = data)
    >>> topic_data = evaluate_topic(data = data, section = 'opinion', source = 'NYT', topic = 'healthcare')
    >>> data = count_entities(data = data, title = True)
    >>> people_dict, place_dict = evaluate_entities(data = data, section = 'opinion', source = 'NYT')

    '''


    def __init__(self): pass

    def call(self):
        engine = create_engine('postgresql://teresaborcuch@localhost:5433/capstone')

        # get data from all three tables and add source column
        query1 = "SELECT DISTINCT ON(title) title, date, body, section FROM fox_news;"
        fox_data = pd.read_sql(query1, engine)
        fox_data['source'] = ['Fox']*len(fox_data)

        query2 = "SELECT DISTINCT ON(title) title, date, body, section FROM ny_times;"
        nyt_data = pd.read_sql(query2, engine)
        nyt_data['source'] = ['NYT'] * len(nyt_data)

        query3 = "SELECT DISTINCT ON(title) title, date, body, section FROM washington_post;"
        wp_data = pd.read_sql(query3, engine)
        wp_data['source'] = ['WP']*len(wp_data)

        # merge the dataframes into one big one
        data = pd.concat([nyt_data, fox_data, wp_data], axis = 0)


        # drop those with empty or suspiciously short bodies
        problem_rows = []
        for i, row in data.iterrows():
            try:
                if len(row[2]) < 200:
                    problem_rows.append(row.name)
            except TypeError:
                problem_rows.append(row.name)

        data = data.drop(data.index[problem_rows])

        # fix the dates
        new_dates = []
        for x in data['date']:
            if type(x) == int:
                x = str(x)
                x = (x[:4] + '-' + x[4:6] + '-' + x[6:8]).replace(' 00:00:00','')
                x = datetime.strptime(x, '%Y-%m-%d')
                new_dates.append(x)
            else:
                x = str(x).replace(' 00:00:00','')
                x = datetime.strptime(x, '%Y-%m-%d')
                new_dates.append(x)

        data['date'] = new_dates

        # eliminate | Fox News from titles
        clean_titles = []
        for x in data['title']:
            match = re.search('\|.*$', x)
            if match:
                clean_x = re.sub('\|.*$','',x)
                clean_titles.append(clean_x)
            else:
                clean_titles.append(x)
        data['title'] = clean_titles


        # create the condensed section
        def condense_section(x):
            if 'world' in x:
                section = 'world'
            elif 'pinion' in x:
                section = 'opinion'
            elif ('business' in x) or ('tech' in x):
                section = 'bus_tech'
            elif ('entertain' in x) or ('art' in x) or ('theater' in x) or ('book' in x) or ('movie' in x) or ('travel' in x) or ('fashion' in x) or ('style' in x) or ('dining' in x):
                section = 'entertainment'
            elif 'sport' in x:
                section = 'sports'
            elif ('health' in x) or ('science' in x) or ('well' in x):
                section = 'sci_health'
            elif ('education' in x):
                section = 'education'
            elif ('olitic' in x) or ('us' in x) or ('national' in x) or ('powerpost' in x):
                section = 'politics'
            else:
                section = 'other'
            return section

        data['condensed_section'] = [condense_section(x) for x in data['section']]
        data = data.reset_index(drop = True)
        mask1 = data['condensed_section'] != 'other'
        mask2 = data['condensed_section'] != 'education'
        data = data[mask1 & mask2]

        return data

class EvaluateTime():
    '''
    Usage:
    >>> et = EvaluateTime(data = data, source = 'NYT', section = 'politics', topic = 'health')
    >>> et.plot_time()
    '''
    def __init__(self, data = None, section = None, source = None, topic = None, date = None):
        self.data = data
        self.section = section
        self.source = source
        self.topic = topic
        self.date = date

    def call(self):
        #self.plot_date_dict,
        self.range_date_dict, self.groupings = self.make_dict()
        return self

    def make_dict(self):
        # define masks
        section_mask = (self.data['condensed_section'] == self.section)
        source_mask = (self.data['source'] == self.source)
        date_mask = (self.data['date'] > self.date)

        # initialize lists for plot_date_dict
        topic_scores = []
        dates = []
        groupings = []

        # initialize other dict
        range_date_dict = {}

        if not self.date:
            print "Please select a start date."

        # make plot_date_dict from appropriate subset of data
        else:
            if self.section and self.source:
                masked_data = self.data[section_mask & source_mask & date_mask]

            elif self.section and (not self.source):
                masked_data = self.data[section_mask & date_mask]

            elif self.source and (not self.section):
                masked_data = self.data[source_mask & date_mask]

            else:
                masked_data = self.data[date_mask]

            for i, row in masked_data.iterrows():

                if self.topic in row[2]:
                    topic_scores.append(row[6]) #body score
                    dates.append(row[1])
                    score_title_date = (row[0], row[1], row[6])
                    groupings.append(score_title_date)


                    # add to range_date_dict where keys are the dates and the values are a list of scores
                    if row[1] not in range_date_dict.keys():
                        range_date_dict[row[1]] = [row[6]]

                    elif row[1] in range_date_dict.keys():
                        (range_date_dict[row[1]]).append(row[6])

        return range_date_dict, groupings


    def plot_time(self):

        x = self.range_date_dict.keys()
        x.sort()
        ordered_x = []
        y = []
        for val in x:
            ordered_x.append(val)
            values = self.range_date_dict[val]
            mean = np.mean(values)
            y.append(mean)

        # define upper and lower boundaries for error bars
        upper_bounds = [max(self.range_date_dict[x]) for x in ordered_x]
        lower_bounds = [min(self.range_date_dict[x]) for x in ordered_x]

        # define distance for upper error bar
        y_upper = zip(y, upper_bounds)
        upper_error = [abs(pair[0] - pair[1]) for pair in y_upper]

        # define distance for lower error bar
        y_lower = zip(y, lower_bounds)
        lower_error = [abs(pair[0] - pair[1]) for pair in y_lower]

        asymmetric_error = [lower_error, upper_error]

        plt.plot(ordered_x, y, c = 'r', marker = 'o')
        plt.errorbar(ordered_x, y, yerr = asymmetric_error, ecolor = 'r', capthick = 1)
        plt.xlim(min(ordered_x) + timedelta(days = -1), max(ordered_x) + timedelta(days = 1))
        plt.xticks(rotation = 70)
        plt.show()

class HighChartPlotter():
    def __init__(self, et):
        self.et = et

    def call(self):

        self.x_dates, self.y_means, self.error_pairs, self.date_list = self.get_plotting_data()

        self.groups = self.et.groupings

        self.spline_series = self.get_spline_series()

        self.error_bar_series = self.get_error_bar_series()

        self.min_titles, self.max_titles = self.get_titles()

        self.min_scatter_series, self.max_scatter_series = self.get_scatter_points()


    def get_plotting_data(self):
        # get dates for x-axis
        date_list = self.et.range_date_dict.keys()
        date_list.sort()
        x_dates = [x.value// 10 ** 6 for x in date_list]

        # y-values
        y_values = [np.mean(self.et.range_date_dict[x]) for x in date_list]

        # error bars
        error_min_max = []
        for x in date_list:
            temp_list = []
            minimum = min(self.et.range_date_dict[x])
            maximum = max(self.et.range_date_dict[x])
            temp_list.append(minimum)
            temp_list.append(maximum)
            error_min_max.append(temp_list)

        return x_dates, y_values, error_min_max, date_list

    def get_spline_series(self):
        # format splines for jsfiddle - do this first!
        d = []
        series = {'name': 'Mean Score', 'type': 'spline'}
        for x in range(len(self.date_list)):
            data_point = [self.x_dates[x], self.y_means[x]]
            d.append(data_point)
        series['data'] = d
        spline_series = json.dumps(series)
        return spline_series

    def get_error_bar_series(self):
        d = []
        series = {'color': '#FF0000', 'name': 'Range', 'type': 'errorbar', 'stemWidth': 3, 'whiskerLength': 0}
        for x in range(len(self.date_list)):
            data_point = [self.x_dates[x], self.error_pairs[x][0], self.error_pairs[x][1]]
            d.append(data_point)
        series['data'] = d
        error_series = json.dumps(series)
        return error_series

    def get_titles(self):
        min_score_titles = {}
        max_score_titles = {}

        # min scores
        for x in self.groups:
            if x[1] not in min_score_titles.keys():
                min_score_titles[x[1]] = (x[2], x[0])
            elif x[1] in min_score_titles.keys():
                if x[2] < min_score_titles[x[1]][0]:
                    min_score_titles[x[1]] = (x[2], x[0])
                elif x[2] >= min_score_titles[x[1]][0]:
                    continue

        # max scores
        for x in self.groups:
            if x[1] not in max_score_titles.keys():
                max_score_titles[x[1]] = (x[2], x[0])
            elif x[1] in max_score_titles.keys():
                if x[2] > max_score_titles[x[1]][0]:
                    max_score_titles[x[1]] = (x[2], x[0])
                elif x[2] <= max_score_titles[x[1]][0]:
                    continue

        min_titles = [min_score_titles[x][1].encode('ascii', 'ignore') for x in self.date_list]
        max_titles = [max_score_titles[x][1].encode('ascii','ignore') for x in self.date_list]

        return min_titles, max_titles

    def get_scatter_points(self):
        max_series = []

        for x in range(len(self.date_list)):
            data_point = {'showInLegend': False, 'type': 'scatter', 'color': '#FF0000',
                          'marker': {'symbol': 'circle', 'enabled': True, 'color': '#FF0000'},
                          'tooltip': {'pointFormat': '{point.y}'}}

            data_point['name'] = self.max_titles[x]

            data_list = [[self.x_dates[x], self.error_pairs[x][1]]]

            data_point['data'] = data_list
            max_series.append(data_point)

        max_series = json.dumps(max_series)

        # return minimum scatter points series
        min_series = []

        for x in range(len(self.date_list)):
            data_point = {'showInLegend': False, 'type': 'scatter', 'color': '#FF0000',
                          'marker': {'symbol': 'circle', 'enabled': True, 'color': '#FF0000'},
                          'tooltip': {'pointFormat': '{point.y}'}}
            # get title
            data_point['name'] = self.min_titles[x]

            data_list = [[self.x_dates[x], self.error_pairs[x][0]]]

            data_point['data'] = data_list

            min_series.append(data_point)

        min_series = json.dumps(min_series)

        return min_series, max_series

class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y = None):
        return self

    def transform(self,X):
        return(X[self.key])

class BodyTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        return self

    def transform(self, X):

        ss = [compute_score(x) for x in X['body']]
        persons, places, males, females = count_entities(X['body'])

        body_array = map(list,(zip(persons, places, males, females,ss)))
        return pd.DataFrame(body_array, columns = ['body_persons', 'body_places',
                                                   'body_men', 'body_women', 'body_ss'])
    def get_feature_names(self):
        return ['body_persons', 'body_places', 'body_men', 'body_women', 'body_ss']

class TitleTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        return self

    def transform(self, X):
        persons, places, males, females = count_entities(X['title'])
        ss = np.asarray([compute_score(x) for x in X['title']])
        
        title_array = map(list,(zip(persons, places, males, females, ss))) #ss

        return pd.DataFrame(title_array, columns = ['title_persons', 'title_places',
                                                    'title_men', 'title_women', 'title_ss'])
    def get_feature_names(self):
        return ['title_persons', 'title_places','title_men', 'title_women', 'title_ss']

class DummyMaker(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        return self

    def transform(self, X):
        self.dummies = pd.get_dummies(X)
        return self.dummies
    
    def get_feature_names(self):
        return self.dummies.columns

def compute_score(sentence):
    tagger = PerceptronTagger()
    taggedsentence = []
    sent_score = []
    taggedsentence.append(tagger.tag(sentence.split()))
    wnl = nltk.WordNetLemmatizer()
    for idx, words in enumerate(taggedsentence):
        for idx2, t in enumerate(words):
            newtag = ''
            lemmatizedsent = wnl.lemmatize(t[0])
            if t[1].startswith('NN'):
                newtag = 'n'
            elif t[1].startswith('JJ'):
                newtag = 'a'
            elif t[1].startswith('V'):
                newtag = 'v'
            elif t[1].startswith('R'):
                newtag = 'r'
            else:
                newtag = ''
            if (newtag != ''):
                synsets = list(swn.senti_synsets(lemmatizedsent, newtag))
                score = 0.0
                if (len(synsets) > 0):
                    for syn in synsets:
                        score += syn.pos_score() - syn.neg_score()
                    sent_score.append(score / len(synsets))
        if (len(sent_score)==0 or len(sent_score)==1):
            return (float(0.0))
        else:
            return (sum([word_score for word_score in sent_score]) / (len(sent_score)))

def count_entities(col):
    # set up tagger
    os.environ['CLASSPATH'] = "/Users/teresaborcuch/stanford-ner-2013-11-12/stanford-ner.jar"
    os.environ['STANFORD_MODELS'] = '/Users/teresaborcuch/stanford-ner-2013-11-12/classifiers'
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

    tagged_titles = []
    persons = []
    places = []
    male_counts = []
    female_counts = []
    male_names = names.words("male.txt")
    female_names = names.words("female.txt")

    for x in col:
        tokens = word_tokenize(x)
        tags = st.tag(tokens)
        tagged_titles.append(tags)

    for pair_list in tagged_titles:
        person_count = 0
        place_count = 0

        for pair in pair_list:
            m_count = 0
            f_count = 0
            if pair[1] == 'PERSON':
                person_count +=1
                if pair[0] in male_names:
                    m_count +=1
                elif pair[0] in female_names:
                    f_count +=1
                else:
                    continue


            elif pair[1] == 'LOCATION':
                place_count +=1
            else:
                continue

        persons.append(person_count)
        places.append(place_count)
        male_counts.append(m_count)
        female_counts.append(f_count)

    return persons, places, male_counts, female_counts

def tokenize(text):
    text = text.encode('ascii','ignore')
    lemmas = []

    def lemmatize(token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        wnl = WordNetLemmatizer()
        return wnl.lemmatize(token, tag)

    for token, tag in pos_tag(wordpunct_tokenize(text)):
        token = token.lower()
        token = token.strip()
        token = token.strip('_')
        token = token.strip('*')

        if token in sw.words('english'):
            continue
        if all(char in string.punctuation for char in token):
            continue

        lemma = lemmatize(token, tag)
        lemmas.append(lemma)
        lemma_string = ' '.join(lemmas)

    return lemma_string

def name_entities(data = None, section = None, source = None):
    section_mask = (data['condensed_section'] == section)
    source_mask = (data['source'] == source)

    if section and source:
        masked_data = data[section_mask & source_mask]

    elif section:
        masked_data = data[section_mask]

    elif source:
        masked_data = data[source_mask]

    else:
        masked_data = data

    # set up tagger
    os.environ['CLASSPATH'] = "/Users/teresaborcuch/stanford-ner-2013-11-12/stanford-ner.jar"
    os.environ['STANFORD_MODELS'] = '/Users/teresaborcuch/stanford-ner-2013-11-12/classifiers'
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    # dictionaries to hold counts of entities
    person_dict = {}
    place_dict = {}

    for x in masked_data['body']:
        tokens = word_tokenize(x)
        tags = st.tag(tokens)
        for pair in tags:
            if pair[1] == 'PERSON':
                if pair[0] not in person_dict.keys():
                    person_dict[pair[0]] = 1
                else:
                    person_dict[pair[0]] +=1
            elif pair[1] == 'LOCATION':
                if pair[0] not in place_dict.keys():
                    place_dict[pair[0]] = 1
                else:
                    place_dict[pair[0]] += 1

    return person_dict, place_dict

def evaluate_topic(data = None, section = None, source = None, topic = None):
    topic_scores = []
    nontopic_scores = []

    section_mask = (data['condensed_section'] == section)
    source_mask = (data['source'] == source)

    if section and source:
        masked_data = data[section_mask & source_mask]

    elif section:
        masked_data = data[section_mask]

    elif source:
        masked_data = data[source_mask]

    else:
        masked_data = data

    for i, row in masked_data.iterrows():

        if topic in row[2]:
            topic_scores.append(row[6])

        else:
            nontopic_scores.append(row[6])

    score_dict = {'topic': topic_scores, 'nontopic': nontopic_scores}

def transform_data(data, vocab = None):
    transform_pipe = FeatureUnion([('titles', TitleTransformer()),
                                    ('bodies', BodyTransformer()),
                                    ('sources', Pipeline([('itemselector', ItemSelector(key = 'source')),
                                                         ('dummy', DummyMaker())])),
                                   ('tfidf-pipe', Pipeline([('itemselector', ItemSelector(key = 'body')),
                                                              ('tfidf', TfidfDF(vocabulary = vocab))                     ]))
                                  ])

    Xt = transform_pipe.fit_transform(data)
    return Xt


def get_vocabulary(X, y):
    scaler = MinMaxScaler()
    Xt = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
    selector = SelectKbest(scoring_func = chi2, k = 5000)
    selector.fit_transform(Xt, y)
    vocabulary = X.columns[selector.get_support()]
    for x in ['body_persons', 'body_places','body_men', 'body_women', 'body_ss',
                'title_persons', 'title_places', 'title_men', 'title_women', 'title_ss',
                'source_NYT', 'source_WP', 'source_Fox']:
        if x in vocabulary:
            vocabulary = vocabulary.remove(x)
    return vocabulary

#def build_model(X, y):


In [6]:
data = ArticleData().call()

In [48]:
test_data = data[:5]

In [49]:
def transform_data(data, vocab = None):
    transform_pipe = FeatureUnion([('titles', TitleTransformer()),
                                    ('bodies', BodyTransformer()),
                                    ('sources', Pipeline([('itemselector', ItemSelector(key = 'source')),
                                                         ('dummy', DummyMaker())])),
                                   ('tfidf-pipe', Pipeline([('itemselector', ItemSelector(key = 'body')),
                                                              ('tfidf', TfidfVectorizer(preprocessor = tokenize, ngram_range = (1,2)))                     ]))
                                  ])

    Xt = transform_pipe.fit_transform(data)
    return Xt

In [50]:
test_data = transform_data(test_data)

In [39]:
tt = TitleTransformer()
tt.fit_transform(test_data)

Unnamed: 0,title_persons,title_places,title_men,title_women,title_ss
0,0,0,0,0,0.052484
1,0,0,0,0,-0.023148
2,0,0,0,0,0.041667
3,0,0,0,0,-0.034722
4,0,0,0,0,0.084028


In [40]:
bt = BodyTransformer()
bt.fit_transform(test_data)

Unnamed: 0,body_persons,body_places,body_men,body_women,body_ss
0,42,5,0,0,-2.5e-05
1,17,8,0,0,0.01624
2,13,2,0,0,0.020668
3,24,9,0,0,0.000946
4,80,4,0,0,0.027508


In [41]:
tf = TfidfDF()
tf.fit_transform(test_data)

Unnamed: 0,body,condensed_section,date,section,source,title
0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0
5,0.0,1.0,0.0,0.0,0.0,0.0
