In [2]:
import urllib2
from bs4 import BeautifulSoup
import re
import nytimesarticle as nyta
import requests
import time
import pandas as pd
import numpy as np
import nytimes_crawl_2 as nytc
import operator as op
import text_processing
import detect_language
from encoder import Encoder
import os
import pickleizer
%matplotlib inline

In [3]:
%load_ext autoreload
%autoreload 2

# Misc

In [228]:
names = range(1,15)

## Analysis

### Scrape

In [230]:
# nytc.read_all(names,enddate='20160331')

In [231]:
articles = nytc.query_articles_one_page(page=13,begindate='20151201',enddate='20160331')

In [103]:
nytarticle = nytc.NYTArticle(articles[0])

### Load

In [237]:
readers = []
for name in names[:11]:
    csvfile = '{0}_100.csv'.format(name)
    readers.append(nytc.OpEdReader.init_from_file(name,csvfile))
dataall = pd.concat([reader.data for reader in readers])
dataall = dataall.reset_index(drop=True);

# Detect Bad Articles

In [240]:
def get_bad_idx(dataall,verbose=True):
    badidx = []
    for iloc, text in enumerate(dataall['full_text']):
        try:
            if detect_language.detect_language(text) != u'english':
                badidx.append(iloc)
                if verbose:
                    print(text[:40])  
        except TypeError:
            badidx.append(iloc)
            if verbose:
                print(dataall['url'].iloc[iloc])
    return badidx

In [241]:
badidx = get_bad_idx(dataall,verbose=True)

Conocí a Atilano Román Tirado, de quie
El año pasado nos propusimos hacer una 
DAKAR, Sénégal — La semaine dernièr
ORAN, Algérie — Après Tahrir, Cologn
Oran, Algérie — L’occasion aurait p
Millones de cubanos podrían tener acces
Daesh noir, Daesh blanc. Le premier égo
PARIS — Au lendemain des attentats de 
FLORENCE, Italie — Comme l’a déclar
Minggu ini menandai peringatan 50 tahun 
Cztery lata temu, u mojego nowonarodzone
Retratos enmarcados del fallecido líder
PARIS — « Mal nommer les choses, c’
La artista cubana Tania Bruguera, quien 
Las palabras fueron escritas en grafiti 
A veces, hacer clic para publicar un tex
En Cuba, históricamente las marchas han
Tras meses de negociaciones secretas con
En julio de 2007, cuando Raúl Castro es
El Secretario de Estado John Kerry y la 
En 1996, motivados por un apetito de ven
Hace casi cinco años, las autoridades e
En tiempos pasados, no muy lejanos, todo
Cuba es una isla pobre y relativamente a
Evo Morales, el presidente populista de 
Luego de h

# Save!

In [5]:
subdir = 'final_csvs2'

In [244]:
dataall.drop(badidx,inplace=True)
dataall.reset_index(drop=True)

In [284]:
dataall.to_csv(os.path.join(subdir,'dataall.csv'),encoding='utf-8')

# Load!

In [6]:
dataall = pd.DataFrame.from_csv(os.path.join(subdir,'dataall.csv'),encoding='utf-8')

# Other Features

In [248]:
def normalize_feature(features):
    fmin, fmax = min(features), max(features)
    frange = fmax - fmin
    return [f/frange for f in features]

## Textual

### Lengths

In [249]:
datatextualother = pd.DataFrame(index=dataall.index,columns=[])
lens = [len(text.split()) for text in dataall['full_text']]
datatextualother['len'] = normalize_feature(lens)
lenstitle = [len(title.split()) for title in dataall['title']]
datatextualother['lentitle'] = normalize_feature(lenstitle)

In [251]:
datatextualother.to_csv(os.path.join(subdir,'datatextualother.csv'),encoding='utf-8')

## Non-Textual

In [264]:
datanonother = pd.DataFrame(index=dataall.index,columns=[])

### Time

In [265]:
dates = pd.to_datetime(dataall['date'])
dateoldest = dates.iloc[-1]
datenewest = dates.iloc[0]
daterange = datenewest - dateoldest
times = [(date - dateoldest)/daterange for date in dates]
datanonother['time'] = times

### Day of Week

In [266]:
daysofweek = [date.dayofweek for date in dates]
dayencoder = Encoder('day',daysofweek)
datanonother = dayencoder.encode_to_df(datanonother)

### Author

In [267]:
dataall['author'] = [nytc.author_name(firstname,lastname)
           for firstname, lastname in zip(dataall['first_name'],dataall['last_name'])]

In [268]:
authorencoder = Encoder('author',dataall['author'],
                        nonegroup=True,cutoff=20)
datanonother = authorencoder.encode_to_df(datanonother)

### Save

In [269]:
datanonother.to_csv(os.path.join(subdir,'datanonother.csv'),encoding='utf-8')

## Save All

In [284]:
dataall.to_csv(os.path.join(subdir,'dataall.csv'),encoding='utf-8')

## Pickle Encoders

In [270]:
pickleizer.save_encoders(dayencoder,authorencoder)

# By Author

In [271]:
def df_author(dataall,authorname):
    return dataall[dataall.author == authorname]

In [272]:
edsall = df_author(dataall,'Thomas Edsall')
edsall.shape

(96, 11)

## Top Authors -> to visualize

In [7]:
n_top_authors = 12

In [8]:
def top_authors_id(dataall,n_authors):
    groupedauthors = dataall.groupby('author').size()
    topauthors = groupedauthors.sort_values(ascending=False)[:n_authors]
    topid = [1 if author in topauthors else 0 for author in dataall['author']]
    dataall['topid'] = topid
    return dataall, topauthors

In [9]:
dataall, topauthors = top_authors_id(dataall,n_top_authors)
datatop = dataall[dataall['topid'] == 1]

In [10]:
topauthors

author
The Editorial Board    2886
Paul Krugman            239
David Brooks            222
Nicholas Kristof        222
Roger Cohen             205
Charles Blow            201
Frank Bruni             200
Gail Collins            173
Joe Nocera              167
Thomas Friedman         155
Ross Douthat            130
Maureen Dowd            122
dtype: int64

In [11]:
datafinal = datatop[['share_count','author','date','title']]
datafinal.rename(columns = {'share_count':'Share Count', 'author': 'Author', 'date': 'Date', 'title': 'Title'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [12]:
datafinal.to_csv('timeseries.tsv',sep='\t',encoding='utf-8',index=False)

In [19]:
subdir = 'final_csvs2'

In [21]:
dataall = pd.DataFrame.from_csv(os.path.join(subdir,'dataall.csv'),encoding='utf-8')

In [24]:
dataall.to_csv(os.path.join(subdir,'dataall.csv'),encoding='utf-8')

In [1]:
def gen_row(row,keys1,keys2):
    s = []
    for key1, key2 in zip(keys1,keys2):
        item = row[key1]
        if type(item) == unicode or type(item) == str:
            s.append(u'"{0}": "{1}"'.format(key2,item))
        else:
            s.append(u'"{0}": {1}'.format(key2,item))
        
    return u'{{{0}}}'.format(', '.join(s))