In [2]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import topic_weights as tw
%matplotlib qt

In [3]:
%load_ext autoreload
%autoreload 2

## Database

### Populate Database

In [4]:
# load a database from CSV
dataallnew = pd.DataFrame.from_csv('dataallnew4.csv')
datadate = pd.DataFrame.from_csv('datadate2.csv')
dataauthor = pd.DataFrame.from_csv('dataauthor2_38.csv')
dataother = pd.DataFrame.from_csv('dataother2.csv')
topicweights50 = pd.DataFrame.from_csv('topicweights2_50.csv')

IOError: File dataallnew4.csv does not exist

### Connect to Database

In [4]:
n_topics = 50
n_authors = 38
n_days = 7

In [5]:
#In Python: Define a database name, and your username for your computer. 
dbname = 'oped3_db'
username = 'varun'

In [6]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print engine.url

postgres://varun@localhost/oped3_db


In [7]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

True


In [11]:
## insert data into database from Python (proof of concept - this won't be useful for big data, of course)
## df is any pandas dataframe 
dataallnew.to_sql('orig', engine, if_exists='replace')
datadate.to_sql('dates', engine, if_exists='replace')
dataauthor.to_sql('authors', engine, if_exists='replace')
dataother.to_sql('other', engine, if_exists='replace')
topicweights50.to_sql('topic_weights50', engine, if_exists='replace')

In [8]:
## Now try the same queries, but in python!
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username)

In [9]:
# query:
def query_all_2():
    return """
    SELECT * FROM orig"""

In [10]:
# query:
def query_for_person(firstname,lastname):
    return """
    SELECT orig.share_count, topic_weights50.*, dates.*
    FROM orig
        JOIN topic_weights50
            ON orig.index = topic_weights50.index
        JOIN dates
            ON orig.index = dates.index
    WHERE orig.first_name='{0}' AND orig.last_name='{1}';
    """.format(firstname,lastname)

In [12]:
charlesblowquery = query_for_person('charles','blow')
rossdouthatquery = query_for_person('ross','douthat')
charlesblow = pd.read_sql_query(charlesblowquery,con)
rossdouthat = pd.read_sql_query(rossdouthatquery,con)
rossdouthat.head()

Unnamed: 0,share_count,index,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,...,index.1,Dates,Times,Day0,Day1,Day2,Day3,Day4,Day5,Day6
0,665,53,7.8e-05,7.8e-05,7.8e-05,7.8e-05,7.8e-05,7.8e-05,7.8e-05,0.223136,...,53,2016-03-27,0.995289,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,305,85,0.003805,5.7e-05,5.7e-05,5.7e-05,5.7e-05,5.7e-05,5.7e-05,0.125632,...,85,2016-03-24,0.991755,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,412,135,7.2e-05,7.2e-05,7.2e-05,7.2e-05,7.2e-05,0.024914,7.2e-05,0.169041,...,135,2016-03-20,0.987044,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,296,177,5.5e-05,5.5e-05,5.5e-05,5.5e-05,5.5e-05,5.5e-05,0.013638,0.048901,...,177,2016-03-16,0.982332,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,3061,215,0.02064,7.3e-05,7.3e-05,7.3e-05,7.3e-05,7.3e-05,7.3e-05,7.3e-05,...,215,2016-03-13,0.978799,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [27]:
# query:
def query_all():
    return """
    SELECT orig.share_count, orig.first_name, orig.last_name, topic_weights50.*, authors.*, dates.*, other.*
    FROM orig
        LEFT JOIN topic_weights50
            ON orig.index = topic_weights50.index
        LEFT JOIN authors
            ON orig.index = authors.index
        LEFT JOIN dates
            ON orig.index = dates.index
        LEFT JOIN other
            ON orig.index = other.index
    WHERE authors.author0 = 0
    """ 

In [28]:
sql_query = query_all()
data_from_sql = pd.read_sql_query(sql_query,con)
data_from_sql.shape

(5767, 107)

In [29]:
data_from_sql['log_share_count'] = np.log10(data_from_sql['share_count'])
data_from_sql[data_from_sql['log_share_count'] < 0] = 0

# Plotting

In [30]:
def pretty_figure(fig,aspect=None,xlabel=None,ylabel=None,axisbounds=None,fontsizeaxes=36,fontsizeother=27,
                  ticksize=False,borderwidth=5,tight=True,tightlayout=True,tightfac=1.08,ticksizedef=[18,5]):
    '''Prettifies a figure with labels, proper linewidths and font sizes, ticks, tight axes/layout, etc.
    Note that for the axisbounds, there are three options: direct specification (axisbounds)
    tight = True (tight axes), or tight = False (tight axes with small padding around outside).
    Also, aspect should be set to 1 for plots of physical objects'''
    ax = fig.axes[0]
    if xlabel is not None:
        ax.set_xlabel(xlabel,fontsize=fontsizeaxes)
    if ylabel is not None:
        ax.set_ylabel(ylabel,fontsize=fontsizeaxes)
    ax.legend(loc='NorthWest',fontsize=fontsizeother)
    if axisbounds is not None:
        ax.axis(axisbounds)
    else:
        ax.autoscale()
        ax.axis('tight')
        if not tight:
            xcenter = np.mean(ax.get_xlim())
            ycenter = np.mean(ax.get_ylim())
            xlimnottight = [(x-xcenter)*tightfac + xcenter for x in ax.get_xlim()]
            ylimnottight = [(y-ycenter)*tightfac + ycenter for y in ax.get_ylim()]
            ax.axis(xlimnottight + ylimnottight)
    if aspect is not None:
        ax.set_aspect(aspect)
    for axis in ['top','bottom','left','right']:
        ax.spines[axis].set_linewidth(borderwidth)
    if ticksize is None:
        ax.tick_params(axis='both',labelsize=fontsizeother,bottom='off',top='off',left='off',right='off')
    else:
        if not ticksize:
            ticksize = ticksizedef      
        ax.tick_params(axis='both',labelsize=fontsizeother,width=ticksize[1],length=ticksize[0])
    if tightlayout:
        fig.tight_layout()
    return fig

In [33]:
def pd_plot(df,xlabel,ylabels,ynames,ylabel,window=None):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    df2 = pd.DataFrame(index=df.index,columns=['x','y'])
    df2['x'] = df[xlabel]
    for ylabelcurr, yname in zip(ylabels,ynames):
        if window is None:
            df2['y'] = df[ylabelcurr]
        else:
            df2['y'] = pd.rolling_mean(df[ylabelcurr],window=window)
        df2.plot(ax=ax,x='x',y='y',linewidth=5,label=yname)
    fig = pretty_figure(fig,xlabel='Date',ylabel=ylabel)
    fig.show()

In [34]:
pd_plot(rossdouthat[::-1],'Dates',['topic14','topic44','topic47'],['General politics','Republican politics','Culture/social issues'],ylabel='Topic composition',window=8)

	Series.rolling(window=8,center=False).mean()


In [35]:
pd_plot(charlesblow[::-1],'Dates',['topic4','topic10','topic47','topic48'],['Race/policing','Presidency','Culture/social issues','Inequality'],ylabel='Topic composition',window=8)

	Series.rolling(window=8,center=False).mean()


In [37]:
pd_plot(rossdouthat[::-1],xlabel='Dates',ylabels=['share_count'],ynames=['Shares'],ylabel='Share count',window=None)

In [66]:
pd_plot(data_from_sql[::-1],xlabel='Dates',ylabels=['share_count'],ynames=['Shares'],ylabel='Share count',window=None)

# Machine Learning

## Training/Test

In [30]:
from sklearn import linear_model, neighbors, ensemble, preprocessing
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_recall_curve

In [66]:
def split_data(data,frac=0.7):
    datanew = data.sample(frac=1)
    nrows = len(datanew)
    idx = int(nrows*frac)
    return {'train': datanew.iloc[:idx], 'test': datanew.iloc[idx:]}
    # use 70-30 split

In [67]:
def train_model(data,featurenames,viralityname,model,errorfun,**kwargs):
    splitdata = split_data(data)
    datatrain = splitdata['train']
    datatest = splitdata['test']
    model.fit(datatrain[featurenames],datatrain[viralityname],**kwargs)
    train_pred = model.predict(datatrain[featurenames])
    test_pred = model.predict(datatest[featurenames])
    train_error = errorfun(train_pred,datatrain[viralityname])
    test_error = errorfun(test_pred,datatest[viralityname])
    print('Training error: {0}'.format(train_error))
    print('Test error: {0}'.format(test_error))

In [33]:
topicnames = ['topic{0}'.format(i) for i in range(n_topics)]
authornames = ['author{0}'.format(i) for i in range(n_authors)]
daynames = ['Day{0}'.format(i) for i in range(n_days)]
# othernames = ['len','Times']
# othernames = ['Times']
othernames = []
featurenames = topicnames + authornames + daynames + othernames
features = data_from_sql[featurenames]
viralityname = 'log_share_count'
virality = data_from_sql[viralityname]

### Bin Data according to Percentile

In [34]:
percentile = 90.
dec = percentile/100.
weight = dec/(1.-dec)
cutoff = np.percentile(data_from_sql[viralityname],percentile)
classassignments = [1 if virality > cutoff else 0 for virality in data_from_sql[viralityname]]
classweightdict = {1: weight, 0: 1}
weights = [classweightdict[classassignment] for classassignment in classassignments]
data_from_sql['class'] = classassignments
data_from_sql['weight'] = weights


### Train/Test

In [35]:
datasplit = split_data(data_from_sql,frac=0.5)

## Classification

### Use Weighting!

In [332]:
model = linear_model.LogisticRegression(class_weight=classweightdict)
model2 = ensemble.RandomForestClassifier(max_depth=10,class_weight=classweightdict)
model3 = ensemble.BaggingClassifier(bootstrap=True,bootstrap_features=True)

In [339]:
mymodel = model

In [340]:
datatrain = datasplit['train']
mymodel.fit(datatrain[featurenames],datatrain['class'])
res = mymodel.predict(datatrain[featurenames])
print('Train',accuracy_score(datatrain['class'], res))
datatest = datasplit['test']
probas = mymodel.predict_proba(datatest[featurenames])
res = mymodel.predict(datatest[featurenames])
print('Test',accuracy_score(datatest['class'], res))
fpr, tpr, thresholds = roc_curve(datatest['class'], probas[:, 1])
prec, rec, thresh  = precision_recall_curve(datatest['class'], probas[:, 1])
thresh = np.append(thresh,1)


('Train', 0.78737426292056889)
('Test', 0.76907073509015256)


In [341]:
def plot_fig(fpr,tpr,thresh1,prec,rec,thresh2,fignum=1,option='roc'):
    fig = plt.figure(fignum)
    ax = fig.gca()
    if option == 'roc':
        x = [fpr]
        y = [tpr]
    elif option == 'precrec':
        x = [rec]
        y = [prec]
    elif option == 'precrecthresh':
        x = [thresh2,thresh2]
        y = [prec,rec]
    for xcurr, ycurr in zip(x,y):
        ax.plot(xcurr,ycurr)
    fig.show()

In [343]:
plot_fig(fpr,tpr,thresholds,prec,rec,thresh,option='precrec')

In [288]:
fig = plt.figure(1)
ax = fig.gca()
ax.plot(thresh, prec)
ax.plot(thresh, rec)
fig.show()

In [251]:
fig = plt.figure(2)
ax = fig.gca()
ax.plot(rec,prec)
fig.show()

In [140]:
mymodel.fit(data_from_sql[featurenames],data_from_sql['class'])
probas = mymodel.predict_proba(data_from_sql[featurenames])
res = mymodel.predict(data_from_sql[featurenames])
fpr, tpr, thresholds = roc_curve(data_from_sql['class'], probas[:, 1])
accuracy_score(data_from_sql['class'], res)

0.77284550026010057

### Logistic Regression

In [123]:
fig = plt.figure(1)
ax = fig.gca()
ax = ax.matshow(data_from_sql[featurenames], cmap=plt.cm.gray)
plt.show()

In [113]:
model = linear_model.LogisticRegression()
model.fit(datatrain[featurenames],datatrain['bin_share_count'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### K-Nearest Neighbors

In [None]:
model = neighbors.KNeighbors(n_neighbors=i)
train_model(datasplit,model)

## Regression

In [36]:
def model_error_regr(predicted,actual):
    nobs = actual.size
    return np.sqrt(1.0/(2.0*nobs)*np.sum((actual - predicted)**2))

In [37]:
def coeff_regr(predicted,actual):
    avg = np.mean(actual)
    sstot = np.sum((actual - avg)**2)
    ssres = np.sum((actual - predicted)**2)
    return 1 - ssres/sstot

### Linear Regression

In [78]:
model = linear_model.Ridge(alpha=0.01)

In [75]:
model.fit(data_from_sql[featurenames],data_from_sql[viralityname])

Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [79]:
for i in np.abs(model.coef_).argsort():
    pass
    # print(featurenames[i],model.coef_[i])

In [79]:
train_model(data_from_sql,featurenames,viralityname,model,coeff_regr)

Training error: 0.4791213694
Test error: 0.471185117435


In [72]:
for featurename, coef in zip(featurenames, model.coef_):
    print(featurename, coef)

('topic0', 2.9814248414746345)
('topic1', 3.6076759018200617)
('topic2', 2.7145149878369748)
('topic3', 2.2318192563795209)
('topic4', 3.6101955737904237)
('topic5', 3.1324073977838909)
('topic6', 3.1952031055581167)
('topic7', 3.0161974794220709)
('topic8', 2.437344841580555)
('topic9', 2.5022442184888116)
('topic10', 3.1384742739679439)
('topic11', 3.0884752722055806)
('topic12', 2.9326794368848952)
('topic13', -13.152639268231551)
('topic14', 2.8038590260980301)
('topic15', 2.6980174493417461)
('topic16', 2.48883448964876)
('topic17', 3.0652225875645875)
('topic18', -13.152639268231551)
('topic19', 2.5818149085043389)
('topic20', 2.3225718410000273)
('topic21', 3.0046006094774205)
('topic22', -13.152639268231546)
('topic23', 1.6082704382912816)
('topic24', -13.152639268231539)
('topic25', -13.152639268231541)
('topic26', 3.4973596041867774)
('topic27', 3.1588957615861792)
('topic28', 2.9229127997798914)
('topic29', 6.3575888248040355)
('topic30', 2.3580108238481632)
('topic31', 2.83

In [62]:
model = neighbors.KNeighborsRegressor(n_neighbors=10)

In [63]:
train_model(datasplit,model,coeff_regr)

Training error: 0.525892577549
Test error: 0.380602064415


## Random Forest

In [96]:
model = ensemble.RandomForestRegressor(n_estimators=200,max_features='sqrt')

In [97]:
train_model(datasplit,model,coeff_regr)

Training error: 0.927809274534
Test error: 0.445940914378


In [46]:
for feature, importance in zip(featurenames,model.feature_importances_):
    # print(feature, importance)
    pass

AttributeError: 'Ridge' object has no attribute 'feature_importances_'

## K-Nearest Neighbors

In [105]:
for i in range(1,20):
    model = neighbors.KNeighborsRegressor(n_neighbors=i)
    train_model(datasplit,model,coeff_regr)

Training error: 0.998883446246
Test error: -0.147600334939
Training error: 0.716377900879
Test error: 0.152592380258
Training error: 0.619053854171
Test error: 0.243409621316
Training error: 0.569168974685
Test error: 0.290740795355
Training error: 0.535852162635
Test error: 0.314018356802
Training error: 0.515676841609
Test error: 0.328033807347
Training error: 0.500432695738
Test error: 0.332794871677
Training error: 0.488118540344
Test error: 0.34068243082
Training error: 0.478990621509
Test error: 0.345101970322
Training error: 0.469109110887
Test error: 0.344840873636
Training error: 0.461235157144
Test error: 0.34593098155
Training error: 0.454972515161
Test error: 0.350557689176
Training error: 0.448723296798
Test error: 0.351332730917
Training error: 0.444368263776
Test error: 0.353331101435
Training error: 0.439076912761
Test error: 0.355747969173
Training error: 0.437369664485
Test error: 0.355988512055
Training error: 0.43284801782
Test error: 0.352899477027
Training error: 

# Save Model

In [70]:
import pickleizer

In [103]:
pickleizer.save_model(model)

In [71]:
model = pickleizer.load_model()

In [82]:
actual = data_from_sql[viralityname]

In [83]:
pred = model.predict(data_from_sql[featurenames])

In [87]:
plt.scatter(actual,pred)
plt.show()

# Make Some Predictions!

In [9]:
import model_predict
import similar_articles
import math

In [238]:
authorid = model_predict.AUTHORID

[autoreload of the_opinionator.views failed: Traceback (most recent call last):
  File "/home/varun/anaconda2/lib/python2.7/site-packages/IPython/extensions/autoreload.py", line 247, in check
    superreload(m, reload, self.old_objects)
AssertionError: View function mapping is overwriting an existing endpoint function: index
]


In [4]:
url = 'http://www.nytimes.com/2016/06/03/opinion/the-id-that-ate-the-planet.html'
firstname = 'nicholas'
lastname = 'kristof'
dayofweek = 0

In [6]:
model_predict.predict_new_article_url(firstname,lastname,dayofweek,url)

2500

In [21]:
fulltext = u'Trump trumped Trump yesterday'
weights = model_predict.article_weights(fulltext)
df = similar_articles.return_df()
sameauthor, otherauthor = similar_articles.same_other_df(df,firstname,lastname)
expectedshares = model_predict.predict_new_article_text(firstname,lastname,dayofweek,weights)
percsame, percother = similar_articles.percentiles(sameauthor,otherauthor,expectedshares)

In [272]:
model_predict.predict_new_article_url(firstname,lastname,dayofweek,url)

2500

# Misc

In [209]:
dataall = pd.DataFrame.from_csv('dataall3.csv')

[autoreload of the_opinionator.views failed: Traceback (most recent call last):
  File "/home/varun/anaconda2/lib/python2.7/site-packages/IPython/extensions/autoreload.py", line 247, in check
    superreload(m, reload, self.old_objects)
AssertionError: View function mapping is overwriting an existing endpoint function: index
]


In [210]:
res = []
for firstname, lastname in zip(dataall['first_name'],dataall['last_name']):
    if type(firstname) == float and type(lastname) == float:
        res.append('The Editorial Board')
    elif type(firstname) == float or type(lastname) == float:
        res.append('Other')
    else:
        res.append('{0} {1}'.format(firstname.title(),lastname.title()))

In [211]:
dataall['author'] = res

In [212]:
dataall['Date'] = datadate['Dates']

In [216]:
def top_authors_id(dataall,n_authors):
    groupedauthors = dataall.groupby('author').size()
    groupedauthors['Other'] = 0
    topauthors = groupedauthors.sort_values(ascending=False)[:n_authors]
    topid = [1 if author in topauthors else 0 for author in dataall['author']]
    dataall['topid'] = topid
    return dataall

In [217]:
dataall = top_authors_id(dataall,12)
datatop = dataall[dataall['topid'] == 1]

In [220]:
datatop.columns

Index([u'comment_count', u'document_type', u'first_name', u'full_text',
       u'last_name', u'like_count', u'share_count', u'url', u'Title',
       u'author', u'Date', u'topid'],
      dtype='object')

In [221]:
datafinal = datatop[['share_count','author','Date','Title']]

In [222]:
datafinal.rename(columns = {'share_count':'Share Count', 'author': 'Author'}, inplace = True)

In [223]:
datafinal.to_csv('test.csv',index=False)

### Obsolete

In [205]:
with open('test.out','wb') as f:
    for row in datatop.iterrows():
        linewrite = []
        for key in ['Date','Share Count','Author']:
            val = row[1][key]
            linewrite.append("'{0}': '{1}'".format(key,val))
        lineall = '{' + ', '.join(linewrite)+ '},' + '\n'
        f.write(lineall)

# Similar Articles

In [3]:
import similar_articles

In [7]:
df = pd.read_sql_query("SELECT * FROM orig",similar_articles.con)

In [5]:
sql_query = similar_articles.predict_similar_articles(np.array([1,2]),'Paul','Krugman')

DatabaseError: Execution failed on sql '
    SELECT orig.index, orig.first_name, orig.last_name, orig.url, orig.Title, orig.share_count, topic_weights50.*
    FROM orig
        LEFT JOIN topic_weights50
            ON orig.index = topic_weights50.index
    ': column orig.title does not exist
LINE 2: ...index, orig.first_name, orig.last_name, orig.url, orig.Title...
                                                             ^
