In [1]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import topic_weights as tw
import os
import queries
import model_predict
%matplotlib qt

In [2]:
%load_ext autoreload
%autoreload 2

### Load Data

In [3]:
subdir = 'final_csvs2'

In [4]:
dataall = pd.DataFrame.from_csv(os.path.join(subdir,'dataall.csv'),encoding='utf-8')
datatextualother = pd.DataFrame.from_csv(os.path.join(subdir,'datatextualother.csv'),encoding='utf-8')
datanonother = pd.DataFrame.from_csv(os.path.join(subdir,'datanonother.csv'),encoding='utf-8')

In [5]:
dataallnew = dataall[dataall.columns.drop(u'full_text')]

In [6]:
n_topics = 50
stem = 'stem'
package = 'sklearn'
twcsv = tw.topic_weights_csv(n_topics,stem,package,'all')
topicweights = pd.DataFrame.from_csv(os.path.join(subdir,twcsv))

In [7]:
sentiment = pd.DataFrame.from_csv(os.path.join(subdir,'sentiment.csv'))

### Connect to Database (Manually)

In [8]:
#In Python: Define a database name, and your username for your computer. 
dbname = 'oped_v4_db'
username = 'varun'

In [9]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print engine.url

postgres://varun@localhost/oped_v4_db


In [10]:
if not database_exists(engine.url):
    create_database(engine.url)

In [10]:
## insert data into database from Python (proof of concept - this won't be useful for big data, of course)
## df is any pandas dataframe 
dataallnew.to_sql('orig', engine, if_exists='replace')
datatextualother.to_sql('text', engine, if_exists='replace')
datanonother.to_sql('non', engine, if_exists='replace')
topicweights.to_sql('tw', engine, if_exists='replace')
sentiment.to_sql('sent', engine, if_exists='replace')

In [12]:
## Now try the same queries, but in python!
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username)

# Some Queries

In [13]:
allquery = queries.all_query()
alldata = queries.return_df(allquery)

In [17]:
charlesblowquery = queries.author_query('Charles Blow')
charlesblow = queries.return_df(charlesblowquery)
charlesblow.shape

(201, 116)

In [18]:
alldata['log_share_count'] = np.log10(alldata['share_count'])
alldata[alldata['log_share_count'] < 0] = 0

In [20]:
featurenames = datatextualother.columns.append(
    [datanonother.columns,topicweights.columns,sentiment.columns])
viralityname = 'log_share_count'

# Other Data

In [21]:
regdata = alldata[alldata['author_37'] == 0]

# Machine Learning

## Training/Test

In [22]:
from sklearn import linear_model, neighbors, ensemble, preprocessing
from sklearn.metrics import roc_curve, auc, r2_score
from sklearn.metrics import accuracy_score, precision_recall_curve, mean_squared_error

In [23]:
def split_data(data,frac=0.7):
    datanew = data.sample(frac=1)
    nrows = len(datanew)
    idx = int(nrows*frac)
    return {'train': datanew.iloc[:idx], 'test': datanew.iloc[idx:]}
    # use 70-30 split

In [24]:
def train_model(data,featurenames,viralityname,model,errorfun,**kwargs):
    splitdata = split_data(data)
    datatrain = splitdata['train']
    datatest = splitdata['test']
    model.fit(datatrain[featurenames],datatrain[viralityname],**kwargs)
    train_pred = model.predict(datatrain[featurenames])
    test_pred = model.predict(datatest[featurenames])
    train_error = errorfun(datatrain[viralityname],train_pred)
    test_error = errorfun(datatest[viralityname],test_pred)
    print('Training error: {0}'.format(train_error))
    print('Test error: {0}'.format(test_error))

## Regression

In [25]:
def model_error_regr(predicted,actual):
    nobs = actual.size
    return 1.0/(2.0*nobs)*np.sum((actual - predicted)**2)

In [26]:
def coeff_regr(predicted,actual):
    avg = np.mean(actual)
    sstot = np.sum((actual - avg)**2)
    ssres = np.sum((actual - predicted)**2)
    return 1 - ssres/sstot

## Gradient Boosted Regression

In [27]:
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1,
          'learning_rate': 0.01, 'loss': 'ls'}
model = ensemble.GradientBoostingRegressor(**params)

In [70]:
train_model(regdata,featurenamesfinal,viralityname,model,r2_score)

Training error: 0.574027545612
Test error: 0.511294053322


### Linear Regression

In [92]:
featurenamesfinal = featurenames.drop(['len','lentitle','time'])

In [93]:
model = linear_model.Ridge(alpha=0.1)

In [94]:
model.fit(regdata[featurenamesfinal],regdata[viralityname])

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [95]:
train_pred = model.predict(regdata[featurenamesfinal])
train_error = coeff_regr(train_pred,regdata[viralityname])
print(train_error)

0.508215734612


In [76]:
train_model(regdata,featurenamesfinal,viralityname,model,r2_score)

Training error: 0.513762233673
Test error: 0.483250459073


In [77]:
for featurename, coef in zip(featurenamesfinal, model.coef_):
    if featurename.startswith('author'):
        code = int(featurename[7:])
        author = model_predict.AUTHORENCODER.idtotoken.get(code,None)
        print(author,coef)
    else:
        print(featurename, coef)

(u'day_0', 1.7564273094729226)
(u'day_1', 1.7222739200337427)
(u'day_2', 1.7637797301388771)
(u'day_3', 1.7241493344098953)
(u'day_4', 1.6840938634721363)
(u'day_5', 1.6949644781211113)
(u'day_6', 1.8481719643773789)
('Timothy Egan', 0.87801483060424246)
('Lawrence Downes', 0.37362045907742303)
('Joe Nocera', 0.33273716891634142)
('Bina Shah', 0.12479124127842375)
('David Brooks', 0.80785002462932243)
('Nicholas Kristof', 1.0636527129303563)
('Emma Roller', -0.053800226216782987)
('The Editorial Board', 0.22253906551379729)
('Arthur Brooks', 0.63096479268740591)
('Serge Schmemann', -0.34790184837874172)
('Jennifer Boylan', 0.39772303960621813)
('Gail Collins', 0.4316406077089357)
('Nikos Konstandaras', -0.05055471805492856)
('Charles Blow', 0.69892553299547788)
('Shmuel Rosner', -0.26884983002679924)
('Paul Krugman', 1.2672344721674487)
('Maureen Dowd', 0.43161336250877796)
('Ross Douthat', 0.16793109453363272)
('Thomas Edsall', 0.3014805356791318)
('Linda Greenhouse', 0.10652155655679

## Random Forest

In [56]:
model = ensemble.RandomForestRegressor(n_estimators=200)

In [57]:
train_model(regdata,featurenames,viralityname,model,mean_squared_error)

Training error: 0.0323574534752
Test error: 0.256422541736


In [106]:
for feature, importance in zip(featurenames,model.feature_importances_):
    pass
    # print(feature, importance)

AttributeError: 'Ridge' object has no attribute 'feature_importances_'

## K-Nearest Neighbors

In [105]:
for i in range(1,20):
    model = neighbors.KNeighborsRegressor(n_neighbors=i)
    train_model(datasplit,model,coeff_regr)

Training error: 0.998883446246
Test error: -0.147600334939
Training error: 0.716377900879
Test error: 0.152592380258
Training error: 0.619053854171
Test error: 0.243409621316
Training error: 0.569168974685
Test error: 0.290740795355
Training error: 0.535852162635
Test error: 0.314018356802
Training error: 0.515676841609
Test error: 0.328033807347
Training error: 0.500432695738
Test error: 0.332794871677
Training error: 0.488118540344
Test error: 0.34068243082
Training error: 0.478990621509
Test error: 0.345101970322
Training error: 0.469109110887
Test error: 0.344840873636
Training error: 0.461235157144
Test error: 0.34593098155
Training error: 0.454972515161
Test error: 0.350557689176
Training error: 0.448723296798
Test error: 0.351332730917
Training error: 0.444368263776
Test error: 0.353331101435
Training error: 0.439076912761
Test error: 0.355747969173
Training error: 0.437369664485
Test error: 0.355988512055
Training error: 0.43284801782
Test error: 0.352899477027
Training error: 

# Save Model

In [35]:
import pickleizer

In [37]:
pickleizer.save_model(model,featurenamesfinal)

# Make Some Predictions!

In [40]:
import model_predict
import similar_articles
import math

In [162]:
authorname = 'David Brooks'
dayofweek = 0
fulltext = u'Trump trumped Trump yesterday'

## Shares

In [166]:
model_predict.predict_new_article_text(authorname,dayofweek,fulltext)

4500

## Percentiles

In [171]:
weights = model_predict.article_weights(fulltext)
df = queries.return_df(queries.all_query())
sameauthor, otherauthor = similar_articles.same_other_df(df,authorname)
expectedshares = model_predict.predict_new_article_text_sub(authorname,dayofweek,weights)
percsame, percother = similar_articles.percentiles(sameauthor,otherauthor,expectedshares)

In [172]:
percsame, percother

(87.837837837837839, 93.262711864406782)

## Recommendations

In [50]:
fnames = model_predict.FEATURENAMES

In [82]:
text = u'Trump trumped Trump yesterday'
author = 'Maureen Dowd'
dayofweek = 1
weights = model_predict.article_weights(text)
df = queries.return_df(queries.all_query())
model_predict.get_recommendations(author,dayofweek,weights,df)

['The score for the topics of your opinion ranks in percentile 100 of all opinion pieces. I see some potential here!',
 'The author you selected ranks in percentile 61 of all authors. You could stand to be more interesting, like Timothy Egan.',
 'The day you selected ranks in percentile 57 of all days. May I suggest a different day of the week? Perhaps Sunday.']

## Similar Articles

In [3]:
import similar_articles

In [179]:
samedf, otherdf = similar_articles.same_other_df(alldata,authorname)
similar_articles.predict_similar_articles(samedf,otherdf,weights)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sim'] = df[fnames].dot(weights.T)


(      index first_name last_name  comment_count document_type  like_count  \
 474     476      david    brooks           4162       article       13949   
 1216   1221      david    brooks           1049       article        1328   
 272     273      david    brooks          11956       article       20901   
 317     318      david    brooks           5899       article       18660   
 151     152      david    brooks          64375       article      219166   
 
       share_count                                                url  \
 474          5064  http://www.nytimes.com/2016/02/19/opinion/a-li...   
 1216         1145  http://www.nytimes.com/2015/12/11/opinion/the-...   
 272          6222  http://www.nytimes.com/2016/03/08/opinion/its-...   
 317          7075  http://www.nytimes.com/2016/03/04/opinion/dona...   
 151         48755  http://www.nytimes.com/2016/03/18/opinion/no-n...   
 
             date                             title    ...     topic_42  \
 474   2016-02-

In [2]:
Str = "this is string example....wow!!!";
print(type(Str))
Str = Str.decode('utf-8')
print(type(Str))

<type 'str'>
<type 'unicode'>
