# SQL

In [1]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [41]:
%load_ext autoreload
%autoreload 2

## Database

### Populate Database

In [2]:
# load a database from CSV
dataall = pd.DataFrame.from_csv('dataall.csv')
datadate = pd.DataFrame.from_csv('datadate.csv')
dataauthor = pd.DataFrame.from_csv('dataauthor.csv')
dataother = pd.DataFrame.from_csv('dataother.csv')
topicweights10 = pd.DataFrame.from_csv('topicweights10.csv')
topicweights50 = pd.DataFrame.from_csv('topicweights_2_50.csv')

### Connect to Database

In [3]:
n_topics = 50
n_authors = 38
n_days = 7

In [4]:
#In Python: Define a database name, and your username for your computer. 
dbname = 'oped_db'
username = 'varun'

In [5]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print engine.url

postgres://varun@localhost/oped_db


In [6]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

True


In [7]:
## insert data into database from Python (proof of concept - this won't be useful for big data, of course)
## df is any pandas dataframe 
dataall.to_sql('orig', engine, if_exists='replace')
datadate.to_sql('dates', engine, if_exists='replace')
dataauthor.to_sql('authors', engine, if_exists='replace')
dataother.to_sql('other', engine, if_exists='replace')
topicweights10.to_sql('topic_weights10', engine, if_exists='replace')
topicweights50.to_sql('topic_weights50', engine, if_exists='replace')

In [8]:
## Now try the same queries, but in python!
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username)

In [9]:
# query:
def query_for_person(firstname,lastname):
    return """
    SELECT orig.share_count, topic_weights.*
    FROM orig
        JOIN topic_weights
            ON orig.index = topic_weights.index
    WHERE orig.first_name='{0}' AND orig.last_name='{1}';
    """.format(firstname,lastname)

In [10]:
sql_query = query_for_person('ross','douthat')
data_from_sql = pd.read_sql_query(sql_query,con)
data_from_sql.head()

Unnamed: 0,share_count,index,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
0,665,53,0.000372,0.000372,0.000372,0.000372,0.106056,0.000372,0.000372,0.000372,0.890969,0.000372
1,305,85,0.000253,0.000253,0.093017,0.000253,0.000253,0.000253,0.000253,0.000253,0.904957,0.000253
2,412,135,0.000343,0.000343,0.000343,0.000343,0.092126,0.030038,0.083471,0.156396,0.616574,0.020025
3,296,177,0.000262,0.000262,0.071105,0.007671,0.000262,0.000262,0.098962,0.000262,0.820691,0.000262
4,3061,215,0.000339,0.000339,0.000339,0.04413,0.026326,0.08199,0.000339,0.000339,0.738386,0.107473


In [11]:
# query:
def query_all():
    return """
    SELECT orig.share_count, topic_weights50.*, authors.*, dates.*
    FROM orig
        JOIN topic_weights50
            ON orig.index = topic_weights50.index
        JOIN authors
            ON orig.index = authors.index
        JOIN dates
            ON orig.index = dates.index
    """ 

In [12]:
sql_query = query_all()
data_from_sql = pd.read_sql_query(sql_query,con)
data_from_sql.head()

Unnamed: 0,share_count,index,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,...,index.1,Dates,Times,Day0,Day1,Day2,Day3,Day4,Day5,Day6
0,1502,0,5.9e-05,5.9e-05,5.9e-05,5.9e-05,0.014383,5.9e-05,5.9e-05,5.9e-05,...,0,2016-03-31,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,6308,1,0.012066,4.3e-05,4.3e-05,4.3e-05,0.005877,4.3e-05,4.3e-05,0.154154,...,1,2016-03-31,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,826,2,4.4e-05,4.4e-05,0.01369,4.4e-05,0.018525,4.4e-05,4.4e-05,0.177759,...,2,2016-03-31,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,685,3,7e-05,0.225319,7e-05,7e-05,7e-05,0.014989,7e-05,0.373768,...,3,2016-03-31,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1040,4,0.288671,6.6e-05,6.6e-05,6.6e-05,0.007971,6.6e-05,0.106781,6.6e-05,...,4,2016-03-31,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [13]:
data_from_sql['log_share_count'] = np.log10(data_from_sql['share_count'])
data_from_sql[data_from_sql['log_share_count'] < 0] = 0

## Turn into Classification

In [92]:
xmin = np.min(data_from_sql['log_share_count'])
xmax = np.max(data_from_sql['log_share_count'])
bins = np.linspace(xmin,xmax,10)
binnum = np.digitize(data_from_sql['log_share_count'],bins)
binnum[binnum <= 5] = 0
binnum[binnum > 5] = 1
data_from_sql['bin_share_count'] = binnum

In [78]:
plt.hist(data_from_sql['log_share_count'])
plt.show()

# Machine Learning

## One-Hot Encoding

In [9]:
encode = preprocessing.OneHotEncoder()

In [20]:
encode.fit(np.array([[1],[2],[3],[5],[6],[7],[2],[3],[4]]))

OneHotEncoder(categorical_features='all', dtype=<type 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [22]:
encode.transform(np.array([[2],[3],[2]])).toarray()

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.]])

## Training/Test

In [21]:
from sklearn import linear_model, neighbors, ensemble, preprocessing

In [22]:
def split_data(data,frac=0.7):
    datanew = data.sample(frac=1)
    nrows = len(datanew)
    idx = int(nrows*frac)
    return {'train': datanew.iloc[:idx], 'test': datanew.iloc[idx:]}
    # use 70-30 split

In [23]:
def train_model(split_data,model,errorfun,**kwargs):
    datatrain = split_data['train']
    datatest = split_data['test']
    model.fit(datatrain[featurenames],datatrain[viralityname],**kwargs)
    train_pred = model.predict(datatrain[featurenames])
    test_pred = model.predict(datatest[featurenames])
    train_error = errorfun(train_pred,datatrain[viralityname])
    test_error = errorfun(test_pred,datatest[viralityname])
    print('Training error: {0}'.format(train_error))
    print('Test error: {0}'.format(test_error))

In [85]:
topicnames = ['topic{0}'.format(i) for i in range(n_topics)]
authornames = ['author{0}'.format(i) for i in range(n_authors)]
daynames = ['Day{0}'.format(i) for i in range(n_days)]
# othernames = ['len','Times']
# othernames = ['Times']
othernames = []
featurenames = topicnames + authornames + daynames + othernames
features = data_from_sql[featurenames]
viralityname = 'log_share_count'
virality = data_from_sql[viralityname]

In [86]:
datasplit = split_data(data_from_sql)

## Classification

In [117]:
def model_error_class(predicted,actual):
    nobs = actual.size
    return 1.0/nobs*np.sum((actual - predicted)**2)

### Logistic Regression

In [123]:
fig = plt.figure(1)
ax = fig.gca()
ax = ax.matshow(data_from_sql[featurenames], cmap=plt.cm.gray)
plt.show()

In [113]:
model = linear_model.LogisticRegression()
model.fit(datatrain[featurenames],datatrain['bin_share_count'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### K-Nearest Neighbors

In [None]:
model = neighbors.KNeighbors(n_neighbors=i)
train_model(datasplit,model)

## Regression

In [29]:
def model_error_regr(predicted,actual):
    nobs = actual.size
    return np.sqrt(1.0/(2.0*nobs)*np.sum((actual - predicted)**2))

In [30]:
def coeff_regr(predicted,actual):
    avg = np.mean(actual)
    sstot = np.sum((actual - avg)**2)
    ssres = np.sum((actual - predicted)**2)
    return 1 - ssres/sstot

### Linear Regression

In [100]:
model = linear_model.Ridge(alpha=0.01)

In [101]:
model.fit(data_from_sql[featurenames],data_from_sql[viralityname])

Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [102]:
train_model(datasplit,model,coeff_regr)

Training error: 0.393160838817
Test error: 0.380005421563


In [45]:
for featurename, coef in zip(featurenames, model.coef_):
    # print(featurename, coef)
    pass

In [105]:
model = neighbors.KNeighborsRegressor(n_neighbors=15)

In [106]:
train_model(datasplit,model,coeff_regr)

Training error: 0.187695875874
Test error: 0.164337671611


## Random Forest

In [32]:
model = ensemble.RandomForestRegressor(n_estimators=50,max_features='sqrt')

In [33]:
train_model(datasplit,model,coeff_regr)

Training error: 0.912122608486
Test error: 0.416150523842


In [46]:
for feature, importance in zip(featurenames,model.feature_importances_):
    # print(feature, importance)
    pass

AttributeError: 'Ridge' object has no attribute 'feature_importances_'

## K-Nearest Neighbors

In [105]:
for i in range(1,20):
    model = neighbors.KNeighborsRegressor(n_neighbors=i)
    train_model(datasplit,model,coeff_regr)

Training error: 0.998883446246
Test error: -0.147600334939
Training error: 0.716377900879
Test error: 0.152592380258
Training error: 0.619053854171
Test error: 0.243409621316
Training error: 0.569168974685
Test error: 0.290740795355
Training error: 0.535852162635
Test error: 0.314018356802
Training error: 0.515676841609
Test error: 0.328033807347
Training error: 0.500432695738
Test error: 0.332794871677
Training error: 0.488118540344
Test error: 0.34068243082
Training error: 0.478990621509
Test error: 0.345101970322
Training error: 0.469109110887
Test error: 0.344840873636
Training error: 0.461235157144
Test error: 0.34593098155
Training error: 0.454972515161
Test error: 0.350557689176
Training error: 0.448723296798
Test error: 0.351332730917
Training error: 0.444368263776
Test error: 0.353331101435
Training error: 0.439076912761
Test error: 0.355747969173
Training error: 0.437369664485
Test error: 0.355988512055
Training error: 0.43284801782
Test error: 0.352899477027
Training error: 

# Save Model

In [51]:
import pickleizer

In [103]:
pickleizer.save_model(model)

# Make Some Predictions!

In [196]:
import model_predict
import math

In [173]:
authorid = model_predict.AUTHORID

In [188]:
for key in authorid.keys():
    if type(key[0]) != str:
        ans = key

In [197]:
math.isnan(ans[0])

True

In [200]:
url = 'http://www.nytimes.com/2016/06/03/opinion/the-id-that-ate-the-planet.html'
firstname = 'nicholas'
lastname = 'kristof'
dayofweek = 0

In [201]:
model_predict.predict_new_article_url(firstname,lastname,dayofweek,url)

2500

In [202]:
import os

In [204]:
os.environ['NYTIMESAPIKEY']

KeyError: 'NYTIMESAPIKEY'

In [209]:
AUTHORID

[autoreload of nytimes_crawl_2 failed: Traceback (most recent call last):
  File "/home/varun/anaconda2/lib/python2.7/site-packages/IPython/extensions/autoreload.py", line 247, in check
    superreload(m, reload, self.old_objects)
KeyError: 'NYTIMESAPIKEY'
]


NameError: name 'AUTHORID' is not defined