# SQL

In [1]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [3]:
# Number of topic columns (topic0 ... topic<n_topics-1>) used as model features below.
n_topics = 10

In [4]:
# Connection settings: the PostgreSQL database name and the user name to connect as.
# NOTE(review): the user name is machine-specific; change it before running elsewhere.
dbname = 'oped_db'
username = 'varun'

In [5]:
## 'engine' is a connection to a database.
## Here we use PostgreSQL, but sqlalchemy can connect to other backends too.
## The URL uses the canonical 'postgresql://' dialect name: the 'postgres://'
## alias was deprecated and is rejected by SQLAlchemy 1.4 and later.
engine = create_engine('postgresql://%s@localhost/%s' % (username, dbname))
## print() with parentheses runs on both Python 2 and 3, like the other cells.
print(engine.url)

postgres://varun@localhost/oped_db


In [6]:
## Make sure the target database exists, creating it on the first run,
## then print the result of the existence check as confirmation.
db_url = engine.url
if not database_exists(db_url):
    create_database(db_url)
print(database_exists(db_url))

True


In [6]:
# Load the source tables from CSV.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0; read_csv
# with index_col=0 and parse_dates=True reproduces its defaults (first column
# becomes the index, and the index is parsed as dates where possible).
csv_options = dict(index_col=0, parse_dates=True)
dataall = pd.read_csv('dataall.csv', **csv_options)
datadate = pd.read_csv('datadate.csv', **csv_options)
topicweights10 = pd.read_csv('topicweights10.csv', **csv_options)
topicweights50 = pd.read_csv('topicweights50.csv', **csv_options)

In [7]:
## Insert the data into the database from Python
## (proof of concept - this won't be useful for big data, of course).
## Each DataFrame is written to its own table; if_exists='replace' overwrites a
## table that is already there, so the cell can be re-run.
dataall.to_sql('orig', engine, if_exists='replace')
datadate.to_sql('dates_and_tidy', engine, if_exists='replace')
## Only the 10- and 50-topic weight frames are loaded. Store the one matching
## n_topics so the 'topic_weights' table carries exactly the topic columns that
## the feature list built from n_topics refers to.
topic_weights_by_count = {10: topicweights10, 50: topicweights50}
topic_weights_by_count[n_topics].to_sql('topic_weights', engine, if_exists='replace')

In [8]:
## Open a direct psycopg2 connection so SQL queries can be issued from Python.
con = None  # stays None if the connect call below raises
connection_settings = {'database': dbname, 'user': username}
con = psycopg2.connect(**connection_settings)

In [12]:
# query:
def query_for_person(firstname,lastname):
    """Build the SQL text selecting one author's share counts and topic weights.

    The query joins ``orig`` to ``topic_weights`` on their ``index`` columns and
    keeps the rows whose ``first_name`` / ``last_name`` equal the given values.

    Parameters
    ----------
    firstname, lastname : str
        Author name parts, embedded in the query as SQL string literals.

    Returns
    -------
    str
        The complete query text, e.g. for ``pd.read_sql_query``.

    Notes
    -----
    Single quotes inside the names are doubled (the SQL-standard escape), so a
    name such as O'Brien yields a valid literal instead of terminating it early
    and corrupting, or injecting into, the statement.
    NOTE(review): binding the names as query parameters would be more robust
    still, but would change what this function returns to its callers.
    """
    def as_literal_body(value):
        # Render exactly as str.format would, then escape embedded quotes.
        return '{0}'.format(value).replace("'", "''")

    return """
    SELECT orig.share_count, topic_weights.*
    FROM orig
        JOIN topic_weights
            ON orig.index = topic_weights.index
    WHERE orig.first_name='{0}' AND orig.last_name='{1}';
    """.format(as_literal_body(firstname), as_literal_body(lastname))

In [62]:
# Run the per-author query for Ross Douthat over the psycopg2 connection and
# preview the first rows of the result.
sql_query = query_for_person('ross','douthat')
data_from_sql = pd.read_sql_query(sql_query,con)
data_from_sql.head()

Unnamed: 0,share_count,index,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
0,665,53,0.000372,0.000372,0.000372,0.000372,0.106056,0.000372,0.000372,0.000372,0.890969,0.000372
1,305,85,0.000253,0.000253,0.093017,0.000253,0.000253,0.000253,0.000253,0.000253,0.904957,0.000253
2,412,135,0.000343,0.000343,0.000343,0.000343,0.092126,0.030038,0.083471,0.156396,0.616574,0.020025
3,296,177,0.000262,0.000262,0.071105,0.007671,0.000262,0.000262,0.098962,0.000262,0.820691,0.000262
4,3061,215,0.000339,0.000339,0.000339,0.04413,0.026326,0.08199,0.000339,0.000339,0.738386,0.107473


In [63]:
# Add log10 of the share count as a new column; it is the regression target below.
# NOTE(review): a share_count of 0 would become -inf here - confirm none exist.
data_from_sql['log_share_count'] = np.log10(data_from_sql['share_count'])

## Obsolete: binned share count and histogram

In [92]:
# Binarize the log share count: lay 10 evenly spaced edges across its range and
# label rows whose np.digitize bin number is above 5 as 1, all others as 0.
log_shares = data_from_sql['log_share_count']
xmin = np.min(log_shares)
xmax = np.max(log_shares)
bins = np.linspace(xmin,xmax,10)
binnum = np.digitize(log_shares,bins)
binnum = (binnum > 5).astype(binnum.dtype)
data_from_sql['bin_share_count'] = binnum

In [78]:
# Histogram of the log10 share counts, using matplotlib's default binning.
plt.hist(data_from_sql['log_share_count'])
plt.show()

# Machine Learning

In [35]:
from sklearn import linear_model, neighbors, ensemble

In [64]:
# Feature columns are the per-topic weights; the target is the log10 share count.
featurenames = ['topic' + str(i) for i in range(n_topics)]
viralityname = 'log_share_count'
features = data_from_sql[featurenames]
virality = data_from_sql[viralityname]

In [17]:
def split_data(data,frac=0.7,random_state=None):
    """Shuffle the rows of ``data`` and split them into train and test parts.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; the input frame itself is not modified.
    frac : float, default 0.7
        Fraction of rows that go to the training part (a 70-30 split by
        default). Must lie in [0, 1].
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample`` so the shuffle, and therefore the
        split, can be reproduced. The default ``None`` keeps the unseeded
        shuffle.

    Returns
    -------
    dict
        ``{'train': DataFrame, 'test': DataFrame}``; the first
        ``int(len(data) * frac)`` shuffled rows are the training part.

    Raises
    ------
    ValueError
        If ``frac`` is outside [0, 1], which would otherwise silently produce
        a nonsensical slice.
    """
    if not 0.0 <= frac <= 1.0:
        raise ValueError('frac must be between 0 and 1, got {0!r}'.format(frac))
    # sample(frac=1) returns every row exactly once, in random order.
    shuffled = data.sample(frac=1, random_state=random_state)
    idx = int(len(shuffled)*frac)
    return {'train': shuffled.iloc[:idx], 'test': shuffled.iloc[idx:]}

In [18]:
def train_model(split_data,model,errorfun,**kwargs):
    """Fit ``model`` on the training part and print its train and test scores.

    Parameters
    ----------
    split_data : dict
        Mapping with 'train' and 'test' DataFrames.
    model : object
        Estimator exposing ``fit(X, y, **kwargs)`` and ``predict(X)``.
    errorfun : callable
        Called as ``errorfun(predicted, actual)``; its result is printed.
    **kwargs
        Extra keyword arguments passed through to ``model.fit``.

    The feature columns and the target column come from the notebook-level
    names ``featurenames`` and ``viralityname``. Nothing is returned.
    """
    parts = [split_data['train'], split_data['test']]
    model.fit(parts[0][featurenames], parts[0][viralityname], **kwargs)
    # Predict on both parts first, then score them, then report.
    predictions = [model.predict(part[featurenames]) for part in parts]
    scores = [errorfun(pred, part[viralityname])
              for pred, part in zip(predictions, parts)]
    print('Training error: {0}'.format(scores[0]))
    print('Test error: {0}'.format(scores[1]))

In [65]:
# Shuffle and split the query result once, with split_data's default 0.7 train
# fraction; the model cells below all reuse this same split.
datasplit = split_data(data_from_sql)

## Classification

In [117]:
def model_error_class(predicted,actual):
    """Return the mean squared difference between ``actual`` and ``predicted``.

    For 0/1 labels this equals the fraction of misclassified observations.
    Both arguments are array-likes of equal length; ``actual`` must expose
    ``.size``.
    """
    scale = 1.0/actual.size
    squared_diff = (actual - predicted)**2
    return scale*np.sum(squared_diff)

### Logistic Regression

In [123]:
# Show the feature matrix (one row per query row, one column per topic) as a
# grayscale image.
fig = plt.figure(1)
ax = fig.gca()
# NOTE(review): this rebinds `ax` from the Axes to the object returned by matshow.
ax = ax.matshow(data_from_sql[featurenames], cmap=plt.cm.gray)
plt.show()

In [113]:
# Fit a logistic regression predicting the binarized share count from the
# topic weights. `datatrain` only ever existed as a local inside train_model,
# so take the training rows from the shared split explicitly.
datatrain = datasplit['train']
model = linear_model.LogisticRegression()
model.fit(datatrain[featurenames],datatrain['bin_share_count'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### K-Nearest Neighbors

In [None]:
# k-nearest-neighbours classifier on the binarized share count.
# train_model always fits the continuous log share count, so this cell fits and
# scores directly against 'bin_share_count' with the classification error.
datatrain = datasplit['train']
datatest = datasplit['test']
# NOTE(review): k=5 is scikit-learn's default, chosen as a placeholder - tune as needed.
model = neighbors.KNeighborsClassifier(n_neighbors=5)
model.fit(datatrain[featurenames], datatrain['bin_share_count'])
train_error = model_error_class(model.predict(datatrain[featurenames]),
                                datatrain['bin_share_count'])
test_error = model_error_class(model.predict(datatest[featurenames]),
                               datatest['bin_share_count'])
print('Training error: {0}'.format(train_error))
print('Test error: {0}'.format(test_error))

## Regression

In [20]:
def model_error_regr(predicted,actual):
    """Return sqrt(sum((actual - predicted)**2) / (2 * n)).

    ``n`` is ``actual.size``; the result is the square root of half the mean
    squared error between the two equal-length array-likes.
    """
    squared_error = np.sum((actual - predicted)**2)
    return np.sqrt(1.0/(2.0*actual.size)*squared_error)

In [21]:
def coeff_regr(predicted,actual):
    """Return the coefficient of determination (R^2) of ``predicted``.

    Computed as ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of squared
    residuals and ``SS_tot`` the sum of squared deviations of ``actual`` from
    its mean. 1 is a perfect fit, 0 matches always predicting the mean, and
    negative values are worse than that.
    """
    residuals = actual - predicted
    deviations = actual - np.mean(actual)
    return 1 - np.sum(residuals**2)/np.sum(deviations**2)

### Linear Regression

In [66]:
# Ordinary least-squares linear regression, used as the baseline model.
model = linear_model.LinearRegression()

In [67]:
# coeff_regr returns R^2, so the values printed as "error" are scores:
# higher is better, and negative means worse than predicting the mean.
train_model(datasplit,model,coeff_regr)

Training error: 0.146068168915
Test error: -0.0577355934222


In [70]:
# k-nearest-neighbours regressor with k=2 (it sits under the linear-regression heading).
model = neighbors.KNeighborsRegressor(n_neighbors=2)

In [71]:
# Fit and score the current `model` on the shared split; the printed values are R^2.
train_model(datasplit,model,coeff_regr)

Training error: 0.369414017449
Test error: -0.0482728944169


In [30]:
# NOTE(review): repeated evaluation call with an out-of-order execution count;
# it scores whatever `model` is bound when it runs - confirm it is still needed.
train_model(datasplit,model,coeff_regr)

Training error: 0.152259692225
Test error: 0.0897870521909


### Random Forest

In [72]:
# Random forest of 150 trees, each split considering sqrt(n_features) candidates.
# NOTE(review): no random_state is passed, so scores will vary between runs.
model = ensemble.RandomForestRegressor(n_estimators=150,max_features='sqrt')

In [73]:
# Fit and score the random forest on the shared split; the printed values are R^2.
train_model(datasplit,model,coeff_regr)

Training error: 0.842398451897
Test error: -0.0643607796997


### K-Nearest Neighbors

In [74]:
# Sweep the neighbourhood size k from 1 to 19 and report train/test R^2 for each.
# Each pair of result lines is preceded by its k so the output can be read back.
for i in range(1,20):
    print('n_neighbors = {0}'.format(i))
    model = neighbors.KNeighborsRegressor(n_neighbors=i)
    train_model(datasplit,model,coeff_regr)

Training error: 1.0
Test error: -1.0149636486
Training error: 0.369414017449
Test error: -0.0482728944169
Training error: 0.152794370917
Test error: 0.0526646914458
Training error: 0.122197021429
Test error: -0.0483907340925
Training error: 0.0864077893876
Test error: -0.0715782565035
Training error: 0.0967194262485
Test error: 0.0141731922349
Training error: 0.0971783424809
Test error: -0.0546648948211
Training error: 0.085061601116
Test error: -0.0504616712963
Training error: 0.0589406883477
Test error: -0.0764581091433
Training error: 0.071953880027
Test error: -0.0442268828961
Training error: 0.0222583633123
Test error: -0.096040116486
Training error: 0.0135838541776
Test error: -0.0840116784431
Training error: 0.0368099718252
Test error: -0.0972754416235
Training error: 0.0214035161665
Test error: -0.0778496180432
Training error: 0.0360527968211
Test error: -0.0290707608636
Training error: 0.0406370568012
Test error: -0.0571675479292
Training error: 0.0438023465451
Test error: -0.