A simple linear regression model, developed to become acquainted with the package, based on the sine and cosine transformations of the fractional hour.

In [49]:
import pymc as pm
import arviz as az
import pandas as pd
import sys

sys.path.append('../')
from util.preprocessing_utils import standardize_column

In [50]:
# Read the processed tweets table from its relative path and preview the first rows.
processed_tweets_csv = '../data/processed/ProcessedTweets.csv'
df = pd.read_csv(processed_tweets_csv)
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negative_reason,negative_reason_confidence,airline,name,retweet_count,text,tweet_created,tweet_location,user_timezone,fractional_hour,hour_sin,hour_cos
0,570306133677760513,0,1.0,0,0.0,0,cairdin,0,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52-08:00,Unknown,Eastern Time (US & Canada),11.597778,0.105107,-0.994461
1,570301130888122368,1,0.3486,0,0.0,0,jnardino,0,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59-08:00,Unknown,Pacific Time (US & Canada),11.266389,0.19088,-0.981613
2,570301083672813571,0,0.6837,0,0.0,0,yvonnalynn,0,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48-08:00,Lets Play,Central Time (US & Canada),11.263333,0.191666,-0.98146
3,570301031407624196,2,1.0,1,0.7033,0,jnardino,0,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36-08:00,Unknown,Pacific Time (US & Canada),11.26,0.192522,-0.981293
4,570300817074462722,2,1.0,2,1.0,0,jnardino,0,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45-08:00,Unknown,Pacific Time (US & Canada),11.245833,0.19616,-0.980572


In [51]:
# Trim the frame down first: these columns are not used in the rest of the notebook.
unused_columns = [
    'tweet_id', 'negative_reason', 'negative_reason_confidence',
    'name', 'retweet_count', 'text', 'tweet_created', 'tweet_location',
    'user_timezone',
]
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,airline,fractional_hour,hour_sin,hour_cos
0,0,1.0,0,11.597778,0.105107,-0.994461
1,1,0.3486,0,11.266389,0.19088,-0.981613
2,0,0.6837,0,11.263333,0.191666,-0.98146
3,2,1.0,0,11.26,0.192522,-0.981293
4,2,1.0,0,11.245833,0.19616,-0.980572


In [52]:
# Airline labels listed in code order: position i is the label for integer code i.
airline_labels = ['virgin_america', 'united', 'southwest', 'delta', 'us_airways', 'american']
airline_mapping = dict(enumerate(airline_labels))

# Replace the integer codes with labels (codes missing from the mapping become NaN),
# then expand the labels into one indicator column per airline.
df['airline'] = df['airline'].map(airline_mapping)
df = pd.get_dummies(df, columns=['airline'])

df.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,fractional_hour,hour_sin,hour_cos,airline_american,airline_delta,airline_southwest,airline_united,airline_us_airways,airline_virgin_america
0,0,1.0,11.597778,0.105107,-0.994461,False,False,False,False,False,True
1,1,0.3486,11.266389,0.19088,-0.981613,False,False,False,False,False,True
2,0,0.6837,11.263333,0.191666,-0.98146,False,False,False,False,False,True
3,2,1.0,11.26,0.192522,-0.981293,False,False,False,False,False,True
4,2,1.0,11.245833,0.19616,-0.980572,False,False,False,False,False,True


In [60]:
# Linear regression of the observed column on the cyclical (sin/cos) encoding of the hour:
#   observed ~ Normal(Intercept + b_sin * hour_sin + b_cos * hour_cos, sigma)
with pm.Model() as model:
    # Priors: wide Normal(0, 10) on the intercept and both slopes, HalfNormal(1) on the noise scale.
    alpha = pm.Normal('Intercept', mu=0, sigma=10)
    hour_sin_coeff = pm.Normal('hour_sin_coeff', mu=0, sigma=10)
    hour_cos_coeff = pm.Normal('hour_cos_coeff', mu=0, sigma=10)
    sigma = pm.HalfNormal('sigma', sigma=1)

    # Expected value of the observed column for every row.
    mu = (alpha + 
          hour_sin_coeff * df['hour_sin'] +
          hour_cos_coeff * df['hour_cos']
         )

    # NOTE(review): the likelihood is fitted to `airline_sentiment_confidence`, although the
    # variable is named `sentiment_obs` -- confirm that confidence is the intended target.
    sentiment_obs = pm.Normal('sentiment_obs', mu=mu, sigma=sigma, observed=df['airline_sentiment_confidence'])

    # Settings for the NUTS step method. With the default PyMC sampler these must be passed
    # through `nuts=`; `nuts_sampler_kwargs=` is only forwarded to external samplers
    # (nutpie / numpyro / blackjax) and is ignored otherwise, so the previous call never
    # applied them. The variable name is kept so any later cell that reads it still works.
    nuts_sampler_kwargs = {'target_accept': 0.98, 'max_treedepth': 12}
    trace = pm.sample(
        2000,
        tune=1000,
        nuts=nuts_sampler_kwargs,
        random_seed=42,  # fixed seed so Restart & Run All reproduces the same draws
        return_inferencedata=True,
    )

# Posterior summary table (means, HDIs, ESS and r_hat) for all sampled parameters.
az.summary(trace)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, hour_sin_coeff, hour_cos_coeff, sigma]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 4 seconds.


Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
Intercept,0.9,0.001,0.898,0.903,0.0,0.0,7531.0,3223.0,1.0
hour_sin_coeff,-0.001,0.001,-0.004,0.001,0.0,0.0,6436.0,3268.0,1.0
hour_cos_coeff,0.006,0.001,0.003,0.008,0.0,0.0,6421.0,3522.0,1.0
sigma,0.163,0.001,0.161,0.165,0.0,0.0,7677.0,3238.0,1.0
