In [49]:
import pymc as pm
import arviz as az
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys

sys.path.append('../')
from util.preprocessing_utils import standardize_column

In [50]:
df = pd.read_csv('../data/processed/ProcessedTweets.csv')
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negative_reason,negative_reason_confidence,airline,name,retweet_count,text,tweet_created,tweet_location,user_timezone,fractional_hour,hour_sin,hour_cos
0,570306133677760513,0,1.0,0,0.0,0,cairdin,0,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52-08:00,Unknown,Eastern Time (US & Canada),11.597778,0.105107,-0.994461
1,570301130888122368,1,0.3486,0,0.0,0,jnardino,0,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59-08:00,Unknown,Pacific Time (US & Canada),11.266389,0.19088,-0.981613
2,570301083672813571,0,0.6837,0,0.0,0,yvonnalynn,0,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48-08:00,Lets Play,Central Time (US & Canada),11.263333,0.191666,-0.98146
3,570301031407624196,2,1.0,1,0.7033,0,jnardino,0,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36-08:00,Unknown,Pacific Time (US & Canada),11.26,0.192522,-0.981293
4,570300817074462722,2,1.0,2,1.0,0,jnardino,0,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45-08:00,Unknown,Pacific Time (US & Canada),11.245833,0.19616,-0.980572


In [51]:
# to keep things clean, let's first drop all the columns that we won't be using

df = df.drop(['tweet_id', 'negative_reason', 'negative_reason_confidence', 
              'name', 'retweet_count', 'text', 'tweet_created', 'tweet_location',
              'user_timezone',], axis=1)
df.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,airline,fractional_hour,hour_sin,hour_cos
0,0,1.0,0,11.597778,0.105107,-0.994461
1,1,0.3486,0,11.266389,0.19088,-0.981613
2,0,0.6837,0,11.263333,0.191666,-0.98146
3,2,1.0,0,11.26,0.192522,-0.981293
4,2,1.0,0,11.245833,0.19616,-0.980572


In [52]:
airline_mapping = {
    0: 'virgin_america',
    1: 'united',
    2: 'southwest',
    3: 'delta',
    4: 'us_airways',
    5: 'american'
}

df['airline'] = df['airline'].map(airline_mapping)
df  = pd.get_dummies(df, columns=['airline'])

df.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,fractional_hour,hour_sin,hour_cos,airline_american,airline_delta,airline_southwest,airline_united,airline_us_airways,airline_virgin_america
0,0,1.0,11.597778,0.105107,-0.994461,False,False,False,False,False,True
1,1,0.3486,11.266389,0.19088,-0.981613,False,False,False,False,False,True
2,0,0.6837,11.263333,0.191666,-0.98146,False,False,False,False,False,True
3,2,1.0,11.26,0.192522,-0.981293,False,False,False,False,False,True
4,2,1.0,11.245833,0.19616,-0.980572,False,False,False,False,False,True


In [53]:
df = standardize_column(df, 'hour_sin')
df = standardize_column(df, 'hour_cos')

df.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,fractional_hour,hour_sin,hour_cos,airline_american,airline_delta,airline_southwest,airline_united,airline_us_airways,airline_virgin_america
0,0,1.0,11.597778,0.237814,-1.103342,False,False,False,False,False,True
1,1,0.3486,11.266389,0.357008,-1.082442,False,False,False,False,False,True
2,0,0.6837,11.263333,0.358099,-1.082193,False,False,False,False,False,True
3,2,1.0,11.26,0.359289,-1.08192,False,False,False,False,False,True
4,2,1.0,11.245833,0.364345,-1.080747,False,False,False,False,False,True


In [58]:
with pm.Model() as model:
    alpha = pm.Normal('Intercept', mu=0, sigma=10)
    airline_american_coeff = pm.Normal('airline_american_coeff', mu=0, sigma=10)
    airline_delta_coeff = pm.Normal('airline_delta_coeff', mu=0, sigma=10)
    airline_sw_coeff = pm.Normal('airline_sw_coeff', mu=0, sigma=10)
    airline_usa_coeff = pm.Normal('airline_usa_coeff', mu=0, sigma=10)
    airline_united_coeff = pm.Normal('airline_united_coeff', mu=0, sigma=10)
    airline_virgin_coeff = pm.Normal('airline_virgin_coeff', mu=0, sigma=10)
    hour_sin_coeff = pm.Normal('hour_sin_coeff', mu=0, sigma=10)
    hour_cos_coeff = pm.Normal('hour_cos_coeff', mu=0, sigma=10)
    sigma = pm.HalfNormal('sigma', sigma=10)

    df['airline_american'] = df['airline_american'].astype(int)

    mu = (alpha + 
          airline_american_coeff * df['airline_american'] + 
          airline_delta_coeff * df['airline_delta'] +
          airline_sw_coeff * df['airline_southwest'] +
          airline_usa_coeff * df['airline_us_airways'] +
          airline_united_coeff * df['airline_united'] +
          airline_virgin_coeff * df['airline_virgin_america'] +
          hour_sin_coeff * df['hour_sin'] +
          hour_cos_coeff * df['hour_cos']
         )

    sentiment_obs = pm.Normal('sentiment_obs', mu=mu, sigma=sigma, observed=df['airline_sentiment_confidence'])

    nuts_sampler_kwargs = {'target_accept': 0.95, 'max_treedepth': 12}
    trace = pm.sample(1000, tune=1000, nuts_sampler_kwargs=nuts_sampler_kwargs, return_inferencedata=True)

az.summary(trace)


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, airline_american_coeff, airline_delta_coeff, airline_sw_coeff, airline_usa_coeff, airline_united_coeff, airline_virgin_coeff, hour_sin_coeff, hour_cos_coeff, conf_coeff, sigma]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 1058 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 600 divergences after tuning. Increase `target_accept` or reparameterize.
Chain 0 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 1 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 3 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.


Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
Intercept,4.636,4.226,-1.807,10.604,1.819,1.531,6.0,13.0,1.75
airline_american_coeff,-3.72,4.226,-9.688,2.726,1.819,1.537,6.0,14.0,1.75
airline_delta_coeff,-3.766,4.226,-9.733,2.677,1.819,1.537,6.0,14.0,1.76
airline_sw_coeff,-3.749,4.226,-9.716,2.689,1.819,1.537,6.0,13.0,1.76
airline_usa_coeff,-3.715,4.226,-9.695,2.718,1.819,1.537,6.0,17.0,1.75
airline_united_coeff,-3.736,4.226,-9.712,2.695,1.819,1.537,6.0,13.0,1.76
airline_virgin_coeff,-3.76,4.227,-9.736,2.678,1.82,1.538,6.0,22.0,1.74
hour_sin_coeff,-0.001,0.001,-0.004,0.001,0.0,0.0,53.0,617.0,1.07
hour_cos_coeff,0.004,0.001,0.002,0.006,0.0,0.0,276.0,524.0,1.18
conf_coeff,4.126,10.29,-16.068,18.311,2.925,2.119,14.0,464.0,1.21


This model that uses `airline_sentiment_confidence` as the likelihood update mechanism struggling to converge, as shown by the ``r_hat`` values. I also have a suspicion that standardising the `hour_sin` and `hour_cos` values hurt the model, since this might jeapordize the underlying represetation of the hourly cycles of a day.