In [1]:
from header import *

In [2]:
dask.config.set(scheduler='threading')

<dask.config.set at 0x283cb65a2a0>

In [3]:
import dask_ml
from dask_ml import feature_extraction
from dask_tfidf import DaskTfidfTransformer
import dask.bag as db
import pandas as pd
import random
import scipy.spatial as sp
import numexpr
import plotly_express as px
import sklearn.feature_extraction.text as sktext
numexpr.set_num_threads(1)
import re
from plotly.subplots import make_subplots

In [4]:
ukraine_geo_data = pd.read_html("https://en.wikipedia.org/wiki/List_of_cities_in_Ukraine")[0]
ukraine_cities = ukraine_geo_data['Name'].astype(str)
ukraine_oblasts = ukraine_geo_data['Administrative division'].astype(str)
nw = []
for w in ukraine_cities:
  nw.append(w[0:-3] if '[' in w else w)
pattern_str = '|'.join([f'\\b{w}\\b' for w in nw])
pattern_str = r'(?i)' + pattern_str
# pattern = re.compile(pattern_str, flags=re.IGNORECASE)

In [5]:
# full_sample = dd.read_parquet(cleaned_data_path_str + "/*").sample(frac = sampling_rate).dropna().drop_duplicates().persist()
# sample_with_cities = full_sample.text.str.contains(pattern_str)
# ddf = full_sample[sample_with_cities]
# ddf = ddf.sort_values('tweetcreatedts').reset_index(drop = True)
# ddf.persist()

In [6]:
ddf = dd.read_parquet(cleaned_data_path_str + "/*").sample(frac = 0.1).dropna().drop_duplicates().persist()

In [7]:
newsdf = dd.read_csv(news_csv_path_str).dropna()
newsdf['text'] = newsdf['headline'] + ' ' + newsdf['desc']
newsdflen = newsdf.shape[0].compute()
ddflen = ddf.shape[0].compute()

In [8]:
# sample_bag = dask.bag.concat([newsdf.text.to_bag(), ddf.text.to_bag()]).persist()
# vectorizer = dask_ml.feature_extraction.text.HashingVectorizer()
# vecs = vectorizer.fit_transform(sample_bag)
# vecs.persist()
# vecs.compute_chunk_sizes()
# # news_vecs = vecs[-1*newsdflen:].persist()
# # tweet_vecs = vecs[:ddflen].persist()
# tfidf = DaskTfidfTransformer().fit_transform(X = vecs)
# news_tfidf = tfidf[-1*newsdflen:].persist()
# tweet_tfidf = tfidf[:ddflen].persist()

In [9]:
min_dt, max_dt = ddf.tweetcreatedts.min().compute(), ddf.tweetcreatedts.max().compute()
all_news_articles = pd.read_csv(news_csv_path_str).dropna()
all_news_articles['date'] = pd.to_datetime(all_news_articles['date'])
news_articles = all_news_articles[(all_news_articles['date'] >= min_dt) & (all_news_articles['date'] <= max_dt)].copy()
news_articles['text'] = news_articles['headline'] + ' ' + news_articles['desc']

In [10]:
article_dt_ranges = [(date - pd.Timedelta(days = timedelta), date + pd.Timedelta(days = timedelta + 1)) for date in news_articles['date']]
# news_articles['dt_range'] = article_dt_ranges
dls, uls = zip(*article_dt_ranges)
news_articles['dt_lower'] = dls
news_articles['dt_upper'] = uls

In [11]:
import jax.experimental.pjit
from jax.extend.core.primitives import jit_p
jax.experimental.pjit.pjit_p = jit_p
import numpyro
import numpyro.distributions as dist
import jax.numpy as jnp
from numpyro.infer import *
import jax
import plotly.graph_objects as go

In [12]:
# numpyro.render_model(vectorized_piecewise_model, model_args=(jnp.array([1]),1, 1, jnp.array([1]), 1, 1), render_distributions=True)

In [13]:
from numpyro.diagnostics import summary
import asyncio
from numpyro.infer import SVI, Trace_ELBO
from numpyro.infer.autoguide import *
from numpyro.optim import Adam

In [14]:
i = 300

async def get_relevants(i, dff, news_articles):
    df_ind = news_articles.index[i]
    dl, ul = news_articles.dt_lower[df_ind], news_articles.dt_upper[df_ind]
    ddft = ddf.query(f'"{str(str(dl.date()))}" <= tweetcreatedts < "{str(ul.date())}"').repartition(npartitions = 10).drop_duplicates().persist()
    tfd = sktext.TfidfVectorizer()
    tfd.fit(dd.concat([ddft.text, newsdf.text]))
    tweets_tfidf = tfd.transform(ddft.text)
    news_tfidf = tfd.transform([news_articles.text[df_ind]])
    relevance = news_tfidf.dot(tweets_tfidf.T).toarray().flatten()
    dates = ddft.tweetcreatedts.compute().to_numpy()
    df = pd.DataFrame({'relevance': relevance, 'date': dates})
    rdf = df.assign(relevant = df.relevance > np.quantile(df.relevance, .95))
    return rdf

async def get_scaled_article_bins(i, ddft, news_articles):
    
    df_ind = news_articles.index[i]
    
    if Path(scaled_article_bins_path / f'article_{df_ind}.parquet').exists():
        return pd.read_parquet(scaled_article_bins_path / f'article_{df_ind}.parquet')
    try:
        # print("Here")
        dl, ul = news_articles.dt_lower[df_ind], news_articles.dt_upper[df_ind]
        ddft = ddf.query(f'"{str(str(dl.date()))}" <= tweetcreatedts < "{str(ul.date())}"').repartition(npartitions = 10).drop_duplicates().persist()
        # news_articles.text[i]

        tfd = sktext.TfidfVectorizer()
        tfd.fit(dd.concat([ddft.text, newsdf.text]))
        tweets_tfidf = tfd.transform(ddft.text)
        news_tfidf = tfd.transform([news_articles.text[df_ind]])
        relevance = news_tfidf.dot(tweets_tfidf.T).toarray().flatten()
        dates = ddft.tweetcreatedts.compute().to_numpy()
        df = pd.DataFrame({'relevance': relevance, 'date': dates})
        rdf = df.assign(relevant = df.relevance > np.quantile(df.relevance, .95))
        unscaled = rdf.groupby(pd.Grouper(key = 'date', freq = 'H'))['relevant'].sum().to_frame()

        x = rdf.groupby(rdf.date.dt.hour).date.count().to_numpy()
        hour_scaler = dict(enumerate(1  + (-1 * ((x - np.mean(x)) / x))))

        # x = rdf.groupby(rdf.date.dt.dayofweek).date.count().to_numpy()
        # day_scaler = dict(enumerate(1  + (-1 * ((x - np.mean(x)) / x))))

        unscaled['hour_scaler'] = [hour_scaler[x] for x in unscaled.index.hour]
        # unscaled['day_scaler'] = [day_scaler[x] for x in unscaled.index.dayofweek]

        scaled = unscaled.assign(corrected_relevant = unscaled.relevant * unscaled.hour_scaler)
        
        scaled.to_parquet(scaled_article_bins_path / f'article_{df_ind}.parquet')
        
        return scaled
    except ValueError as e:
        print(e)
        return None
    except Exception as e:
        print(e)
        print("here")
        return None
from numpyro.infer import *

def fit_model(scaled):
    data = jnp.array(scaled.corrected_relevant.to_numpy())
    time = jnp.array(range(len(data)))
    mu = jnp.mean(data)
    std = jnp.std(data)
    min = jnp.min(data)
    max = jnp.max(data)

    # mcmc = MCMC(ESS(vectorized_piecewise_model, moves={ESS.DifferentialMove() : 0.2, ESS.RandomMove() : 0.5, ESS.GaussianMove() : 0.3}), num_warmup=500, num_samples=1000, chain_method='vectorized', num_chains=20, progress_bar=True)
    # mcmc = MCMC(MixedHMC(HMC(vectorized_piecewise_model)), num_warmup=500, num_samples=1000, num_chains=12, progress_bar=True, chain_method='vectorized')
    # mcmc.run(jax.random.PRNGKey(0), time, mu,std, min, max, data)
    rng_key = jax.random.PRNGKey(0)
    guide = numpyro.infer.autoguide.AutoNormal(vectorized_piecewise_model)
    optimizer = Adam(step_size=0.01)
    svi = SVI(vectorized_piecewise_model, guide, optimizer, loss=Trace_ELBO())
    svi_result = svi.run(rng_key, num_steps=5000, data=data, time=time, mu=mu, std=std, min=min, max=max)

    # predictive = Predictive(vectorized_piecewise_model, mcmc.get_samples())
    predictive = Predictive(vectorized_piecewise_model, guide=guide, num_samples=1000)
    predictions = predictive(rng_key, time=time, mu = mu, std = std, min = min, max = max, data = None)
    
    return predictions,  summary(mcmc.get_samples(group_by_chain=True))

async def get_model_scaled(i, ddft, news_articles):
    scaled = await get_scaled_article_bins(i, ddft, news_articles)
    if scaled is None:
        return False, None, None, None
    predictions, model_summary = fit_model(scaled)
    return True, scaled, predictions, model_summary
# fig1 = px.bar(predictions['obs'].mean(axis = 0))
# fig2 = px.bar(data, opacity=0.5, color_discrete_sequence=['black'])
# go.Figure(data = fig1.data + fig2.data)


In [15]:
def vectorized_piecewise_model(time, mu, std, min, max, data):

    # print(jnp.mean(data))

    change_point = numpyro.sample("change_point", dist.Uniform(0, len(time)))
    
    obs_std = numpyro.sample("std", dist.LogNormal(0, 1))
    
    #dist.Normal(mu + std, std ** 2))
    
    pre_baseline = numpyro.sample("pre_baseline", dist.Uniform(min, max)) #dist.TruncatedNormal(mu - std, std ** 2))
    
    post_baseline = numpyro.sample("post_baseline", dist.Uniform(pre_baseline, max)) 
    
    relevance_decay = numpyro.sample("relevance_decay", dist.Uniform(0, jnp.log(max - min)))#dist.Exponential(0.1))
    
    relevance_beta = numpyro.sample("relevance_beta", dist.Uniform(0, jnp.log(max - min)))#dist.Normal(jnp.log(mu), jnp.log(std) ** 2))
    
    
    mask = time >= change_point
    pre_means = pre_baseline
    post_means = post_baseline + jnp.exp(-relevance_decay * (time - change_point) + relevance_beta)
    numpyro.sample("obs", dist.Normal(pre_means * (1 - mask) + post_means * mask, obs_std), obs=data)

# _, scaled, predictions, model_summary = await get_model_scaled(230, ddf, news_articles)

def final_pred(time, change_point = None, pre_baseline = None, post_baseline= None, relevance_decay = None, relevance_beta = None, std = None):
    mask = time >= change_point
    pre_means = pre_baseline
    post_means = post_baseline + np.exp(-relevance_decay * (time - change_point) + relevance_beta)
    return pre_means * (1 - mask) + post_means * mask
# time=np.arange(len(scaled.corrected_relevant))
# final_y = final_pred(time, **{var : model_summary[var]['median'] for var in model_summary.keys()})

# fig = make_subplots(rows = 1, cols = 1, shared_xaxes=True, vertical_spacing=0.02)
# fig.add_trace(go.Bar(x=scaled.index, y=scaled.corrected_relevant, name='Corrected Relevant Tweets', opacity=.5), row=1, col=1)
# fig.add_trace(go.Scatter(x=scaled.index, y=predictions['obs'].mean(axis = 0), mode='lines', name='Predicted Relevant Tweets', error_y = dict(type = 'data', array = predictions['obs'].std(axis = 0), visible = True), opacity=.5), row=1, col=1)
# fig.add_trace(go.Scatter(x=scaled.index, y=final_y, mode='lines', name='Median Predicted Relevant Tweets'), row=1, col=1)

In [16]:

# df.corrected_relevant

In [30]:
def pcp_intensity(time, pre_baseline, post_baseline, change_point, relevance_decay, relevance_beta, **kwargs):
    mask = time >= change_point
    pre_means = pre_baseline
    post_means = post_baseline + jnp.exp(-relevance_decay * (time - change_point) + relevance_beta)
    return pre_means * (1 - mask) + post_means * mask
def sinp_intensity(time, phase, amplitude, frequency, offset, **kwargs):
    return amplitude * (1 + jnp.sin(frequency * (time + offset))) + phase
def geom_inv_sinp_intensity(time, phase, amplitude, frequency, offset, mu, **kwargs):
    return mu / sinp_intensity(time, phase, amplitude, frequency, offset, **kwargs)

In [31]:
from functools import partial

In [19]:
def poisson_cp(min, max, underlying_intensity_scaler, data):
    change_point = numpyro.sample("change_point", dist.Uniform(0, 14))
    pre_baseline = numpyro.sample("pre_baseline", dist.Uniform(min, max)) #dist.TruncatedNormal(mu - std, std ** 2))
    post_baseline = numpyro.sample("post_baseline", dist.Uniform(pre_baseline, max)) 
    relevance_decay = numpyro.sample("relevance_decay", dist.Uniform(0, jnp.log(max - min)))#dist.Exponential(0.1))
    relevance_beta = numpyro.sample("relevance_beta", dist.Uniform(0, jnp.log(max - min)))#dist.Normal(jnp.log(mu), jnp.log(std) ** 2))
    
    # phase = numpyro.sample("phase", dist.Uniform(min, max))
    # # amplitude = numpyro.sample("amplitude", dist.Uniform(min, max))
    # amplitude = numpyro.sample("amplitude", dist.Uniform(0, 1))
    # offset = numpyro.sample("offset", dist.Uniform(0, 2 * jnp.pi))
    # frequency = numpyro.sample("frequency", dist.Normal(1.75, 2))  # Assuming a period of 14 days

    lam = partial(pcp_intensity, pre_baseline=pre_baseline, change_point=change_point, relevance_decay=relevance_decay, relevance_beta=relevance_beta, post_baseline=post_baseline)
    # lam2 = partial(sinp_intensity, phase=phase, amplitude=amplitude, frequency=frequency, offset=offset)
    
    nsamples = 1000
    samples = jnp.arange(0, 14, 14/nsamples)
    intensity_integral = jnp.trapezoid(lam(samples), x=samples)
    observed_sum = jnp.sum(underlying_intensity_scaler(data) * jnp.log(lam(data)))
    numpyro.factor("point_process_log_likelihood", observed_sum - intensity_integral)

def sinp(min, max, data):
    phase = numpyro.sample("phase", dist.Uniform(min, max))
    amplitude = numpyro.sample("amplitude", dist.Uniform(0, max - min))
    # frequency_std = numpyro.sample("frequency_std", dist.Uniform(0, 10))  # Assuming a period of 14 days
    frequency = numpyro.sample("frequency", dist.Uniform(1.25, 3.75))  # Assuming a period of 14 days
    offset = numpyro.sample("offset", dist.Uniform(0, 2 * jnp.pi))
    lam = partial(sinp_intensity, phase=phase, amplitude=amplitude, frequency=frequency, offset=offset)
    nsamples = 1000
    samples = jnp.arange(0, 14, 14/nsamples)
    intensity_integral = jnp.trapezoid(lam(samples), x=samples)
    observed_sum = jnp.sum(jnp.log(lam(data)))
    numpyro.factor("point_process_log_likelihood", observed_sum - intensity_integral)

# relevants = (await get_relevants(230, ddf, news_articles)).query('relevant')
df = await get_relevants(230, ddf, news_articles)
min_dt, max_dt = df.date.min(), df.date.max()
all_msgs = (df.date - min_dt) / (max_dt - min_dt) * 14
relevants = df.query('relevant')
data = (relevants.date - min_dt) / (max_dt - min_dt) * 14
all_data = jnp.array(all_msgs.to_numpy())
data = jnp.array(data.to_numpy())
mcmc_sinp = MCMC(ESS(sinp, moves={ESS.DifferentialMove() : 0.5, ESS.RandomMove() : 0.2, ESS.GaussianMove() : 0.3}), num_warmup=500, num_samples=1000, chain_method='vectorized', num_chains=30, progress_bar=True)
rng_key = jax.random.PRNGKey(0)
mcmc_sinp.run(rng_key, 0, 2000, all_data)

sample: 100%|██████████| 1500/1500 [00:54<00:00, 27.43it/s]


In [67]:
params = {k : jnp.median(v).item() for (k, v) in mcmc_sinp.get_samples().items()}
lam2 = partial(sinp_intensity, **params)
samples = np.arange(0, 14, 14/1000)
fig = px.line(x = samples, y = lam2(samples))
# add histogram of all_data to the plot
fig.add_histogram(x=all_data, name='All Messages', opacity=0.5, marker_color='blue')

In [68]:
samples = np.arange(0, 14, 14/1000)
jnp.trapezoid(lam2(samples), x = samples) / 14

Array(1177.0156, dtype=float32)

In [81]:
mu=jnp.trapezoid(lam2(samples), x = samples) / 14
mu

Array(10.916686, dtype=float32)

In [69]:
underlying_intensity = partial(geom_inv_sinp_intensity, **params, mu=jnp.trapezoid(lam2(samples), x = samples) / 14)

In [73]:
# truth = np.histogram(all_data, bins=140)[0]
xaxis = np.arange(0, 14, 14/1000)
fig = px.line(x = xaxis, y = underlying_intensity(xaxis))
fig.add_trace(go.Scatter(
    x = xaxis, y = lam2(xaxis), mode='lines', name='Rates',  yaxis="y2"))
fig.update_layout(
    yaxis2=dict(
        title="Intensity",
        overlaying="y",
        side="right"
    )
)

In [71]:
# mu = jnp.trapezoid(lam2(samples), x = samples) / 14
mcmc = MCMC(ESS(poisson_cp, moves={ESS.DifferentialMove() : 0.5, ESS.RandomMove() : 0.2, ESS.GaussianMove() : 0.3}), num_warmup=500, num_samples=1000, chain_method='vectorized', num_chains=30, progress_bar=True)
rng_key = jax.random.PRNGKey(0)
mcmc.run(rng_key, 0, 300, underlying_intensity, data)

sample: 100%|██████████| 1500/1500 [00:05<00:00, 291.45it/s]


In [118]:
params = {k : jnp.median(v).item() for (k, v) in mcmc.get_samples().items()}
lam1 = partial(pcp_intensity, **params)
samples = np.arange(0, 14, 14/1000)
n = 10
samples = np.arange(0, 1/n, 1/1000)
xaxis = np.arange(0, 14, 1/n)
rate_scalers = np.array([np.trapezoid(underlying_intensity(samples + i/n), x = samples).item() * n for i in range(14 * n)])
predictions = np.array([np.trapezoid(lam1(samples + i/n), x = samples).item() for i in range(14 * n)])
fig = px.bar(x= np.arange(0, 14, 1/n),y =np.histogram(data, bins=14 * n)[0])
# fig.add_trace(go.Scatter(
#     x = xaxis, y = lam1(xaxis) / mu, mode='lines', name='Rates'))
# fig.update_layout(
#     yaxis2=dict(
#         title="Change Point Density",
#         overlaying="y",
#         side="right"
#     )
# )
fig.add_trace(go.Bar(
    x = xaxis, y = predictions, name='Rates', opacity=0.5))
fig.add_trace(go.Bar(
    x = xaxis, y = rate_scalers * np.histogram(data, bins=14 * n)[0], name='Scaled Rates', opacity=0.5))

fig.update_layout(barmode='overlay')

In [38]:
params = {k : jnp.median(v).item() for (k, v) in mcmc.get_samples().items()}
lam1 = partial(pcp_intensity, **params)
samples = np.arange(0, 14, 14/1000)
fig = px.line(x = samples, y = lam1(samples))
fig.add_trace(go.Histogram(x=mcmc.get_samples()['change_point'], name='Change Points', histnorm='probability density', opacity=0.5, marker_color='red', yaxis="y2"))
fig.update_layout(
    yaxis2=dict(
        title="Change Point Density",
        overlaying="y",
        side="right"
    )
)

In [25]:
np.histogram(all_data, bins=14)[0] - rates

array([ 300.26268433,  566.7518927 ,  219.08836548,   34.21017578,
       -118.68215515,   86.32065979,  189.84020325,  133.19208801,
       -207.45097961, -263.07167175, -310.65684998, -209.37236389,
       -163.87263977, -106.61788086])

In [77]:
classical = await get_scaled_article_bins(230, ddf, news_articles)

In [78]:
px.bar(classical, x = classical.index, y = 'corrected_relevant')

In [30]:
mcmc.get_samples()['change_point'].std()

Array(1.6351753, dtype=float32)

In [41]:
# bin all data into 14 * 24 bins as a numpy array
# all_data_binned = np.histogram(all_data, bins=14)[0]
# np.mean(all_data_binned)

np.float64(1184.142857142857)

In [31]:
px.histogram(x = data, nbins = 30)

In [31]:
px.histogram(x = data,nbins = 100)

In [59]:
relevants = (await get_relevants(230, ddf, news_articles)).sample(frac=0.3)
data = (relevants.date - relevants.date.min()) / (relevants.date.max() - relevants.date.min()) * 14
data = jnp.array(data.to_numpy())
mcmc = MCMC(ESS(sinp, moves={ESS.DifferentialMove() : 0.2, ESS.RandomMove() : 0.5, ESS.GaussianMove() : 0.3}), num_warmup=250, num_samples=500, chain_method='vectorized', num_chains=20, progress_bar=True)
rng_key = jax.random.PRNGKey(0)
mcmc.run(rng_key, 0, 10, data)

sample: 100%|██████████| 750/750 [00:15<00:00, 47.43it/s] 


In [55]:
params = {k : jnp.mean(v).item() for (k, v) in mcmc.get_samples().items()}
temp = partial(sinp_intensity, frequency = 1.75)
lam = partial(temp, **{k:v for (k, v) in params.items() if k not in {'frequency_std'}})
samples = np.arange(0, 14, 14/1000)
px.line(x = samples, y = lam(samples))

In [53]:
px.histogram(x = data,nbins = 100)

In [30]:
params = {k : jnp.mean(v).item() for (k, v) in mcmc.get_samples().items()}
lam = partial(pcp_intensity, **params)
samples = np.arange(0, 14, 14/1000)
px.line(x = samples, y = lam(samples))

TypeError: pcp_intensity() got an unexpected keyword argument 'amplitude'

In [None]:
scaled = await get_scaled_article_bins(230, ddf, news_articles)
data = jnp.array(scaled.corrected_relevant.to_numpy())
time = jnp.array(range(len(data)))
mu = jnp.mean(data)
std = jnp.std(data)
min = jnp.min(data)
max = jnp.max(data)

mcmc = MCMC(ESS(vectorized_piecewise_model, moves={ESS.DifferentialMove() : 0.2, ESS.RandomMove() : 0.5, ESS.GaussianMove() : 0.3}), num_warmup=500, num_samples=1000, chain_method='vectorized', num_chains=20, progress_bar=True)
# mcmc = MCMC(MixedHMC(HMC(vectorized_piecewise_model)), num_warmup=500, num_samples=1000, num_chains=12, progress_bar=True, chain_method='vectorized')
rng_key = jax.random.PRNGKey(0)
mcmc.run(rng_key, time, mu,std, min, max, data)

# guide = numpyro.infer.autoguide.AutoBNAFNormal(vectorized_piecewise_model)
# optimizer = Adam(step_size=0.1)
# svi = SVI(vectorized_piecewise_model, guide, optimizer, loss=Trace_ELBO())
# svi_result = svi.run(rng_key, num_steps=5000, data=data, time=time, mu=mu, std=std, min=min, max=max)

predictive = Predictive(vectorized_piecewise_model, mcmc.get_samples())
# predictive = Predictive(vectorized_piecewise_model, guide=guide, num_samples=1000)

predictions = predictive(rng_key, time=time, mu = mu, std = std, min = min, max = max, data = None)
time=np.arange(len(scaled.corrected_relevant))
# final_y = final_pred(time, **{var : model_summary[var]['median'] for var in model_summary.keys()})

fig = make_subplots(rows = 1, cols = 1, shared_xaxes=True, vertical_spacing=0.02)
fig.add_trace(go.Bar(x=scaled.index, y=scaled.corrected_relevant, name='Corrected Relevant Tweets', opacity=.5), row=1, col=1)
fig.add_trace(go.Scatter(x=scaled.index, y=predictions['obs'].mean(axis = 0), mode='lines', name='Predicted Relevant Tweets', error_y = dict(type = 'data', array = predictions['obs'].std(axis = 0), visible = True), opacity=.5), row=1, col=1)
# fig.add_trace(go.Scatter(x=scaled.index, y=final_y, mode='lines', name='Median Predicted Relevant Tweets'), row=1, col=1)

sample: 100%|██████████| 1500/1500 [00:02<00:00, 661.08it/s] 


In [50]:
import scipy as sp

In [54]:
scaled = await get_relevants(230, ddf, news_articles)
relevants = scaled.query("relevant")
relevants['t'] = (relevants.date - relevants.date.min()) / (relevants.date.max() - relevants.date.min())
x = relevants.t.sort_values().to_list()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [56]:
px.scatter(sp.integrate.cumulative_trapezoid(x))

In [15]:
import scipy as sp

In [16]:
scaled.corrected_relevant

NameError: name 'scaled' is not defined

In [23]:
predictions['obs'][1]

Array([34.42397 , 32.816925, 35.524845, 30.875675, 30.098652, 36.586716,
       31.499216, 32.967186, 33.356564, 33.43457 , 30.955635, 32.03359 ,
       29.933657, 31.483946, 33.341724, 31.809212, 33.18432 , 33.25658 ,
       32.73649 , 30.117163, 32.670555, 34.910442, 31.905743, 32.487965,
       32.92082 , 33.36748 , 35.663124, 35.055397, 30.308071, 30.937735,
       34.731964, 32.666298, 32.723324, 34.19107 , 34.29744 , 32.35013 ,
       33.47869 , 31.117887, 32.048115, 33.17931 , 33.81697 , 33.148876,
       32.600994, 31.426304, 32.876724, 34.80502 , 31.340261, 32.672283,
       32.41156 , 34.963165, 33.05129 , 32.515812, 33.54454 , 31.76873 ,
       34.061993, 33.252945, 31.318024, 32.54927 , 31.246525, 33.704372,
       31.052942, 31.994587, 31.966537, 34.161366, 32.88356 , 33.120632,
       31.958313, 33.455395, 33.343258, 33.86094 , 32.381676, 34.753048,
       31.783682, 35.521862, 35.18689 , 32.587856, 72.712875, 68.36897 ,
       66.99183 , 67.86919 , 70.3611  , 67.481   , 

In [90]:
import json

In [92]:
np.save(model_summary)

TypeError: save() missing 1 required positional argument: 'arr'

In [81]:

# [var for var in model_summary.keys()]

fig = make_subplots(rows = 1, cols = 1, shared_xaxes=True, vertical_spacing=0.02)
fig.add_trace(go.Bar(x=scaled.index, y=scaled.corrected_relevant, name='Corrected Relevant Tweets', opacity=.5), row=1, col=1)
fig.add_trace(go.Scatter(x=scaled.index, y=final_y, visible = True), row=1, col=1)


In [None]:
# await asyncio.gather(*[get_scaled_article_bins(i, ddf, news_articles) for i in range(len(news_articles.index))]);

In [17]:
_, scaled, predictions, model_summary = await get_model_scaled(200, ddf, news_articles)
fig = make_subplots(rows = 1, cols = 1, shared_xaxes=True, vertical_spacing=0.02)
fig.add_trace(go.Bar(x=scaled.index, y=scaled.corrected_relevant, name='Corrected Relevant Tweets', opacity=.5), row=1, col=1)
fig.add_trace(go.Scatter(x=scaled.index, y=predictions['obs'].mean(axis = 0), mode='lines', name='Predicted Relevant Tweets'), row=1, col=1)

sample: 100%|██████████| 1500/1500 [00:02<00:00, 659.85it/s] 


In [None]:
model_summary

{'change_point': OrderedDict([('mean', 102.396065),
              ('std', 7.340388),
              ('median', 101.47816),
              ('5.0%', 101.044785),
              ('95.0%', 101.91685),
              ('n_eff', 1213.735665811439),
              ('r_hat', 1.0090249638489899)]),
 'post_baseline': OrderedDict([('mean', 26.37337),
              ('std', 2.8077643),
              ('median', 26.579819),
              ('5.0%', 22.257168),
              ('95.0%', 31.061129),
              ('n_eff', 3371.028698049127),
              ('r_hat', 1.0012136438678112)]),
 'pre_baseline': OrderedDict([('mean', 24.206583),
              ('std', 1.0600921),
              ('median', 24.111942),
              ('5.0%', 22.959293),
              ('95.0%', 25.264353),
              ('n_eff', 5127.485307291253),
              ('r_hat', 1.002612777985948)]),
 'relevance_beta': OrderedDict([('mean', 3.8848991),
              ('std', 0.1079204),
              ('median', 3.8954573),
              ('5.0%', 3

In [None]:
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)
fig.add_trace(go.Scatter(x=scaled.index, y=scaled.relevant, mode='lines', name='Relevant Tweets'), row=1, col=1)
fig.add_trace(go.Scatter(x=scaled.index, y=scaled.corrected_relevant, mode='lines', name='Corrected Relevant Tweets'), row=1, col=1)
fig.add_trace(go.Scatter(x=scaled.index, y=predictions['obs'].mean(axis = 0), mode='lines', name='Predicted Relevant Tweets'), row=2, col=1)