# 0. Setup

In [156]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import pandas as pd
import scipy.stats as stats
import json
import pickle
import powerlaw as pwl
from sklearn.linear_model import LinearRegression

In [157]:
def get_count_from_file(filepath):
    """ Opens filepath and returns a numpy array with the results count """

    if filepath.endswith("pkl"):
        with open(filepath, "rb") as f:
            data = pickle.load(f)

        counts = []
        for k in data:
            counts.append([c for _, c in k])

        return np.array(counts)
    else:
        with open(filepath, "r") as f:
            return json.load(f)

### 0.1. Plotting functions

In [158]:
def plot_bars_results(counts, ignore_first_ks=0):
    """ Plots the results count from the given file.
     :param filename: name of the file to be plotted """

    count = np.mean(counts, axis=1)

    start, end = ignore_first_ks, len(count)
    count = count[ignore_first_ks:]

    plt.figure(figsize=(12, 4))
    plt.bar(range(start, end), count, color='C1', alpha=0.5, edgecolor='none')
    plt.plot(range(start, end), count, color="C1", marker='o')
    # plt.fill_between(range(start, end),
    #                  (count - np.std(counts, axis=1)[ignore_first_ks:]),
    #                  (count + np.std(counts, axis=1)[ignore_first_ks:]),
    #                  color="C1", linestyle="--", alpha=0.5)

    plt.xlabel("Quantidade de k's")
    plt.ylabel("Quantidade de tweets")
    plt.xticks(range(start, end), [str(k+1) for k in range(start, end)])

    plt.tight_layout()
    plt.grid(False)

In [159]:
def plot_logscale_results(counts, ignore_first_ks=0):
    """ Plots the results count from the given file.
     :param filename: name of the file to be plotted """

    count = np.mean(counts, axis=1)

    start, end = ignore_first_ks, len(count)
    count = count[ignore_first_ks:]

    plt.figure(figsize=(12, 4))
    plt.plot(range(start, end), count, color="C1", marker='o', label="Data")

    # fit linear regression
    xmin = 9
    x = np.log(range(start, end)).reshape(-1, 1)
    y = np.log(count).reshape(-1, 1)
    reg = LinearRegression().fit(x[xmin:], y[xmin:])
    print(f"alpha: {reg.coef_[0][0]}, beta: {reg.intercept_[0]}")
    plt.plot(range(xmin, end), np.exp(reg.predict(np.log(range(xmin, end)).reshape(-1, 1))), color="C0", linestyle="--", label="Linear Regression")

    plt.xticks(range(start, end), [str(k + 1) for k in range(start, end)])
    plt.xlabel("Quantidade de k's")
    plt.ylabel("Quantidade de tweets")
    plt.yscale("log")
    plt.xscale("log")
    plt.legend()
    plt.tight_layout()

## 1. Results

In [160]:
counts = [np.mean(get_count_from_file(f), axis=1) for f in ["data/k_twitter_2s_1-50_count.json",
                                                            "data/ha_twitter_2s_1-50_count.json",
                                                            "data/he_twitter_1s_1-30_count.json",
                                                            "data/rs_twitter_1s_1-30_count.json",
                                                            "data/w_twitter_2s_1-50_count_cleaned.json",
                                                            "data/5_twitter_2s_1-50_count.json"]]
laughs = ["k", "ha", "he", "rs", "w", "5"]
countries = ["Brasil", "Brasil", "Brasil", "Brasil", "Japão", "Tailândia"]

df_name_ids, df_names, df_number, df_values, df_countries = [], [], [], [], []
for i, (laugh, count) in enumerate(zip(laughs, counts)):
    for j, val in enumerate(count):
        df_name_ids.append(i+1)
        df_names.append(laugh)
        df_number.append(j+1)
        df_values.append(val)
        df_countries.append(countries[i])
df = pd.DataFrame({"Risada ID": df_name_ids, "Risada": df_names, "n": df_number, "Count": df_values, "Country": df_countries})
df.loc[:, "Risada x Country"] = df["Risada"] + " (" + df["Country"].astype(str) + ")"
df

Unnamed: 0,Risada ID,Risada,n,Count,Country,Risada x Country
0,1,k,1,8430000.0,Brasil,k (Brasil)
1,1,k,2,195375.0,Brasil,k (Brasil)
2,1,k,3,416250.0,Brasil,k (Brasil)
3,1,k,4,563125.0,Brasil,k (Brasil)
4,1,k,5,523750.0,Brasil,k (Brasil)
...,...,...,...,...,...,...
208,6,5,46,3855.0,Tailândia,5 (Tailândia)
209,6,5,47,4065.0,Tailândia,5 (Tailândia)
210,6,5,48,3825.0,Tailândia,5 (Tailândia)
211,6,5,49,3060.0,Tailândia,5 (Tailândia)


In [161]:
df_br = df[df['Country'] == "Brasil"]
df_int = df[(df['Country'] != "Brasil") | (df["Risada"] == "k")]

In [162]:
sns.set_style("ticks")
with plt.xkcd():
    plt.figure(figsize=(12, 4))
    ax = plt.gca()

    df_ = df_br[df_br['n'] > 1].groupby("Risada").agg({"Risada ID": "first", "Count": "sum"}).sort_values(by="Risada ID").reset_index()
    sns.barplot(data=df_, y="Risada", x="Count", orient='h', palette="deep", linewidth=3)
    # put value inside barplot

    # ax.text(df_["Count"][0] - 400000, 0.125, str(int(df_["Count"][0])), color='black', fontweight='bold')
    # for i, v in enumerate(df_["Count"][1:]):
    #     ax.text(v + 20000, (i + 1) + 0.125, str(int(v)), color='black', fontweight='bold')

    ax.get_xaxis().set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ',')))
    plt.xlabel("Quantidade de tweets", fontweight="bold")
    plt.ylabel("Risada", fontweight="bold")
    plt.title("Qual a risada mais comum no Brasil?", fontweight="bold")
    # plt.xticks(range(1, 31, 5), [str(k) for k in range(1, 31, 5)])
    plt.tight_layout()
    # plt.show()
    plt.savefig("figures/risada_br_bars.png", dpi=300)

findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font fam

In [163]:
sns.set_style("ticks")
with plt.xkcd():
    plt.figure(figsize=(12, 4))
    ax = plt.gca()

    sns.lineplot(data=df_br, x="n", y="Count", hue="Risada", marker=".", palette="deep", linewidth=3)

    ax.get_yaxis().set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ',')))
    plt.xticks(range(1, 31, 5), [str(k) for k in range(1, 31, 5)])
    plt.ylim(0, 1.1 * max(df_br[df_br["n"] != 1]["Count"]))
    plt.xlim(0.5, 30)
    plt.ylabel("Quantidade de tweets", fontweight="bold")
    plt.xlabel("Quantidade de repetições", fontweight="bold")
    plt.title("Qual o comprimento normal das risadas?", fontweight="bold")
    plt.tight_layout()
    # plt.show()
    plt.savefig("figures/risadas_br_lineplot.png", dpi=300)
    # ax.xaxis.set_major_locator(plt.MultipleLocator(1))
    # plt.grid(True, axis="x", alpha=0.5, linewidth=0.5, linestyle="--", color="black")

findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font fam

In [183]:
sns.set_style("ticks")
with plt.xkcd():
    plt.figure(figsize=(12, 4))
    sns.lineplot(data=df_br, x="n", y="Count", hue="Risada", linewidth=3)
    plt.yscale("log")
    plt.xscale("log")
    plt.xticks(range(1, 51, 5), [str(k) for k in range(1, 51, 5)])
    plt.ylabel("Quantidade de tweets", fontweight="bold")
    plt.xlabel("Quantidade de repetições", fontweight="bold")
    plt.title("O tamanho das risadas segue uma lei de potência?", fontweight="bold")
    plt.tight_layout()
    # plt.show()
    plt.savefig("figures/risadas_loglog.png", dpi=300)

findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font fam

In [165]:
sns.set_style("ticks")
with plt.xkcd():
    plt.figure(figsize=(12, 4))
    ax = plt.gca()

    df_ = df_int[df_int['n'] > 1].groupby("Risada").agg({"Risada ID": "first", "Count": "sum", "Country": "first", "Risada x Country": "first"}).sort_values(by="Risada ID")
    df_ = df_.reset_index()
    sns.barplot(data=df_, y="Risada x Country", x="Count", orient='h', palette="Set2", linewidth=3)

    ax.get_xaxis().set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ',')))
    plt.xlabel("Quantidade de tweets", fontweight="bold")
    plt.ylabel("Risada", fontweight="bold")
    plt.title("Quais as risadas mais comuns?", fontweight="bold")
    # plt.xticks(range(1, 31, 5), [str(k) for k in range(1, 31, 5)])
    plt.tight_layout()
    # plt.show()
    plt.savefig("figures/risada_int_bars.png", dpi=300)

findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font fam

In [182]:
sns.set_style("ticks")
with plt.xkcd():
    plt.figure(figsize=(12, 4))
    ax = plt.gca()

    sns.lineplot(data=df_int, x="n", y="Count", hue="Risada x Country", palette="Set2", marker=".", linewidth=3)

    ax.get_yaxis().set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ',')))
    plt.xticks(range(1, 31, 5), [str(k) for k in range(1, 31, 5)])
    plt.ylim(0, 1.1 * max(df_int[df_int["n"] > 1]["Count"]))
    plt.xlim(0.5, 30)
    plt.xlabel("Quantidade de repetições", fontweight="bold")
    plt.ylabel("Quantidade de tweets", fontweight="bold")
    plt.title("Qual o comprimento normal de uma risada... globalmente?", fontweight="bold")
    plt.tight_layout()
    # plt.show()
    plt.savefig("figures/risada_int_lineplot_1.png", dpi=300)

findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font fam

In [181]:
sns.set_style("ticks")
with plt.xkcd():
    plt.figure(figsize=(12, 4))
    ax = plt.gca()

    sns.lineplot(data=df_int, x="n", y="Count", hue="Risada x Country", palette="Set2", marker=".", linewidth=3)

    ax.get_yaxis().set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ',')))
    plt.xticks(range(1, 31, 5), [str(k) for k in range(1, 31, 5)])
    plt.ylim(0, 1.1 * max(df_int[df_int["n"] > 5]["Count"]))
    plt.xlim(0.5, 30)
    plt.xlabel("Quantidade de repetições", fontweight="bold")
    plt.ylabel("Quantidade de tweets", fontweight="bold")
    plt.title("Qual o comprimento normal de uma risada... globalmente?", fontweight="bold")
    plt.tight_layout()
    # plt.show()
    plt.savefig("figures/risada_int_lineplot_2.png", dpi=300)

findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font fam

In [168]:
sns.set_style("ticks")
with plt.xkcd():
    plt.figure(figsize=(12, 4))
    sns.lineplot(data=df_int, x="n", y="Count", hue="Risada x Country", linewidth=3, palette="Set2")
    plt.yscale("log")
    plt.xscale("log")
    x = list(range(1, 51, 5))
    plt.xticks(x, [str(k) for k in x])
    plt.ylabel("Quantidade de tweets", fontweight="bold")
    plt.xlabel("Quantidade de repetições", fontweight="bold")
    plt.title("O tamanho das risadas segue uma lei de potência? (pt. 2)", fontweight="bold")
    plt.tight_layout()
    # plt.show()
    plt.savefig("figures/risada_int_loglog.png", dpi=300)

findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font fam

In [169]:
def display_snippets(filename, pos=0):
    """ Displays the snippets from the given file.
     :param filename: name of the file to be plotted
     :param pos: position of the snippet to be displayed"""

    with open(filename, "rb") as f:
        data = pickle.load(f)

    for i, r in enumerate(data):
        m = r[0][0]['items'][pos]
        print(f"-" * 50)
        print(f"qt: {i+1}, total: {r[0][0]['searchInformation']['totalResults']}")
        print(m['title'])
        print("\t", m['snippet'])

In [170]:
counts = [get_count_from_file("data/k_twitter_2s_1-50_count_1.json"),
          get_count_from_file("data/k_twitter_2s_1-50_count_2.json"),
          get_count_from_file("data/k_twitter_2s_1-50_count_3.json"),
          get_count_from_file("data/k_twitter_2s_1-50_count_4.json")]
count = np.concatenate(counts, axis=1)
with open("data/k_twitter_2s_1-50_count.json", "w") as f:
    json.dump(count.tolist(), f, indent=4)

In [171]:
with plt.xkcd():
    plot_bars_results(counts=get_count_from_file("data/k_twitter_2s_1-50_count.json"), ignore_first_ks=1)

findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Humor Sans' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font fam

In [172]:
plot_logscale_results(counts=get_count_from_file("data/k_twitter_2s_1-50_count.json"), ignore_first_ks=1)

plt.xlim((1, 50))

alpha: -2.8905270729722647, beta: 19.036856628533275


(1, 50)

In [173]:
data = get_count_from_file("data/k_twitter_2s_1-50.pkl")
data = np.mean(data, axis=1)
data

array([9.840e+06, 1.755e+05, 4.970e+05, 6.250e+05, 5.140e+05, 3.980e+05,
       4.820e+05, 2.650e+05, 2.405e+05, 1.895e+05, 1.650e+05, 1.400e+05,
       1.300e+05, 9.575e+04, 7.360e+04, 7.655e+04, 6.890e+04, 6.310e+04,
       4.670e+04, 4.850e+04, 3.920e+04, 3.720e+04, 3.170e+04, 3.070e+04,
       2.710e+04, 1.980e+04, 1.900e+04, 1.345e+04, 1.035e+04, 1.102e+04,
       9.985e+03, 7.810e+03, 7.730e+03, 5.650e+03, 6.310e+03, 5.250e+03,
       4.650e+03, 4.510e+03, 4.520e+03, 4.150e+03, 4.240e+03, 4.030e+03,
       3.720e+03, 4.420e+03, 3.060e+03, 2.440e+03, 2.770e+03, 2.970e+03,
       2.135e+03, 2.540e+03])

In [174]:
fit = pwl.Fit(data)
print(f"alpha: {fit.alpha}, xmin: {fit.xmin}, c: {fit.sigma}")

Calculating best minimal value for power law fit
alpha: 1.9731395135844367, xmin: 130000.0, c: 0.26990033956990556


In [175]:
# pwl.plot_ccdf(data, color="C1", marker='o', label="Data")
fit.power_law.plot_ccdf(color="C0", linestyle="--", label="Power Law")

# fit.plot_ccdf(color="C1", marker='o', label="Data")
plt.scatter(data, np.linspace(0, len(data)/50, 50), color="C1", marker='o')
plt.yscale('log')
plt.xscale('log')
plt.xlabel("Quantidade de tweets")
plt.ylabel("Frequência")

Text(0, 0.5, 'Frequência')

In [176]:
fit = pwl.Fit(data[1:], xmax=10**5)
fig = fit.plot_cdf(linewidth=3)
fit.truncated_power_law.plot_cdf(ax=fig, linestyle="--", color="C5", label="Truncated Power Law")
fit.power_law.plot_cdf(ax=fig, linestyle="--", color="C1", label="Power Law")
fit.lognormal.plot_cdf(ax=fig, linestyle="--", color="C2", label="Lognormal")
fit.stretched_exponential.plot_cdf(ax=fig, linestyle="--", color="C3", label="Stretched Exponential")
plt.legend()

Calculating best minimal value for power law fit
xmin progress: 97%

<matplotlib.legend.Legend at 0x1fbc59db350>

In [177]:
pwl.Truncated_Power_Law

powerlaw.Truncated_Power_Law

In [178]:
# Fit data to power law distribution
fit = pwl.Fit(data[1:])

# Get estimated exponent of power law distribution
alpha = fit.alpha

# Perform goodness-of-fit test
D, p = fit.distribution_compare('truncated_power_law', 'lognormal')

print(f"Estimated exponent: {alpha}")
print(f"Distribution compare - Power law vs Lognormal: D = {D}, p = {p}")

Calculating best minimal value for power law fit
Estimated exponent: 1.41050677835177
Distribution compare - Power law vs Lognormal: D = 2.4734212739993264, p = 0.003446025617513327
