In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
import os
import errno
from helpers import *

## Directory for saving plots

In [None]:
# Create the directory that every savefig call in this notebook writes into.
# exist_ok=True makes re-running the cell a no-op when the directory is already
# there, while still raising if the path exists but is not a directory.
os.makedirs("files/tex", exist_ok=True)

### 2D scatter plot of parameters of tag-cotag linear fit

In [None]:
# Load the statistics table; it is expected to contain exactly 168 rows.
final_df = pd.read_csv("results/stats.csv")
n_rows = len(final_df)
assert n_rows == 168

In [None]:
# Show the row(s) whose num_tag equals the largest num_tag in the table.
largest_num_tag = max(final_df['num_tag'])
final_df.loc[final_df['num_tag'] == largest_num_tag]

In [None]:
# Show the row(s) whose num_tag equals the smallest num_tag in the table.
smallest_num_tag = min(final_df['num_tag'])
final_df.loc[final_df['num_tag'] == smallest_num_tag]

In [None]:
# Report the mean of the r_value column.
mean_r_value = np.mean(final_df['r_value'])
print("average correlation", mean_r_value)

In [None]:
# Density histogram of the 'slope' column with its mean and median marked.
fig = plt.figure(figsize=(6, 4))
plt.rcParams.update({'font.size': 20})
ax1 = fig.add_subplot(111)

slopes = final_df['slope']
slope_mean = np.mean(slopes)
slope_median = np.median(slopes)

ax1.hist(slopes.values, bins=20, density=True, color='gray')
ax1.set_xlabel("Slope (Data)")
ax1.set_ylabel("Frequency")

# Solid red line for the mean, dashed orange line for the median.
ax1.axvline(x=slope_mean, c='red', linewidth=5,
            label="mean=%0.2f" % slope_mean)
ax1.axvline(x=slope_median, c='orange', linewidth=5, linestyle="--",
            label="median=%0.2f" % slope_median)

plt.legend(prop={'size': 15})
plt.savefig("files/tex/" + 'slope.pdf', bbox_inches='tight', pad_inches=0.2)
plt.show()


In [None]:
# Filename of the row at the position where r_value is largest.
best_position = np.argmax(final_df['r_value'])
best_row = final_df.iloc[best_position]
best_row["filename"]

## Clustering

In [None]:
# Load the two clustering tables and the statistics table.
# All three are sorted by 'filename'; the plotting cells below pair their
# columns row by row, so the frames must share the same row order.
final_df = (
    pd.read_csv("results/clustering.csv", index_col=0)
    .sort_values('filename')
    .reset_index()
    # DataFrame.drop returns a new frame. The original call discarded that
    # result, so 'weighted_log' was never removed from final_df. The
    # 'weighted_log' values plotted below come from final_dfnorm instead.
    .drop("weighted_log", axis=1)
)
final_dfnorm = (
    pd.read_csv("results/clustering2.csv", index_col=0)
    .sort_values('filename')
    .reset_index()
)
df_stats = pd.read_csv("results/stats.csv").sort_values('filename')

# Each table is expected to contain exactly 168 rows.
assert len(final_df) == 168
assert len(final_dfnorm) == 168
assert len(df_stats) == 168

In [None]:
fig = plt.figure(figsize=(15, 4))
plt.rcParams.update({'font.size': 20})

ax1 = fig.add_subplot(131)
ax2 = fig.add_subplot(132)
ax3 = fig.add_subplot(133)


def plot_cc_normal_helper(ax, xlabel):
    r"""Draw a density histogram of one clustering measure with a normal pdf overlaid.

    Parameters
    ----------
    ax : axes object to draw on.
    xlabel : str
        Selects the data and is also used as the x-axis label. Must be one of
        "$CC$" (final_df["unweighted"]), r"$\ln(CC_w)$" (natural log of
        final_df["weighted"]) or "$CC_{wl}$" (final_dfnorm["weighted_log"]).

    Raises
    ------
    ValueError
        If ``xlabel`` is not one of the three labels above. The original code
        failed later with an unbound local variable in that case.
    """
    if xlabel == "$CC$":
        data = final_df["unweighted"].values
        # The normal parameters are estimated only from values above 0.2,
        # while the histogram below shows every value.
        fit_sample = data[data > 0.2]
    elif xlabel == r"$\ln(CC_w)$":
        data = np.log(final_df["weighted"].values)
        fit_sample = data
    elif xlabel == "$CC_{wl}$":
        data = final_dfnorm["weighted_log"].values
        fit_sample = data
    else:
        raise ValueError("unknown xlabel: %r" % (xlabel,))

    # (mean, std) of the fitting sample; also used for the legend text.
    params = np.mean(fit_sample), np.std(fit_sample)

    # Evaluate the normal pdf on a grid spanning the plotted data.
    lo, hi = min(data), max(data)
    x = np.arange(lo, hi, (hi - lo) / 50)
    ax.hist(data, bins=25, density=True, color='gray')

    fitted = st.norm.pdf(x, loc=params[0], scale=params[1])
    ax.plot(x, fitted, label=("N(%0.2f, %0.2f)" % params), linewidth=5, color='black')
    ax.legend(prop={'size': 15}, markerscale=0.3, handlelength=0.2, loc=2)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Frequency")


# Raw strings keep the LaTeX backslash without relying on Python passing an
# unrecognised escape sequence ("\l") through unchanged.
plot_cc_normal_helper(ax1, "$CC$")
plot_cc_normal_helper(ax2, r"$\ln(CC_w)$")
plot_cc_normal_helper(ax3, "$CC_{wl}$")


plt.tight_layout()
plt.savefig("files/tex/" + 'CC1.pdf', bbox_inches='tight', pad_inches=0.2)

plt.show()

In [None]:
fig = plt.figure(figsize=(15, 4))
plt.rcParams.update({'font.size': 20})

ax1 = fig.add_subplot(131)
ax2 = fig.add_subplot(132)
ax3 = fig.add_subplot(133)


def plot_cc_helper(ax, ylabel):
    r"""Scatter one clustering measure against df_stats["num_post"] on a log x-axis.

    Parameters
    ----------
    ax : axes object to draw on.
    ylabel : str
        Selects the data and is also used as the y-axis label. Must be one of
        "$CC$" (final_df["unweighted"]), r"$\ln(CC_w)$" (natural log of
        final_df["weighted"]) or "$CC_{wl}$" (final_dfnorm["weighted_log"]).

    Raises
    ------
    ValueError
        If ``ylabel`` is not one of the three labels above. The original code
        failed later with an unbound local variable in that case.
    """
    if ylabel == "$CC$":
        data = final_df["unweighted"]
    elif ylabel == r"$\ln(CC_w)$":
        data = np.log(final_df["weighted"])
    elif ylabel == "$CC_{wl}$":
        data = final_dfnorm["weighted_log"]
    else:
        raise ValueError("unknown ylabel: %r" % (ylabel,))

    ax.scatter(df_stats["num_post"], data, s=15, c='black')
    ax.set_xscale('log')
    ax.set_xlabel("Number of Questions")
    ax.set_ylabel(ylabel)

    # Correlation between log(num_post) and the measure; the legend is used
    # only as a text box for this number (handle and marker sizes are zero).
    corr = np.corrcoef(np.log(df_stats["num_post"]), data)
    ax.legend(['Corr = %0.3f' % corr[0][1]], handlelength=0, markerscale=0, prop={"size": 18})


# Raw strings keep the LaTeX backslash without relying on Python passing an
# unrecognised escape sequence ("\l") through unchanged.
plot_cc_helper(ax1, "$CC$")
plot_cc_helper(ax2, r"$\ln(CC_w)$")
plot_cc_helper(ax3, "$CC_{wl}$")


plt.tight_layout()
plt.savefig("files/tex/" + 'CC_indsize.pdf', bbox_inches='tight', pad_inches=0.2)

## Polynomial Fit

In [None]:
# Load the polynomial-fit parameter table and the PC parameter dictionary.
df = pd.read_csv("results/poly_params.csv", index_col=0)

# The .npy file holds a Python object wrapped in a 0-d array (hence .item()),
# so it is stored pickled. NumPy >= 1.16.3 refuses to load pickled data unless
# allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project.
PC_dict = np.load('results/param_PC.npy', allow_pickle=True).item()

assert len(df) == 168
# NOTE(review): the original asserted len(PC_df) == 168, but PC_df is never
# defined (NameError). PC_dict is iterated with .keys() below; its expected
# size is not visible here -- TODO confirm and tighten this check.
assert isinstance(PC_dict, dict)

In [None]:
# Print every entry of PC_dict: the key, a newline, then the stored value.
for key, value in PC_dict.items():
    print(key, "\n", value)

## Theory

In [None]:
# Load the theory comparison table; it is expected to contain exactly 168 rows.
final_df = pd.read_csv("results/theory_cotag_u.csv")
n_rows = len(final_df)
assert n_rows == 168

In [None]:
# Curve returned by plot_CDF_helper for the 'mre_log' column, with dashed gray
# guide lines at y = 0.8 and x = 0.5.
x, y = plot_CDF_helper(final_df['mre_log'])
plt.plot(x, y, color='black', linewidth=5)

guide_style = dict(color='gray', linestyle="--")
plt.axhline(y=0.8, **guide_style)
plt.axvline(x=0.5, **guide_style)

plt.xlabel("Mean Error (ln(Expected+1) - ln(Data+1))")
plt.ylabel("CDF")
plt.savefig("files/tex/" + 'mre_log.pdf', bbox_inches='tight', pad_inches=0.2)

In [None]:
# Curve returned by plot_CDF_helper for the 'sqrt_p' column, with dashed gray
# guide lines at y = 0.8 and x = 0.008.
final_df = pd.read_csv("results/var_zero_post.csv", index_col=0)
x, y = plot_CDF_helper(final_df["sqrt_p"])
plt.axhline(y=0.8, color='gray', linestyle="--")
plt.axvline(x=0.008, color='gray', linestyle="--")

plt.plot(x, y, linewidth=5, color='black')
# Raw string: "\ " and "\h" are not valid Python escapes and trigger an
# invalid-escape warning in a normal literal. The resulting text is unchanged.
plt.xlabel(r"Standard dev.\ $\hat{N}$")
plt.ylabel('CDF')
plt.savefig("files/tex/" + 'variance.pdf', bbox_inches='tight', pad_inches=0.2)

## Tag Frequency Lognormal Fit

In [None]:
# Load the tag-frequency fit table, ordered by filename with a fresh index;
# it is expected to contain exactly 168 rows.
df_final = (
    pd.read_csv("./results/tagfreq.csv", index_col=0)
    .sort_values('filename')
    .reset_index()
)
assert len(df_final) == 168

In [None]:
# Show the row(s) whose filename is "patents".
is_patents = df_final["filename"] == "patents"
df_final.loc[is_patents]

In [None]:
# Machine accuracy: copy each p-value column into a "*_pm" column and raise
# every value below 1e-16 up to 1e-16.
p_floor = 1e-16
for floored_col, raw_col in (('pl_pm', 'pl_p'), ('tpl_pm', 'tpl_p'), ('sexp_pm', 'sexp_p')):
    df_final[floored_col] = df_final[raw_col]
    df_final.loc[df_final[floored_col] < p_floor, [floored_col]] = p_floor

In [None]:
def plot_p_helper(data, label, color, linewidth, style):
    """Plot the curve returned by plot_CDF_helper(data) on the current figure.

    label, color, linewidth are forwarded to plt.plot; style is passed as the
    linestyle. The line is drawn with alpha 0.7.
    """
    cdf_x, cdf_y = plot_CDF_helper(data)
    plt.plot(cdf_x, cdf_y, label=label, color=color,
             linewidth=linewidth, alpha=0.7, linestyle=style)


plt.rcParams.update({'font.size': 20})
fig = plt.figure(figsize=(7, 4))

# One curve per alternative distribution, using only rows with a positive value.
for column, curve_label, curve_color, curve_style in (
    ('pl_pm', "Power Law", 'purple', ':'),
    ('tpl_pm', "Trunc. Power Law", 'r', ':'),
    ('sexp_pm', "Stretched Exp.", 'g', '-.'),
):
    data = df_final[df_final[column] > 0][column]
    plot_p_helper(data, label=curve_label, color=curve_color, linewidth=5, style=curve_style)

plt.axvline(x=0.1, color='black', label="p = 0.1")
plt.xlabel("p-value for Distribution Comparisons")
plt.ylabel("CDF")

plt.legend(handlelength=3)
plt.savefig("files/tex/" + 'compare_p.pdf', bbox_inches='tight', pad_inches=0.2)

In [None]:
# Scatter of the floored p-values against the R values, one marker style per
# alternative distribution, on a log y-axis.
fig = plt.figure(figsize=(7, 4))
plt.rcParams.update({'font.size': 20})

for r_col, p_col, point_color, point_label, point_marker in (
    ('pl_R', 'pl_pm', 'purple', 'Power Law', '^'),
    ('tpl_R', 'tpl_pm', 'r', 'Trunc. Power Law', 's'),
    ('sexp_R', 'sexp_pm', 'g', 'Stretched Exp.', 'D'),
):
    plt.scatter(df_final[r_col], df_final[p_col], color=point_color, s=20,
                alpha=0.3, label=point_label, marker=point_marker)

plt.xlabel("R values")
plt.ylabel('Max(p, 1e-16)')
plt.yscale('log')
plt.ylim(bottom=1e-17, top=10)
# Vertical reference line at R = 0.
plt.axvline(x=0)
plt.legend(markerscale=4)
plt.savefig("files/tex/" + 'p-r-scatter.pdf', bbox_inches='tight', pad_inches=0.2)

In [None]:
# Curves returned by plot_CDF_helper for the KS-statistic column of each fitted
# distribution; the lognormal curve is drawn thicker than the other three.
plt.rcParams.update({'font.size': 20})
fig = plt.figure(figsize=(7, 4))

for ks_col, curve_label, curve_color, curve_width, curve_style in (
    ('ks', 'Lognormal', 'orange', 5, "--"),
    ('pl_ks', "Power Law", 'purple', 3, ":"),
    ('tpl_ks', "Trunc. Power Law", 'r', 3, ":"),
    ('sexp_ks', "Stretched Exp.", 'g', 3, "-."),
):
    x, y = plot_CDF_helper(df_final[ks_col])
    plt.plot(x, y, label=curve_label, color=curve_color,
             linewidth=curve_width, linestyle=curve_style)

# Dotted gray guide lines: (y = 0.8, x = 0.06) and (y = 0.99, x = 0.15).
for guide_y, guide_x in ((0.8, 0.06), (0.99, 0.15)):
    plt.axhline(y=guide_y, linestyle=':', color='gray')
    plt.axvline(x=guide_x, linestyle=':', color='gray')

plt.xlabel("KS Statistic")
plt.ylabel("CDF")
plt.legend(loc=4)
plt.savefig("files/tex/" + 'ks.pdf', bbox_inches='tight', pad_inches=0.2)

In [None]:
# Histograms of the 'mu' and 'sigma' columns on one axes, each with a normal
# pdf overlaid. Rows with filename "patents" are removed first; df_final stays
# filtered for the cells that follow.
df_final = df_final[df_final["filename"] != "patents"]
fig = plt.figure(figsize=(7, 4))
plt.rcParams.update({'font.size': 20})

ax1 = fig.add_subplot(111)

# --- mu ---
ax1.hist(df_final["mu"].values, bins=25, density=True, color='blue', alpha=0.6)

# The normal parameters are estimated only from rows with mu > -1, while the
# histogram above shows every row.
data = df_final[df_final['mu'] > -1]['mu']
params = np.mean(data), np.std(data)
x = np.arange(min(df_final['mu']), max(df_final['mu']), 0.1)
fitted = st.norm.pdf(x, loc=params[0], scale=params[1])
# Raw string: "\m" and "\s" are not valid Python escapes. The text is unchanged.
ax1.plot(x, fitted, label=(r"$\mu \sim$N(%0.2f, %0.2f)" % params),
         linewidth=5, color='black', linestyle=':')

ax1.set_xlabel("Parameters of Lognormal Fits")

# --- sigma ---
ax1.hist(df_final["sigma"].values, bins=25, density=True, color='green', alpha=0.6)

# The normal parameters are estimated only from rows with sigma < 2.5.
data = df_final[df_final['sigma'] < 2.5]['sigma']
params = np.mean(data), np.std(data)
x = np.arange(min(df_final['sigma']), max(df_final['sigma']), 0.1)
fitted = st.norm.pdf(x, loc=params[0], scale=params[1])
ax1.plot(x, fitted, label=r"$\sigma \sim$N(%0.2f, %0.2f)" % params,
         linewidth=5, color='black', linestyle='--')

ax1.set_ylabel("Frequency")
ax1.legend(prop={'size': 18}, loc=2, handlelength=1)
plt.savefig("files/tex/" + 'tag_freq_params.pdf', bbox_inches='tight', pad_inches=0.2)
plt.show()

In [None]:
# Number of rows with a negative R value, in the order pl_R, sexp_R, tpl_R.
tuple(np.sum(df_final[r_col] < 0) for r_col in ('pl_R', 'sexp_R', 'tpl_R'))

In [None]:
def plot_lognorm_fit(filename, fig, xmin = 1):
    """Plot the empirical CCDF of one dataset's "ct" column with four fitted CCDFs.

    Parameters
    ----------
    filename : str
        Dataset name. The data is read from "./cotag_data/<filename>_cotag.csv".
        A trailing ".stackexchange.com" is stripped from the panel title.
    fig : axes object
        Despite its name this is used as an axes: it is passed as ``ax=`` to
        the powerlaw plotting calls, and set_xlabel / set_ylabel / set_title
        are called on it.
    xmin : passed to powerlaw.Fit as ``xmin`` (default 1).

    Relies on the ``powerlaw`` module being imported before the function is
    called.
    """
    site_suffix = ".stackexchange.com"

    df_cotag = pd.read_csv("./cotag_data/" + "%s_cotag.csv" % filename, sep=',', index_col=0)
    data = df_cotag["ct"]

    fit = powerlaw.Fit(data, discrete=True, estimate_discrete=True, xmin=xmin)

    # Plot order matters: it determines the legend handle order and which
    # curves are drawn on top.
    fit.power_law.plot_ccdf(ax=fig, color='purple', linestyle=':', label='Power Law', linewidth=2)
    fit.truncated_power_law.plot_ccdf(ax=fig, color='r', linestyle=':', label='Trunc. Power Law', linewidth=2)
    fit.stretched_exponential.plot_ccdf(ax=fig, color='g', linestyle='-.', label='Stretched Exp.', linewidth=2)
    fit.plot_ccdf(linewidth=3, label='Empirical Data', ax=fig)
    fit.lognormal.plot_ccdf(ax=fig, color='orange', linestyle='--', label='Lognormal', linewidth=3)

    # Every curve above is drawn with plot_ccdf, so the axis shows the
    # complementary CDF. The original label said "CDF".
    fig.set_ylabel("CCDF")
    fig.set_xlabel("Tag Occurrences")

    # Derive the slice length from the suffix instead of hard-coding 18.
    if filename.endswith(site_suffix):
        title = filename[:-len(site_suffix)]
    else:
        title = filename
    fig.set_title(title.upper())

## Examples

In [None]:
import powerlaw

# 2x2 grid of example fits, filled row by row; only the first panel gets a legend.
plt.rcParams.update({'font.size': 15})
f, panel_grid = plt.subplots(2, 2, figsize=(9, 6))
(ax1, ax2), (ax3, ax4) = panel_grid

example_names = ("apple", "coffee", "math.stackexchange.com", "fitness")
for filename, panel in zip(example_names, (ax1, ax2, ax3, ax4)):
    plot_lognorm_fit(filename, panel)

handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles, labels, loc=3, handlelength=4, prop={'size': 10})

plt.tight_layout()
plt.savefig("./files/tex/tagfreq.pdf", bbox_inches='tight', pad_inches=0.2)

## The patents data is different

In [None]:
# Load the patents comparison table; it is expected to contain exactly 169 rows.
final_df = pd.read_csv("./results/patents.csv", index_col=0)
n_rows = len(final_df)
assert n_rows == 169

In [None]:
# Five rows with the largest 'ratio2' values.
by_ratio2_desc = final_df.sort_values(by='ratio2', ascending=False)
by_ratio2_desc.head(5)

## Number of Tags on Post

In [None]:
# Load the statistics table and the tags-per-post table, both ordered by
# filename with a fresh index; each is expected to contain exactly 168 rows.
df_stats = (
    pd.read_csv("results/stats.csv")
    .sort_values('filename')
    .reset_index()
)
final_df = (
    pd.read_csv("results/tag_num.csv")
    .sort_values('filename')
    .reset_index()
)
assert len(final_df) == 168
assert len(df_stats) == 168

In [None]:
plt.rcParams.update({'font.size': 20})
plt.scatter(df_stats['num_post'], final_df['n1p'], s = 7, label = '1', alpha = 0.8, marker = 'v')#, c = 'black')
plt.scatter(df_stats['num_post'], final_df['n2p'], s = 7, label = '2', alpha = 0.8, marker = '1')#, c = 'black')
plt.scatter(df_stats['num_post'], final_df['n3p'], s = 7, label = '3', alpha = 0.8, marker = 'D')#, c = 'black')
plt.scatter(df_stats['num_post'], final_df['n4p'], s = 7, label = '4', alpha = 0.8, marker = 's')#, c = 'black')
plt.scatter(df_stats['num_post'], final_df['n5p'], s = 7, label = '5', alpha = 0.8, marker = '+')#, c = 'black')
# plt.legend()
plt.xscale('log')
plt.xlabel("Number of Questions")
plt.ylabel('Percent of Questions')
plt.savefig("./files/tex/"+ 'num_tag_ppost.pdf', bbox_inches='tight', pad_inches=0.2)

In [None]:
# Column means of n1p..n5p, formatted to two decimals.
"%.2f, %.2f, %.2f, %.2f, %.2f," % tuple(
    np.mean(final_df[pct_col]) for pct_col in ('n1p', 'n2p', 'n3p', 'n4p', 'n5p')
)