In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from helpers import *
import os 
import errno
from scipy.stats import hypergeom
from scipy.special import comb

## Directory for saving plots

In [None]:
# Make sure the figure output directory exists. exist_ok=True makes re-running
# this cell a no-op, while still raising if "files/tex" exists but is not a
# directory (the old errno.EEXIST check silently accepted that case).
os.makedirs("files/tex", exist_ok=True)

### slope of tag-cotag linear fit

In [None]:
# Summary statistics, one row per dataset: `final_df` comes from the generated
# (model) results and `final_df_0` from the original data. Both are ordered by
# filename and re-indexed 0..n-1 because later cells compare them row by row.
# NOTE(review): this presumes both files sort into the same dataset order — confirm.
final_df = (
    pd.read_csv("gen_results/gen_stats.csv")
    .sort_values("filename")
    .reset_index()
)
final_df_0 = (
    pd.read_csv("results/stats.csv")
    .sort_values("filename")
    .reset_index()
)

# Both tables are expected to hold exactly 168 rows.
assert len(final_df) == 168
assert len(final_df_0) == 168

In [None]:
# Row-wise relative deviation of the model's question count from the data's.
diff = (final_df["num_post"] - final_df_0["num_post"]).abs() / final_df_0["num_post"]
print("difference in total number of questions", diff.mean())
print("max difference", diff.max())

In [None]:
# Squared gap between the fitted slope and the theoretical slope, per row,
# for the model output and for the data.
sq_err_model = (final_df['slope'] - final_df['theory_slope']) ** 2
sq_err_data = (final_df_0['slope'] - final_df_0['theory_slope']) ** 2

# Sum divided by the row count (not .mean()) so missing values are treated
# exactly as before: skipped in the sum but still counted in the denominator.
mse_model = sq_err_model.sum() / len(sq_err_model)
mse_data = sq_err_data.sum() / len(sq_err_data)
print("difference in slope between theory and model: %0.3f, data: %0.3f" % (mse_model, mse_data))

In [None]:


plt.rcParams.update({'font.size': 20})

# Correlation between model and data slopes; it is the only legend entry.
corr = np.corrcoef(final_df['slope'], final_df_0['slope'])
corr_label = 'Corr = %0.3f' % corr[0][1]

plt.scatter(final_df['slope'], final_df_0['slope'], alpha=0.4, s=18, color='black')
plt.xlabel("Slope (Model)")
plt.ylabel("Slope (Data)")
# handlelength=0 / markerscale=0 hide the marker so the legend shows text only.
plt.legend([corr_label], handlelength=0, markerscale=0, loc=4)

plt.savefig("files/tex/gen_slope.pdf", bbox_inches='tight', pad_inches=0.2)

In [None]:
# Average of the `r_value` column over the generated results
# (presumably the r-value of the tag-cotag linear fit — TODO confirm).
print("mean correlation in generated", final_df['r_value'].mean())

## Clustering

In [None]:
# Clustering results for the model (`final_df`, `final_dfnorm`) and for the
# data (`final_df_0`, `final_df_0norm`), each ordered by filename.
#
# The `weighted_log` column of the first file in each pair is dropped; the
# `weighted_log` values plotted below are read from the "*2.csv" frames.
# DataFrame.drop returns a new frame, so the result has to be assigned —
# previously the return value was discarded and the drop had no effect.
final_df = pd.read_csv("gen_results/gen_clustering.csv", index_col = 0).sort_values("filename").reset_index()
final_df = final_df.drop("weighted_log", axis = 1)
final_dfnorm = pd.read_csv("gen_results/gen_clustering2.csv", index_col = 0).sort_values("filename").reset_index()

final_df_0 = pd.read_csv("results/clustering.csv").sort_values("filename").reset_index()
final_df_0 = final_df_0.drop("weighted_log", axis = 1)
final_df_0norm = pd.read_csv("results/clustering2.csv").sort_values("filename").reset_index()

# Generated summary statistics; `num_post` is used as the network size below.
df_stats = pd.read_csv("gen_results/gen_stats.csv", index_col = 0).sort_values("filename").reset_index()

# Every table is expected to hold exactly 168 rows.
assert len(final_dfnorm) == 168
assert len(final_df) == 168
assert len(final_df_0norm) == 168
assert len(final_df_0) == 168
assert len(df_stats) == 168

In [None]:
# Set the font size before the axes are created so the cell renders the same
# regardless of which cell last touched rcParams.
plt.rcParams.update({'font.size': 20})
f, (ax1, ax2) = plt.subplots(1, 2) #, sharey=True
f.set_size_inches(9,4)

# Left panel: unweighted clustering against the `weighted_log` measure,
# model and data overlaid.
ax1.scatter(final_df["unweighted"].values, final_dfnorm["weighted_log"].values, s = 10, marker = 's', label = 'Model', alpha = 0.4)
ax1.scatter(final_df_0["unweighted"].values, final_df_0norm["weighted_log"].values, s = 10, marker = 'o', label = 'Data', alpha = 0.4)
ax1.set_ylabel("$C_{wl}$", labelpad=19)
ax1.set_xlabel("$C$", labelpad=10)
ax1.legend(prop={'size': 15}, markerscale=3, loc=2)
# The printed correlation is for the data only.
corr = np.corrcoef(final_df_0["unweighted"], final_df_0norm["weighted_log"])
print(['Corr = %0.3f'%corr[0][1]])

# Right panel: unweighted clustering against the log of the weighted measure.
ax2.scatter(final_df["unweighted"].values, np.log(final_df["weighted"].values), s = 10, marker = 's', alpha = 0.4)
ax2.scatter(final_df_0["unweighted"].values, np.log(final_df_0["weighted"].values), s = 10, marker = 'o', alpha = 0.4)
corr = np.corrcoef(final_df_0["unweighted"], np.log(final_df_0["weighted"]))
print(['Corr = %0.3f'%corr[0][1]])
# Raw string: "\l" is not a valid escape sequence in a normal string literal
# and triggers a SyntaxWarning on recent Python versions.
ax2.set_ylabel(r"$\ln(C_w)$", labelpad=10)
ax2.set_xlabel("$C$", labelpad=10)

plt.tight_layout()
plt.savefig("files/tex/"+'CC2.pdf', bbox_inches='tight', pad_inches=0.2)
plt.show()

In [None]:
plt.rcParams.update({'font.size': 20})
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize = (15,4))

# One panel per clustering measure: (axes, model values, data values, symbol).
# Raw strings are used for the LaTeX labels because "\l" is not a valid escape
# sequence in a normal string literal (SyntaxWarning on recent Python).
panels = [
    (ax1, final_df["unweighted"], final_df_0["unweighted"], r"$C$"),
    (ax2, np.log(final_df["weighted"]), np.log(final_df_0["weighted"]), r"$\ln(C_w)$"),
    (ax3, final_dfnorm["weighted_log"], final_df_0norm["weighted_log"], r"$C_{wl}$"),
]

for ax, model_vals, data_vals, symbol in panels:
    ax.scatter(model_vals, data_vals, alpha = 0.4, s = 25, color = 'black')
    ax.set_xlabel("%s (model)" % symbol)
    ax.set_ylabel("%s (data)" % symbol)
    # Model-vs-data correlation, shown as a text-only legend entry.
    corr = np.corrcoef(model_vals, data_vals)
    ax.legend(['Corr = %0.3f'%corr[0][1]], handlelength=0, markerscale=0, loc = 4)

plt.tight_layout()
plt.savefig("files/tex/gen_"+'CC.pdf', bbox_inches='tight', pad_inches=0.2)

In [None]:
# Three side-by-side panels, one per clustering measure.
plt.rcParams.update({'font.size': 20})
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize = (15,4))

def plot_cc_helper(ax,ylabel):
    r"""Scatter one clustering measure against the number of questions.

    The measure is chosen by ``ylabel``, which is also used verbatim as the
    y-axis label. The x axis (``df_stats["num_post"]``) is logarithmic, and the
    legend shows the correlation between ``log(num_post)`` and the measure.

    Reads the notebook-level frames ``final_df``, ``final_dfnorm`` and
    ``df_stats`` at call time.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    ylabel : str
        One of ``"$CC$"``, ``"$\ln(CC_w)$"`` or ``"$CC_{wl}$"``.

    Raises
    ------
    ValueError
        If ``ylabel`` is not one of the supported labels (previously this
        surfaced as an unbound-variable error further down).
    """
    # Lookups are deferred with lambdas so only the requested column is read.
    # Raw strings avoid the invalid "\l" escape of a plain "$\ln...$" literal.
    selectors = {
        r"$CC$": lambda: final_df["unweighted"],
        r"$\ln(CC_w)$": lambda: np.log(final_df["weighted"]),
        r"$CC_{wl}$": lambda: final_dfnorm["weighted_log"],
    }
    if ylabel not in selectors:
        raise ValueError("unknown ylabel %r; expected one of %s" % (ylabel, sorted(selectors)))
    data = selectors[ylabel]()

    ax.scatter(df_stats["num_post"], data, s = 15, c = 'black')
    ax.set_xscale('log')
    ax.set_xlabel("Number of Questions")
    ax.set_ylabel(ylabel)
    corr = np.corrcoef(np.log(df_stats["num_post"]), data)
    # Text-only legend entry carrying the correlation.
    ax.legend(['Corr = %0.3f'%corr[0][1]], handlelength=0, markerscale=0, prop = {"size":18})

    
# One panel per measure. Raw strings keep the LaTeX backslash without relying
# on "\l" being an unrecognised (and now warned-about) escape sequence.
plot_cc_helper(ax1, r"$CC$")
plot_cc_helper(ax2, r"$\ln(CC_w)$")
plot_cc_helper(ax3, r"$CC_{wl}$")


plt.tight_layout()
plt.savefig("files/tex/"+'CC_indsize.pdf', bbox_inches='tight', pad_inches=0.2)

## Polynomial Fit

In [None]:
# Polynomial-fit parameters: `df` for the generated (model) results and
# `df_0` for the original data. Rows with missing values are discarded.
df  = pd.read_csv("gen_results/gen_poly_params.csv", index_col = 0).dropna().sort_values("filename")

# The .npy file stores a dictionary inside a 0-d object array, which NumPy can
# only read back by unpickling; since NumPy 1.16.3 that requires an explicit
# allow_pickle=True. NOTE(review): unpickling can execute arbitrary code —
# only load files produced by this project.
PC_dict = np.load('gen_results/gen_param_PC.npy', allow_pickle=True).item()

df_0  = pd.read_csv("results/poly_params.csv", index_col = 0).dropna().sort_values("filename")
# The 'patents' dataset is excluded from the data-side table.
df_0 = df_0[df_0['filename']!='patents']

assert len(df) == 167
assert len(df_0) == 167
# The previous assertions referred to `PC_df` / `PC_df_0`, which are not defined
# anywhere in the visible notebook and raise NameError on a fresh kernel.
# Validate what was actually loaded instead: the keys the next cell reads.
assert {"PC1", "PC2", "PC_mean"} <= set(PC_dict.keys())

In [None]:
# Show every entry of the loaded dictionary, then pull out the three that are
# given their own names.
for k, entry in PC_dict.items():
    print(k, "\n",  entry)

PC1, PC2, PC_mean = PC_dict["PC1"], PC_dict["PC2"], PC_dict["PC_mean"]

In [None]:

def log_pre(df, x_log = True, y_log = True):
    """Return the rows of ``df`` in which no column is equal to zero.

    Presumably a pre-processing step for log-scaled plots, where zeros cannot
    be drawn. ``x_log`` and ``y_log`` are accepted but have no effect.
    """
    # True for a row only when every one of its values differs from 0.
    row_has_no_zero = (df != 0).all(axis=1)
    return df[row_has_no_zero]


def plot_cotag(filename, ax1, params, info):
    """Plot unique-cotag counts against tag occurrence counts for one dataset.

    On log-log axes this overlays, for the dataset ``filename``:

    * a scatter of ``cotag_u`` against ``ct`` for the data and for the model,
    * the fitted degree-3 polynomial (evaluated in log space) for each, and
    * the ``expected`` column of ``./files/theory_unique/<filename>.csv``.

    Parameters
    ----------
    filename : str
        Dataset name. Used to build the CSV paths and, upper-cased, as title.
    ax1 : matplotlib axes
        Axes to draw on.
    params : sequence of two DataFrames
        ``params[0]`` holds the fit coefficients for the data and ``params[1]``
        those for the model. Each needs the columns ``filename`` and
        ``d0`` .. ``d3``. Model rows are looked up as ``"gen_<filename>"``.
    info : any
        Unused; kept so existing callers keep working.

    Raises
    ------
    ValueError
        If a parameter table does not contain exactly one row for the dataset.
    """
    # Index 0 is always the data, index 1 the model.
    csv_lst = [
        "./cotag_data/%s_cotag.csv" % filename,
        "./files/generated/cotag_files/gen_%s_cotag.csv" % filename,
    ]
    legend_name = ["Data", "Model"]
    color_lst = ['b', 'r']
    markers = ['^', 's']
    styles = ['-', '--']
    cs = ['green', 'orange']

    # Load both tables up front, drop rows containing a zero (see log_pre)
    # and order them by tag count so the fitted line is drawn left to right.
    df_lst = []
    for path in csv_lst:
        table = log_pre(pd.read_csv(path, index_col=0))
        df_lst.append(table.sort_values('ct').reset_index())

    ax1.set_xscale("log")
    ax1.set_yscale("log")
    ax1.set_ylabel("Number of Unique Cotags, $k_U$")
    ax1.set_xlabel("Number of Tag Occurrences, $x_i$")
    ax1.set_title(filename.upper())

    for i, df_ori in enumerate(df_lst):
        ax1.scatter(np.array(df_ori['ct']), np.array(df_ori['cotag_u']), alpha=0.4,
                    color = color_lst[i], label = legend_name[i], marker = markers[i], s = 5)

        # Coefficients of the fitted polynomial for this dataset.
        param_df = params[i]
        fn = filename if i == 0 else 'gen_%s' % filename
        row = param_df[param_df['filename'] == fn]
        if len(row) != 1:
            raise ValueError(
                "expected exactly one parameter row for %r, found %d" % (fn, len(row))
            )
        # .iloc[0] extracts the scalar explicitly; float(<Series>) is deprecated
        # in recent pandas releases.
        d3, d2, d1, d0 = (float(row[name].iloc[0]) for name in ('d3', 'd2', 'd1', 'd0'))

        # The polynomial is evaluated at log(ct + 1) and mapped back with exp.
        log_ct = np.log(df_ori["ct"] + 1)
        fitted = np.exp(d3 * log_ct**3 + d2 * log_ct**2 + d1 * log_ct + d0)
        ax1.plot(df_ori["ct"], fitted, label = legend_name[i] + " fit",
                 linewidth = 3, linestyle = styles[i], c = cs[i])

    ## theory
    theory_df = pd.read_csv("./files/theory_unique/"+filename+".csv")
    ax1.scatter(theory_df["ct"], theory_df['expected'], label = "Expected", s = 10, marker = '+')
        
        

## Examples

In [None]:
# 2x2 grid of example datasets, one panel each.
plt.rcParams.update({'font.size': 20})
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize = (14,11))
fig.subplots_adjust(hspace=0.5)

example_panels = [
    ("apple", ax1),
    ("coffee", ax2),
    ("math.stackexchange.com", ax3),
    ("fitness", ax4),
]
for filename, panel in example_panels:
    plot_cotag(filename, panel, params =[df_0,df], info = df_stats)

# Only the first panel carries a legend.
ax1.legend(markerscale=5, prop = {"size":20})

plt.tight_layout()
plt.savefig("./files/tex/"+ 'post_uniquec.pdf', bbox_inches='tight', pad_inches=0.2)

In [None]:
# Note: this rebinds `df_stats` to the statistics of the original data.
df_stats = pd.read_csv("results/stats.csv", index_col = 0)

# Print the size columns for each of the four example datasets.
size_columns = ["filename",  "num_post", "total_tag", "num_tag" ]
for site in ("apple", "coffee", "math.stackexchange.com", "fitness"):
    print(df_stats[df_stats["filename"] == site + ".txt"][size_columns])

In [None]:
plt.rcParams.update({'font.size': 15})
fig, ax1 = plt.subplots(figsize = (6,4))

# CDF of the fit error for the cubic and the linear fit, model and data.
# Each entry is (error column, line style keywords); the second curve uses the
# default line style.
curves = [
    (df['mse'],       {"color": 'blue',   "label": "Degree 3 Fit (Model)", "linestyle": '-.'}),
    (df['mse_lin'],   {"color": 'red',    "label": "Degree 1 Fit (Model)"}),
    (df_0['mse'],     {"color": 'orange', "label": "Degree 3 Fit (Data)",  "linestyle": ":"}),
    (df_0['mse_lin'], {"color": 'green',  "label": "Degree 1 Fit (Data)",  "linestyle": "--"}),
]
for errors, line_kwargs in curves:
    X,y = plot_CDF_helper(errors)
    ax1.plot(X, y, linewidth=5, **line_kwargs)

ax1.set_xlabel("MSE")
ax1.set_ylabel("CDF")

ax1.legend(markerscale=0.2,handlelength=2.6)
plt.savefig("files/tex/"+'MSE.pdf', bbox_inches='tight', pad_inches=0.2)
plt.show()

## Number of tags on Questions

In [None]:
# Tables for the tags-per-question analysis, each ordered by filename and
# re-indexed so they can be combined row by row:
#   df_stats   - statistics of the generated results
#   final_df_0 - tag-number table of the original data
#   final_df   - tag-number table of the generated results
df_stats = (
    pd.read_csv("gen_results/gen_stats.csv")
    .sort_values('filename')
    .reset_index()
)
final_df_0 = (
    pd.read_csv("results/tag_num.csv")
    .sort_values('filename')
    .reset_index()
)
final_df = (
    pd.read_csv("gen_results/gen_tag_num.csv")
    .sort_values('filename')
    .reset_index()
)

# Every table is expected to hold exactly 168 rows.
assert len(df_stats) == 168
assert len(final_df_0) == 168
assert len(final_df) == 168

In [None]:
from matplotlib import ticker
plt.rcParams.update({'font.size': 20})

fig1, ax1 = plt.subplots()

# One scatter series per column n1p .. n5p, model on x and data on y; the legend
# entry c<k> is the model-vs-data correlation of that column.
# Each series uses the same column on both axes. The third series used to plot
# the data's 'n4p' against the model's 'n3p', which did not match its label.
series_markers = ['v', '1', 'D', 's', '+']
for k, marker in enumerate(series_markers, start=1):
    column = 'n%dp' % k
    corr = np.corrcoef(final_df[column], final_df_0[column])[0][1]
    ax1.scatter(final_df[column], final_df_0[column], s = 15,
                label = 'c%d=%0.2f' % (k, corr), alpha = 0.5, marker = marker)

ax1.legend(loc = 'upper left',handlelength=1, markerscale=2, prop = {"size" : 16})
ax1.set_xlabel('Fraction (Model)')
ax1.set_ylabel('Fraction (Data)')


plt.savefig("./files/tex/"+ 'gen_num_tag_ppost.pdf', bbox_inches='tight', pad_inches=0.2)

In [None]:
# Per-row ratio of the `nm` count to the number of questions, averaged over all rows.
print("average percentage of morethan 5 tag questions", (final_df['nm']/df_stats['num_post']).mean())

In [None]:
# CDF of the per-row ratio nm / num_post.
x,y = plot_CDF_helper(final_df['nm']/df_stats['num_post'])
plt.plot(x, y, linewidth = 5, color = 'black')

# Dashed grey guide lines at CDF = 0.8 and at a fraction of 0.045.
guide_style = {"color": 'gray', "linestyle": "--"}
plt.axhline(y = 0.8, **guide_style)
plt.axvline(x = 0.045, **guide_style)

plt.xlabel("Fraction of Questions with More Than 5 Tags")
plt.ylabel("CDF")
plt.savefig("files/tex/"+'more_than_5.pdf', bbox_inches='tight', pad_inches=0.2)