## This notebook reproduces the results reported in Section 6.1 "Empirical study: online knowledge community"

In [1]:
import io
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
import statsmodels.api as sm
import statsmodels.formula.api as smf

from statsmodels.iolib.summary2 import summary_col
from statsmodels.discrete.discrete_model import Poisson

from patsy import dmatrices

#### Read in original dataset, the stackexchange dataset

In [2]:
df = pd.read_csv('data/stackexchange.csv')
df.head()

Unnamed: 0,category,post,answer,post_score,answer_score,Sequence,post_idx,answer_idx,logwords,dummy,AnswerHepfulness,QuestionHelpfulness
0,gaming,<p>In Darksiders 3 there are many enemies who ...,"<p>The ways I found to beat blocking enemies, ...",0,0,1.0,0,1,3.433987,0,0.0,0.0
1,gaming,"<p>In Level 2-3 of Super Mario 3D Land, by Mar...",<p>You need to hit it with your Tanooki's suit...,4,7,1.0,2,3,1.791759,1,1.94591,1.386294
2,gaming,"<p>In massively multiplayer games, the servers...",<p>All MMO's have massive amounts of data in d...,46,43,1.0,4,5,3.401197,1,3.7612,3.828641
3,gaming,"<p>In massively multiplayer games, the servers...",<p>It's also a question of cost and predictabi...,46,21,2.0,4,6,4.89784,1,3.044522,3.828641
4,gaming,"<p>In massively multiplayer games, the servers...",<p>They may need to deploy updates to binaries...,46,6,3.0,4,7,3.367296,1,1.791759,3.828641


#### Total number of documents

In [3]:
len(df.post.unique()) + len(df.answer.unique())

51647

#### Read in data (the bow file and vocab file) for topic modeling.

In [4]:
texts = []
with io.open('data/stackexchange.bow', 'r', encoding='utf-8') as f:
    texts = f.read().splitlines()

vocabs = []
with io.open('data/stackexchange.vocab', 'r', encoding='utf-8') as f:
    vocabs = f.read().splitlines()

#### using StableLDA to infer topic vectors. 

In [7]:
from stability import *
from stablelda import StableLDA

first run

In [8]:
bow_file = 'data/stackexchange.bow'
vocab_file = 'data/stackexchange.vocab'

num_topics = 50
num_words = 5000
alpha, beta, eta = 1, 0.01, 1000
epochs = 2
rand_seed = 42

In [None]:
# first model
output_dir = 'data/output/'
stablelda = StableLDA(num_topics, num_words, alpha, beta, eta, rand_seed, output_dir )
stablelda.train(bow_file, vocab_file, epochs)

docs, vocab, theta, phi = load_topic_model_results(bow_file, vocab_file,
                                                     output_dir+'theta.dat', output_dir+'phi.dat')
tm = TopicModel(num_topics, theta, phi, docs, vocab)

tm.print_top_n_words(10)

generate QASimilarity variable, which is the cosine similarity between question and answer topic vector. Note: This variable can be quite unstable if it were generated by LDA model due to its instability.

In [10]:
df['stablelda_sim'] = df.apply(lambda x:1-cosine(theta[x['post_idx']], theta[x['answer_idx']]), axis=1)

run regression: 

AnswerHelpfulness = $\beta_0$ + $\beta_1$StableLDASimilarity + $\beta_2$Sequence + $\beta_3$QuestionHelpfulness + $\beta_4$log(words) + $\epsilon$

In [11]:
y, X = dmatrices(' AnswerHepfulness ~ stablelda_sim  + Sequence + QuestionHelpfulness  + logwords ', data=df, 
                 return_type='dataframe')
model = sm.OLS(y, X)
ols_res = model.fit()
print(ols_res.summary())

                            OLS Regression Results                            
Dep. Variable:       AnswerHepfulness   R-squared:                       0.340
Model:                            OLS   Adj. R-squared:                  0.340
Method:                 Least Squares   F-statistic:                     4246.
Date:                Tue, 30 Aug 2022   Prob (F-statistic):               0.00
Time:                        16:09:16   Log-Likelihood:                -37809.
No. Observations:               32899   AIC:                         7.563e+04
Df Residuals:                   32894   BIC:                         7.567e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -0.2592    

In [12]:
y, X = dmatrices('dummy ~ stablelda_sim  + Sequence + QuestionHelpfulness  + logwords', data=df, 
                 return_type='dataframe')
model = sm.Logit(y, X)
logit_res = model.fit()
print(logit_res.summary())

Optimization terminated successfully.
         Current function value: 0.453723
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  dummy   No. Observations:                32899
Model:                          Logit   Df Residuals:                    32894
Method:                           MLE   Df Model:                            4
Date:                Tue, 30 Aug 2022   Pseudo R-squ.:                  0.1415
Time:                        16:09:16   Log-Likelihood:                -14927.
converged:                       True   LL-Null:                       -17388.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -0.8991      0.149     -6.029      0.000      -1.191      -0.607
st

Note that due to stochasticity, the regression result (effect size) may be different across runs. So the result is slightly different from that reported in Table I. Stable LDA offers much more stable results than that of LDA, in terms of effect size and p-value.

In [13]:
results = summary_col([ols_res, logit_res],stars=True,float_format='%0.3f',
                  model_names=['OLS', 'Logit'],
                  info_dict={'Log-Likelihood': lambda x: "%#8.5g" % x.llf,
                     'AIC': lambda x: "%#8.5g" % x.aic} )
print(results)


                       OLS      Logit  
---------------------------------------
Intercept           -0.259*** -0.899***
                    (0.043)   (0.149)  
stablelda_sim       0.178***  0.399*** 
                    (0.043)   (0.151)  
Sequence            -0.216*** -0.523***
                    (0.003)   (0.011)  
QuestionHelpfulness 0.472***  0.539*** 
                    (0.004)   (0.016)  
logwords            0.221***  0.672*** 
                    (0.005)   (0.018)  
AIC                 75628.    29864.   
Log-Likelihood      -37809.   -14927.  
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


second run. We retrain a topic model, re-calculate the QASimilarity variable, and re-run the regression model

In [None]:
# second model
output_dir = 'data/output/'
random_seed = 24
stablelda = StableLDA(num_topics, num_words, alpha, beta, eta, rand_seed, output_dir )
stablelda.train(bow_file, vocab_file, epochs)

docs, vocab, theta, phi = load_topic_model_results(bow_file, vocab_file,
                                                     output_dir+'theta.dat', output_dir+'phi.dat')
tm = TopicModel(num_topics, theta, phi, docs, vocab)

tm.print_top_n_words(10)

In [15]:
df['stablelda_sim'] = df.apply(lambda x:1-cosine(theta[x['post_idx']], theta[x['answer_idx']]), axis=1)

In [16]:
## linear regression
y, X = dmatrices('AnswerHepfulness ~ stablelda_sim  + Sequence + QuestionHelpfulness  + logwords ', data=df, 
                 return_type='dataframe')
model = sm.OLS(y, X)
ols_res = model.fit()
print(ols_res.summary())

                            OLS Regression Results                            
Dep. Variable:       AnswerHepfulness   R-squared:                       0.340
Model:                            OLS   Adj. R-squared:                  0.340
Method:                 Least Squares   F-statistic:                     4245.
Date:                Tue, 30 Aug 2022   Prob (F-statistic):               0.00
Time:                        16:12:16   Log-Likelihood:                -37809.
No. Observations:               32899   AIC:                         7.563e+04
Df Residuals:                   32894   BIC:                         7.567e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -0.2567    

In [17]:
## logit regression
y, X = dmatrices('dummy ~ stablelda_sim  + Sequence + QuestionHelpfulness  + logwords', data=df, 
                 return_type='dataframe')
model = sm.Logit(y, X)
logit_res = model.fit()
print(logit_res.summary())

Optimization terminated successfully.
         Current function value: 0.453709
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  dummy   No. Observations:                32899
Model:                          Logit   Df Residuals:                    32894
Method:                           MLE   Df Model:                            4
Date:                Tue, 30 Aug 2022   Pseudo R-squ.:                  0.1416
Time:                        16:12:16   Log-Likelihood:                -14927.
converged:                       True   LL-Null:                       -17388.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -0.9206      0.148     -6.201      0.000      -1.212      -0.630
st

In [18]:
results = summary_col([ols_res, logit_res],stars=True,float_format='%0.3f',
                  model_names=['OLS', 'Logit'],
                  info_dict={'Log-Likelihood': lambda x: "%#8.5g" % x.llf,
                     'AIC': lambda x: "%#8.5g" % x.aic} )
print(results)


                       OLS      Logit  
---------------------------------------
Intercept           -0.257*** -0.921***
                    (0.042)   (0.148)  
stablelda_sim       0.175***  0.422*** 
                    (0.042)   (0.150)  
Sequence            -0.215*** -0.523***
                    (0.003)   (0.011)  
QuestionHelpfulness 0.472***  0.539*** 
                    (0.004)   (0.016)  
logwords            0.221***  0.673*** 
                    (0.005)   (0.018)  
AIC                 75628.    29863.   
Log-Likelihood      -37809.   -14927.  
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


#### takeaways

In the first linear regression model, the coefficient of QASimilarity (stablelda_sim) is 0.1784, and pvalue is 0.000

In the second linear regression model, the coefficient of QASimilarity (stablelda_sim) is 0.1749, and pvalue is 0.000

In the first logit regression model, the coefficient of QASimilarity (stablelda_sim) is 0.3992, and pvalue is 0.008

In the second logit regression model, the coefficient of QASimilarity (stablelda_sim) is 0.4215, and pvalue is 0.005

The effect size and pvalue is stable. To reproduce results in Figure 4 and Figure 5, please run Stable LDA 10 times, save the QASimilarity and regression results, and examine the effect size and p-value of QASimilarity.

#### using LDA to infer topic vectors. 

In [27]:
import pickle
import gensim

In [23]:
gensimcorpus = pickle.load( open('data/stackexchange.gaming.corpus.gensim', 'rb'))
id2word = pickle.load( open('data/stackexchange.gaming.id2word.gensim', 'rb'))

In [26]:
len(gensimcorpus), len(id2word)

(51647, 5000)

LDA first run

In [43]:
lda_model = gensim.models.LdaMulticore(gensimcorpus, num_topics= num_topics, alpha='symmetric', id2word=id2word, passes=10)

In [44]:
lda_theta = []
for bow in gensimcorpus:
    prob = [ i[1] for i in lda_model.get_document_topics(bow, minimum_probability=0)]
    lda_theta.append(prob)
df['lda_sim'] = df.apply(lambda x:1-cosine(lda_theta[x['post_idx']], lda_theta[x['answer_idx']]), axis=1)

In [45]:
y, X = dmatrices(' AnswerHepfulness ~ lda_sim  + Sequence + QuestionHelpfulness  + logwords ', data=df, 
                 return_type='dataframe')
model = sm.OLS(y, X)
ols_res = model.fit()
print(ols_res.summary())

                            OLS Regression Results                            
Dep. Variable:       AnswerHepfulness   R-squared:                       0.340
Model:                            OLS   Adj. R-squared:                  0.340
Method:                 Least Squares   F-statistic:                     4239.
Date:                Tue, 06 Sep 2022   Prob (F-statistic):               0.00
Time:                        10:59:35   Log-Likelihood:                -37818.
No. Observations:               32899   AIC:                         7.565e+04
Df Residuals:                   32894   BIC:                         7.569e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -0.0975    

In [46]:
y, X = dmatrices('dummy ~ lda_sim  + Sequence + QuestionHelpfulness  + logwords', data=df, 
                 return_type='dataframe')
model = sm.Logit(y, X)
logit_res = model.fit()
print(logit_res.summary())

Optimization terminated successfully.
         Current function value: 0.453824
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  dummy   No. Observations:                32899
Model:                          Logit   Df Residuals:                    32894
Method:                           MLE   Df Model:                            4
Date:                Tue, 06 Sep 2022   Pseudo R-squ.:                  0.1413
Time:                        10:59:36   Log-Likelihood:                -14930.
converged:                       True   LL-Null:                       -17388.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -0.5329      0.058     -9.237      0.000      -0.646      -0.420
ld

LDA second run

In [39]:
lda_model = gensim.models.LdaMulticore(gensimcorpus, num_topics= num_topics, alpha='symmetric', id2word=id2word, passes=10)

In [40]:
lda_theta = []
for bow in gensimcorpus:
    prob = [ i[1] for i in lda_model.get_document_topics(bow, minimum_probability=0)]
    lda_theta.append(prob)
df['lda_sim'] = df.apply(lambda x:1-cosine(lda_theta[x['post_idx']], lda_theta[x['answer_idx']]), axis=1)

In [41]:
y, X = dmatrices(' AnswerHepfulness ~ lda_sim  + Sequence + QuestionHelpfulness  + logwords ', data=df, 
                 return_type='dataframe')
model = sm.OLS(y, X)
ols_res = model.fit()
print(ols_res.summary())

                            OLS Regression Results                            
Dep. Variable:       AnswerHepfulness   R-squared:                       0.340
Model:                            OLS   Adj. R-squared:                  0.340
Method:                 Least Squares   F-statistic:                     4239.
Date:                Tue, 06 Sep 2022   Prob (F-statistic):               0.00
Time:                        10:56:32   Log-Likelihood:                -37817.
No. Observations:               32899   AIC:                         7.564e+04
Df Residuals:                   32894   BIC:                         7.569e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -0.0992    

In [42]:
y, X = dmatrices('dummy ~ lda_sim  + Sequence + QuestionHelpfulness  + logwords', data=df, 
                 return_type='dataframe')
model = sm.Logit(y, X)
logit_res = model.fit()
print(logit_res.summary())

Optimization terminated successfully.
         Current function value: 0.453726
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  dummy   No. Observations:                32899
Model:                          Logit   Df Residuals:                    32894
Method:                           MLE   Df Model:                            4
Date:                Tue, 06 Sep 2022   Pseudo R-squ.:                  0.1415
Time:                        10:56:32   Log-Likelihood:                -14927.
converged:                       True   LL-Null:                       -17388.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -0.5193      0.058     -9.004      0.000      -0.632      -0.406
ld

#### takeaways

In the first linear regression model, the coefficient of QASimilarity (lda_sim) is 0.0029, and pvalue is 0.991

In the second linear regression model, the coefficient of QASimilarity (lda_sim) is 0.0252, and pvalue is 0.349

In the first logit regression model, the coefficient of QASimilarity (lda_sim) is  -0.0471, and pvalue is 0.594 

In the second logit regression model, the coefficient of QASimilarity (lda_sim) is -0.2347, and pvalue is 0.009

The effect size and pvalue is unstable. Using Logit regression, the variable\'s estimation is insignificant in the first run, but becomes significanly negative in the second run.

This is the problem  using LDA for variable generation in regression analysis -- sometimes you get significant results but sometimes you get completely opposite results. 


#### We conduct robustness check using TF-IDF. 

In [42]:
vectorizer = TfidfVectorizer(vocabulary=vocabs)

In [43]:
X = vectorizer.fit_transform(texts)

In [44]:
tfidfsim = []
for idx, row in df.iterrows():
    post_tfidf = X[row.post_idx].todense()
    answer_tfidf = X[row.answer_idx].todense()
    tfidfsim.append( 1-cosine(post_tfidf, answer_tfidf) )
df['tfidfsim'] = pd.Series(list(tfidfsim))

run regression to examine the relationship between QA similarity and answer helpfulness

In [46]:
y, X = dmatrices('AnswerHepfulness ~ tfidfsim  + Sequence + QuestionHelpfulness  + logwords', data=df, 
                 return_type='dataframe')
model = sm.OLS(y, X)
ols_res = model.fit()
print(ols_res.summary())

                            OLS Regression Results                            
Dep. Variable:       AnswerHepfulness   R-squared:                       0.342
Model:                            OLS   Adj. R-squared:                  0.342
Method:                 Least Squares   F-statistic:                     4267.
Date:                Sat, 27 Aug 2022   Prob (F-statistic):               0.00
Time:                        15:03:36   Log-Likelihood:                -37780.
No. Observations:               32898   AIC:                         7.557e+04
Df Residuals:                   32893   BIC:                         7.561e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -0.1350    

In [47]:
y, X = dmatrices('dummy ~ tfidfsim  + Sequence + QuestionHelpfulness  + logwords', data=df, 
                 return_type='dataframe')
model = sm.Logit(y, X)
logit_res = model.fit()
print(logit_res.summary())

Optimization terminated successfully.
         Current function value: 0.453120
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  dummy   No. Observations:                32898
Model:                          Logit   Df Residuals:                    32893
Method:                           MLE   Df Model:                            4
Date:                Sat, 27 Aug 2022   Pseudo R-squ.:                  0.1427
Time:                        15:03:42   Log-Likelihood:                -14907.
converged:                       True   LL-Null:                       -17388.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -0.6344      0.059    -10.738      0.000      -0.750      -0.519
tf

reproduce Table H 

In [48]:
results = summary_col([ols_res, logit_res],stars=True,float_format='%0.3f',
                  model_names=['OLS', 'Logit'],
                  info_dict={'Log-Likelihood': lambda x: "%#8.5g" % x.llf,
                     'AIC': lambda x: "%#8.5g" % x.aic} )
print(results)


                       OLS      Logit  
---------------------------------------
Intercept           -0.135*** -0.634***
                    (0.018)   (0.059)  
tfidfsim            0.202***  0.550*** 
                    (0.024)   (0.082)  
Sequence            -0.213*** -0.516***
                    (0.003)   (0.011)  
QuestionHelpfulness 0.473***  0.542*** 
                    (0.004)   (0.016)  
logwords            0.207***  0.634*** 
                    (0.005)   (0.017)  
AIC                 75571.    29823.   
Log-Likelihood      -37780.   -14907.  
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
