## Read Data (replies have already been excluded)

In [1]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords, wordnet
import pandas as pd

# Load the reply-free tweet export and report its size before any cleaning.
text = pd.read_csv('Jan6(excludes replies).csv')
raw_tweet_count = len(text)
print('The number of tweets (excludes replies) before data pre-processing:', raw_tweet_count)

The number of tweets (excludes replies) before data pre-processing: 6501


## Data Pre-processing

In [2]:
import gensim
import gensim.corpora as corpora
import re
porter = PorterStemmer()
stop_words = stopwords.words('english')
# Set copy for O(1) membership tests; `stop_words` itself stays a list.
stop_word_set = set(stop_words)


def preprocess_tweet(tweet):
    """Return the cleaned, stemmed form of one tweet.

    Steps, applied to each whitespace-separated token:
      1. drop tokens containing '@' (user mentions);
      2. strip '#' characters (keeps the hashtag word itself);
      3. drop tokens containing 'http' (URLs) or '&amp' (HTML entities);
      4. replace every non-alphabetic character with a space and lower-case;
      5. keep the resulting pieces that are longer than 2 characters and are
         not English stop words, and Porter-stem them.

    Parameters
    ----------
    tweet : object
        Raw tweet text. Missing values (NaN/None) yield an empty string
        instead of the literal token 'nan' that ``str(nan)`` would produce.

    Returns
    -------
    str
        The stems, each followed by a single space ('' when nothing is left).
    """
    if pd.isna(tweet):
        return ''
    stems = []
    for raw_token in str(tweet).split():  # tokenization
        if '@' in raw_token:  # remove @users
            continue
        token = raw_token.replace('#', '')  # remove hashtag symbol
        if 'http' in token or '&amp' in token:  # remove URLs and HTML entities
            continue
        # Non-alphabetic characters become spaces, so one raw token may split
        # into several alphabetic pieces (e.g. "plant-based" -> "plant", "based").
        cleaned = re.sub(r'[^a-zA-Z]', ' ', token).lower()
        for piece in cleaned.split():
            # remove words that have less than 3 characters, and stop words
            if len(piece) > 2 and piece not in stop_word_set:
                stems.append(porter.stem(piece))  # stemming
    return ''.join(stem + ' ' for stem in stems)


text['processed'] = [preprocess_tweet(tweet) for tweet in text['tweets']]

# exclude tweets that are not in English
# Stems that were observed only in non-English tweets; a tweet containing any
# of them is treated as non-English and removed.
non_english_list = ['temiz','rkiy','erik','nda','konu','dan','da','ba','temiz','al','viand','para','na','dann','uft','laboratorio','dieser','kalbimi',
                   'restoranda','evento','komo','ind','tica','futuro','sonra','yla','cre','ili','daki',
                   'zaman']
non_english_tokens = set(non_english_list)  # set: O(1) lookup, duplicates collapse

# A tweet is dropped when nothing survived pre-processing (empty string) or
# when it contains at least one marker stem.
drop_mask = [
    len(doc) == 0 or any(tok in non_english_tokens for tok in doc.split())
    for doc in text['processed']
]
# Collect index *labels* and drop by label. The previous
# `text.index[index_axis]` re-used labels as positions, which is only correct
# while the frame still has its untouched 0..n-1 RangeIndex.
index_axis = list(text.index[drop_mask])
text = text.drop(index=index_axis)
print("number of tweets after cleaning:",len(text))

number of tweets after cleaning: 6248


### Number of tweets from each company after pre-processing:

In [3]:
# Non-null tweets remaining per company, largest first.
tweets_per_company = text.groupby('Company')['tweets'].count()
tweets_per_company.sort_values(ascending=False)

Company
Memphis Meats       1545
biftek.co 🔬👩‍🔬🐄🥗    1429
Aleph Farms          704
Finless Foods        516
shiokmeats           463
SuperMeat            315
New Age Meats        306
BlueNalu             273
CUBIQ FOODS          243
Meatable             114
Mosa Meat            108
Wildtype              89
Future Fields         44
Vow                   36
FutureMeat            32
Balletic Foods        25
Mission Barns          2
LabFarmFoods           2
Avant Meats            2
Name: tweets, dtype: int64

## GuidedLDA

In [4]:
import guidedlda
import numpy as np
# Seed words for GuidedLDA: one inner list per seeded topic. The trailing
# comment on each list gives the topic id and its label.
# The entries are written in stemmed form (e.g. 'environ', 'greenhous')
# because they are looked up in the vocabulary built from the stemmed
# `processed` column.
# NOTE(review): several stems occur in more than one list (e.g. 'save',
# 'sustain', 'anim', 'free', 'safe', 'ceo'). The seed-assignment loop stores
# one topic id per word, so the last list containing a stem wins.
seed_topic_list = [['environ','carbon','greenhous','futur','emiss','better','planet','save','reduc',
                   'system','land','plastic','sustain','climat','environment','climatechang'], #0 Environmental Impact
                   ['plant','vegan','vegetarian','anim','altern','protein','plantbas'], #1 Vegetarian
                   ['without','harm','slaughter','cruelti','welfar','kill','cow','human','farm','save','free','anim'],#2 Animal Welfare
                   ['consumpt','popul','demand','secur','wast','grow','global','sustain','need',
                   'consum','eat','growth','feed','solut'],#3 Food Security
                   ['antibiot','antimicrobi','resist','contamin','safe','free','overus',
                   'metal','bacteria'],#4 Food Safety
                   ['regulatori','usda','fda','safe','joint','label','regul','clariti'],#5 Regulations
                   ['industri','suppli','convent','system','chain','compani',
                    'market','consum','store','groceri','product','demand','grow','launch','scale','cost','sell'],#6 Food Industry & Market
                   ['seafood','ocean','fish','aquacultur','shrimp','salmon','wild','lobster','tuna','sushi','crustacean'],#7 Seafood
                   ['chicken','duck','poultri','burger','steak','meatbal','beef','sausag',
                    'tast','tender','textur','juici','test','delici','tasti'],#8 Poultry and Meat
                   ['stem','muscl','divis','biolog','cellular','technolog','agricultur','biotech','tech'],#9 Process
                   ['seri','fund','rais','invest','dollar','pound','round','startup',
                   'opportun','first','seed','isra','acceler','announc','happi','thrill','excit','pleas','contribut','honor','proud','statement'],#10 Fundraising Announcement
                   ['thank','support','shoutout','help','question','mention'],#11 Appreciation
                   ['confer','regist','live','symposium','stream','livestream','summit','founder','talk','speak','ceo'], #12 Conference and Summit
                   ['hire','join','team','bring','welcom','work','appli','research','specialist','student','career','posit'],#13 Hiring Information
                   ['congrat','congratul','list','award','winner','finalist','partner','prize'],#14 Congratulations
                   ['check','post','paper','interview','articl','blog','news','break','read',
                   'latest','watch','podcast','listen','episod','stori','coverag','campaign','video','discuss',
                    'chat','report','ceo','media','present']]#15 Media

from sklearn.feature_extraction.text import CountVectorizer
# GuidedLDA with 19 topics: one per list in `seed_topic_list` (16) plus
# unseeded topics that are free to capture whatever the seeds do not.
model = guidedlda.GuidedLDA(n_topics=19,n_iter=1000,random_state=5,refresh=10,alpha=0.01,eta=0.01)#5 ok, try some other random seed
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text['processed'])  # sparse document-term count matrix

# `get_feature_names()` was removed from recent scikit-learn releases in
# favour of `get_feature_names_out()`; support both so the cell keeps running.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
word2id = {term: idx for idx, term in enumerate(vocab)}

# Map vocabulary id -> seeded topic id. A stem listed under several topics
# keeps the id of the last topic that lists it (plain dict overwrite).
seed_topics = {}
missing_seed_words = []
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        if word in word2id:
            seed_topics[word2id[word]] = t_id
        else:
            # A seed stem that never occurs in the corpus cannot be seeded;
            # report it rather than aborting with a bare KeyError.
            missing_seed_words.append((t_id, word))
if missing_seed_words:
    print('Seed words not found in the vocabulary (skipped):', missing_seed_words)

model.fit(X.toarray(),seed_topics=seed_topics,seed_confidence=0.6) # seed confidence is 0.6
topic_word = model.topic_word_  # one row of word weights per topic
n_top_words = 30
vocab = tuple(vocab)

# Print the `n_top_words` highest-weighted words of every topic as
# "weight*word" pairs, most probable first.
vocab_array = np.array(vocab)  # built once instead of once per printed word
for i, topic_dist in enumerate(topic_word): #Print out results
    print('\n')
    print('Topic:',i)
    # Sort each topic once. Slicing the reversed order also copes with a
    # vocabulary smaller than n_top_words instead of raising IndexError.
    top_ids = np.argsort(topic_dist)[::-1][:n_top_words]
    for term_id in top_ids:
        print(round(topic_dist[term_id],4),'*',vocab_array[term_id],sep='',end='  ')
        

INFO:guidedlda:n_documents: 6248
INFO:guidedlda:vocab_size: 7520
INFO:guidedlda:n_words: 74569
INFO:guidedlda:n_topics: 19
INFO:guidedlda:n_iter: 1000
INFO:guidedlda:<0> log likelihood: -1002371
INFO:guidedlda:<10> log likelihood: -622343
INFO:guidedlda:<20> log likelihood: -607303
INFO:guidedlda:<30> log likelihood: -600488
INFO:guidedlda:<40> log likelihood: -596588
INFO:guidedlda:<50> log likelihood: -593598
INFO:guidedlda:<60> log likelihood: -592134
INFO:guidedlda:<70> log likelihood: -590259
INFO:guidedlda:<80> log likelihood: -588339
INFO:guidedlda:<90> log likelihood: -587346
INFO:guidedlda:<100> log likelihood: -586629
INFO:guidedlda:<110> log likelihood: -585520
INFO:guidedlda:<120> log likelihood: -585061
INFO:guidedlda:<130> log likelihood: -585046
INFO:guidedlda:<140> log likelihood: -584076
INFO:guidedlda:<150> log likelihood: -583688
INFO:guidedlda:<160> log likelihood: -582997
INFO:guidedlda:<170> log likelihood: -583063
INFO:guidedlda:<180> log likelihood: -582738
INFO



Topic: 0
0.0276*meat  0.0266*chang  0.0253*food  0.0199*climat  0.0186*anim  0.0165*world  0.0123*peopl  0.0107*eat  0.0105*use  0.0102*planet  0.0088*could  0.0088*feed  0.0086*impact  0.0084*environ  0.0082*sustain  0.0079*environment  0.0077*reduc  0.0073*human  0.0071*need  0.0071*one  0.0065*product  0.0063*system  0.0061*way  0.0059*agricultur  0.0059*help  0.0057*save  0.0057*grow  0.0057*earth  0.0057*like  0.0057*fish  

Topic: 1
0.033*base  0.0316*plant  0.0266*meat  0.0181*protein  0.017*fat  0.017*burger  0.0146*food  0.0143*altern  0.014*vegan  0.014*year  0.0124*plantbas  0.011*product  0.0107*beyond  0.0107*market  0.0091*new  0.0088*anim  0.0082*industri  0.0082*billion  0.008*tast  0.0074*next  0.0074*sale  0.0071*report  0.0071*beef  0.0071*cubiq  0.0066*milk  0.0063*omega  0.006*consum  0.0058*smart  0.0058*egg  0.0058*oil  

Topic: 2
0.1071*meat  0.0615*anim  0.0353*without  0.0227*make  0.0215*slaughter  0.0183*real  0.017*eat  0.0148*cleanmeat  0.0146*futur  0.0

## Categorize Tweets according to their most salient topic

In [5]:
# Document-topic weights for every tweet; the dominant topic is the arg-max of each row.
doc_topic = model.transform(X)
topic_number = [row.argmax() for row in doc_topic]
number = ['1'] * len(doc_topic)  # dummy column, only there to be counted

data = pd.DataFrame({'topic_number': topic_number, 'number': number})
number_of_tweets = data.groupby('topic_number')[['number']].count()

# Share of all cleaned tweets that falls into each topic, formatted as a percentage string.
total_tweets = len(text)
proportions = []
for tweet_count in number_of_tweets['number']:
    proportions.append(str(round(tweet_count/total_tweets*100,2))+'%')
number_of_tweets['proportion'] = proportions
number_of_tweets

  if sparse and not np.issubdtype(doc_word.dtype, int):


Unnamed: 0_level_0,number,proportion
topic_number,Unnamed: 1_level_1,Unnamed: 2_level_1
0,422,6.75%
1,276,4.42%
2,419,6.71%
3,349,5.59%
4,167,2.67%
5,251,4.02%
6,367,5.87%
7,195,3.12%
8,445,7.12%
9,219,3.51%


## Prominent factor

In [6]:
# Prominent factor of a tweet = weight of its top topic / weight of its runner-up topic.
prominent_factor = []
for weights in doc_topic:
    ranked = sorted(weights)
    prominent_factor.append(ranked[-1]/ranked[-2])

# Count the tweets whose top topic is at least 1.4x as heavy as the runner-up.
count = sum(1 for factor in prominent_factor if factor >= 1.4)
share = round(count/len(text)*100,2)
print('There are ',share,'% of tweets with prominent factor higher or equal to 1.4',sep='')

There are 78.94% of tweets with prominent factor higher or equal to 1.4


## Topic-Author Analysis

In [7]:
doc_topic = model.transform(X)
# Dominant (highest-weight) topic of every tweet, in the row order of `text`.
topic_number1 = [row.argmax() for row in doc_topic]
text['topic number'] = topic_number1

# Display label of each topic, indexed by topic id (16 seeded + 3 unseeded).
topic = ['Environmental Impact','Vegetarian and Alternative Protein','Animal Welfare',
        'Food Security', 'Food Safety', 'Regulations','Food Industry and Market',
         'Seafood','Poultry and Meat','Process','Announcement(fundraising)',
        'Appreciation','Conference and Summit','Hiring Information','Congratulations',
        'Media','Unseeded Topic 1','Unseeded Topic 2','Unseeded Topic 3']

# Raw `Company` value -> display label, listed in heat-map column order.
# Keeping both in one mapping guarantees the labels cannot drift out of step
# with the column order.
company_labels = {
    'Memphis Meats': 'Memphis Meats',
    'biftek.co 🔬👩‍🔬🐄🥗': 'Biftek.co',
    'Aleph Farms': 'Aleph Farms',
    'SuperMeat': 'SuperMeat',
    'Finless Foods': 'Finless Foods',
    'shiokmeats': 'Shiok Meats',
    'BlueNalu': 'BlueNalu',
    'New Age Meats': 'New Age Meats',
    'CUBIQ FOODS': 'Cubiq Foods',
    'Mosa Meat': 'Mosa Meat',
    'Wildtype': 'Wild Type',
    'Meatable': 'Meatable',
    'Future Fields': 'Future Fields',
    'Vow': 'Vow',
    'FutureMeat': 'Future Meat',
    'Balletic Foods': 'Balletic Foods',
    'LabFarmFoods': 'Lab Farm Foods',
    'Avant Meats': 'Avant Meats',
    'Mission Barns': 'Mission Barns',
}
company = list(company_labels.values())

# Tweets per (topic, company), pivoted to a topic x company matrix.
topic_author = text.groupby(['topic number','Company'])['tweets'].count()
topic_author_3d = topic_author.unstack()
# Reindex instead of selecting: a topic or company with no tweets becomes an
# all-NaN row/column rather than a missing row (misaligned tick labels) or a
# KeyError.
topic_author_3d = topic_author_3d.reindex(index=range(len(topic)),
                                          columns=list(company_labels))

  if sparse and not np.issubdtype(doc_word.dtype, int):


## Heatmap based on proportion of each topic for each company

In [8]:
# A (topic, company) pair with no tweets is NaN after the pivot; treat it as a zero count.
topic_author_3d = topic_author_3d.fillna(value=0)

In [9]:
# Normalise each company column by its total so every column sums to 1.
tweets_per_company_total = topic_author_3d.sum(axis=0)
topic_author_3d.div(tweets_per_company_total, axis='columns')

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Company,Memphis Meats,biftek.co 🔬👩‍🔬🐄🥗,Aleph Farms,SuperMeat,Finless Foods,shiokmeats,BlueNalu,New Age Meats,CUBIQ FOODS,Mosa Meat,Wildtype,Meatable,Future Fields,Vow,FutureMeat,Balletic Foods,LabFarmFoods,Avant Meats,Mission Barns
topic number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,0.066667,0.09937,0.048295,0.114286,0.081395,0.028078,0.051282,0.009804,0.061728,0.064815,0.022472,0.070175,0.022727,0.027778,0.03125,0.0,0.0,0.0,0.0
1,0.009061,0.107768,0.015625,0.009524,0.021318,0.008639,0.018315,0.003268,0.26749,0.018519,0.0,0.017544,0.0,0.027778,0.0625,0.04,0.0,0.0,0.0
2,0.116505,0.043387,0.041193,0.27619,0.021318,0.015119,0.021978,0.058824,0.016461,0.009259,0.022472,0.087719,0.022727,0.0,0.03125,0.0,0.0,0.0,0.0
3,0.040777,0.055983,0.039773,0.12381,0.034884,0.12095,0.062271,0.081699,0.057613,0.018519,0.0,0.008772,0.090909,0.027778,0.0,0.04,0.0,0.0,0.0
4,0.013592,0.062281,0.011364,0.022222,0.023256,0.023758,0.003663,0.006536,0.041152,0.009259,0.0,0.04386,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.060841,0.031491,0.009943,0.057143,0.044574,0.017279,0.014652,0.065359,0.004115,0.046296,0.05618,0.105263,0.0,0.083333,0.0625,0.12,0.5,0.0,0.0
6,0.023301,0.088873,0.103693,0.003175,0.042636,0.045356,0.131868,0.029412,0.069959,0.037037,0.067416,0.078947,0.068182,0.0,0.0625,0.0,0.0,0.5,0.0
7,0.008414,0.014696,0.004261,0.0,0.108527,0.032397,0.18315,0.003268,0.012346,0.009259,0.359551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.061489,0.058782,0.294034,0.069841,0.011628,0.010799,0.003663,0.009804,0.032922,0.037037,0.0,0.017544,0.022727,0.027778,0.15625,0.04,0.0,0.0,0.0
9,0.021359,0.045486,0.06108,0.022222,0.044574,0.023758,0.007326,0.013072,0.057613,0.027778,0.022472,0.026316,0.113636,0.0,0.09375,0.04,0.0,0.0,0.0


In [10]:
import matplotlib.pyplot as plt
# Heat-map of the column-normalised topic x company matrix
# (rows = topics on the y axis, columns = companies on the x axis).
proportions = topic_author_3d.div(topic_author_3d.sum(axis=0),axis=1)
n_topic_rows, n_company_cols = proportions.shape

plt.figure(figsize=(10,8))
plt.imshow(proportions,cmap="Blues")
plt.colorbar().ax.set_ylabel('proportion of each topic for each company')
# The x ticks must follow the number of columns (companies). The previous
# len(topic_author_3d) is the row count and only worked because the matrix
# happened to be square.
plt.xticks(range(n_company_cols), company,rotation=90)
plt.yticks(range(n_topic_rows), topic)
plt.xlabel('Company')
plt.ylabel('Topic')
plt.show()

<Figure size 1000x800 with 2 Axes>

## PyLDAvis Visualization

In [None]:
import pyLDAvis
import pyLDAvis.sklearn
# pyLDAvis re-orders topics by prevalence unless told otherwise, which makes
# its topic numbers disagree with the GuidedLDA topic ids. sort_topics=False
# keeps the model's own topic order.
# NOTE(review): pyLDAvis labels topics from 1, so GuidedLDA topic k should
# appear as topic k+1 in the visualisation; confirm against the rendered output.
data = pyLDAvis.sklearn.prepare(model, X, vectorizer, sort_topics=False)

pyLDAvis.display(data)

  if sparse and not np.issubdtype(doc_word.dtype, int):


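### This visualization does not seem to match the guidedLDA results: the topic indices shown here differ from the topic indices in guidedLDA. (By default pyLDAvis re-orders topics by prevalence and numbers them from 1; passing `sort_topics=False` to `prepare` keeps the model's original topic order.)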


## Traditional LDA

In [None]:
#unigram
from gensim.models import CoherenceModel
# Tokenise every processed tweet into its list of unigrams.
unigram = []
unigram_list = []
for processed_doc in text['processed']:
    unigram = processed_doc.split()
    unigram_list.append(unigram)

# gensim inputs: token lists, the token<->id dictionary, and the bag-of-words corpus.
data_words = list(unigram_list)
id2word = corpora.Dictionary(data_words)
texts = data_words
corpus = [id2word.doc2bow(tokens) for tokens in texts]
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """Train one LDA model per topic count and score each with u_mass coherence.

    Parameters
    ----------
    dictionary : gensim dictionary mapping token ids to tokens; used both to
        train each model and to score it.
    corpus : bag-of-words corpus the models are trained on.
    texts : tokenised documents passed to the coherence model.
    limit, start, step : the topic counts tried are ``range(start, limit, step)``
        (``limit`` itself is excluded).

    Returns
    -------
    (model_list, coherence_values) : two parallel lists, one entry per topic
        count, in the order the counts were tried.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start,limit,step):
        # Use the `dictionary` argument. The previous code read the global
        # `id2word`, silently ignoring whatever dictionary the caller passed.
        model=gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,random_state=3)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
# Coherence sweep: score LDA models for every topic count in [start, limit).
start, limit, step = 10, 21, 1
model_list, coherence_values = compute_coherence_values(
    dictionary=id2word, corpus=corpus, texts=texts, start=start, limit=limit, step=step)

# Plot the coherence score against the number of topics.
import matplotlib.pyplot as plt
x = range(start, limit, step)
plt.plot(x, coherence_values, label='All tweets')
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')

In [None]:
from pprint import pprint
# Train the final unguided LDA model with 11 topics and show its 30 top words per topic.
# NOTE(review): this model uses random_state=0 and alpha='auto', whereas the
# coherence sweep used random_state=3 with the default alpha - confirm the
# chosen topic count still holds under these settings.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=11,
    random_state=0,
    alpha='auto',
    per_word_topics=True,
)
top_terms_per_topic = lda_model.print_topics(num_words=30)
pprint(top_terms_per_topic)

In [None]:
def check(word):
    """Print whether `word` is contained in the module-level `stop_words` collection."""
    verdict = 'is a stopword' if word in stop_words else 'is not a stopword'
    print(word, verdict)

In [None]:
# Report whether 'would' is in `stop_words` (the NLTK English stop-word list loaded during pre-processing).
check('would')