# Load libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!pip install bertopic
from bertopic import BERTopic

In [None]:
# from google.colab import drive
# import os
# drive.mount('/content/drive')

# BERTopic

In [5]:
# Read combined_rumor_target.csv; the first row holds the column names and the
# first column is used as the DataFrame index.
combined_data = pd.read_csv(
    "combined_rumor_target.csv",
    encoding="utf-8",
    header=0,
    index_col=0,
)

## Rumor Source tweets

[report](https://docs.google.com/spreadsheets/d/1iWx_pUEnssY3wUK11iJSiqdn7iZQAGxn0dofwTdludY/edit?usp=sharing)

In [6]:
# Keep the rows flagged as source tweets (src_reply == 'src'), then show the
# number of missing values in each column.
src = combined_data.loc[combined_data["src_reply"] == "src"]
src.isna().sum()

tweet_id        0
user_id         0
tweet           0
cleaned_text    0
targetlabel     0
date            0
rumorlabel      0
sentiment       0
src_reply       0
dtype: int64

In [10]:
# Split the cleaned source-tweet text into two plain Python lists by targetlabel
# (1 -> target_docs, 0 -> nontarget_docs).
target_docs = src.loc[src["targetlabel"] == 1, "cleaned_text"].tolist()
nontarget_docs = src.loc[src["targetlabel"] == 0, "cleaned_text"].tolist()

In [11]:
# Report how many documents fall into each group.
print(f"# of target tweets: {len(target_docs)}")
print(f"# of non-target tweets: {len(nontarget_docs)}")

# of target tweets: 36
# of non-target tweets: 408


In [16]:
# Topic model used for the source tweets; nr_topics=10 asks BERTopic to reduce
# the result to at most 10 topics.
topic_model = BERTopic(
    language="english",
    nr_topics=10,
    calculate_probabilities=True,
    verbose=True,
)

### Topics of target tweets

In [17]:
# Fit the topic model on the target source tweets, keeping the returned topic
# assignments and (because calculate_probabilities=True) topic probabilities.
topics, probs = topic_model.fit_transform(target_docs)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2022-12-22 15:10:59,619 - BERTopic - Transformed documents to Embeddings
2022-12-22 15:11:03,372 - BERTopic - Reduced dimensionality
2022-12-22 15:11:03,392 - BERTopic - Clustered reduced embeddings
2022-12-22 15:11:03,563 - BERTopic - Reduced number of topics from 3 to 3


In [19]:
# Topic sizes, ordered by topic id.
topic_model.get_topic_freq().sort_values(by="Topic")

Unnamed: 0,Topic,Count
2,-1,10
0,0,14
1,1,12


In [21]:
# Build `topiclist`: one inner list of top words per topic, for up to the first
# 2 topics.  get_topics() is used here as {topic_id: [(word, score), ...]}.
topic_words = topic_model.get_topics()
# Read the topic ids from the fitted model rather than assuming 0..N-1 exist:
# the fit is stochastic, and a run that produces fewer topics would otherwise
# raise KeyError.  Topic -1 (listed separately in the frequency table) is
# skipped, just as range(2) skipped it.
topic_ids = sorted(topic_id for topic_id in topic_words if topic_id != -1)[:2]
# Slicing caps each topic at 10 words without failing on a shorter word list.
topiclist = [
    [entry[0] for entry in topic_words[topic_id][:10]]
    for topic_id in topic_ids
]

In [22]:
# One row per topic, one column per top-word rank (built from topiclist).
df = pd.DataFrame(topiclist)

In [23]:
# Display the topic/top-word table for the target source tweets.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,suspect,french,police,martyr,say,die,want,tell,hebdo,charlie
1,akbar,charliehebdo,shout,allahu,gunmen,attack,video,witness,report,shoot


In [None]:
# Bar charts of the top 10 words for up to 10 topics of the current fit.
fig = topic_model.visualize_barchart(
    top_n_topics=10,
    n_words=10,
)
fig

### Topics of non-target tweets

In [24]:
# Refit the same topic_model instance, this time on the non-target source
# tweets; `topics` and `probs` are overwritten with the new results.
# NOTE(review): after this call topic_model reflects this fit only, so the
# target-tweet topics can no longer be queried from it — confirm that is intended.
topics, probs = topic_model.fit_transform(nontarget_docs)

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

2022-12-22 15:17:24,282 - BERTopic - Transformed documents to Embeddings
2022-12-22 15:17:27,740 - BERTopic - Reduced dimensionality
2022-12-22 15:17:27,782 - BERTopic - Clustered reduced embeddings
2022-12-22 15:17:27,880 - BERTopic - Reduced number of topics from 2 to 2


In [25]:
# Topic sizes for the non-target fit, ordered by topic id.
topic_model.get_topic_freq().sort_values(by="Topic")

Unnamed: 0,Topic,Count
1,0,27
0,1,381


In [27]:
# Build `topiclist`: one inner list of top words per topic, for up to the first
# 2 topics.  get_topics() is used here as {topic_id: [(word, score), ...]}.
topic_words = topic_model.get_topics()
# Read the topic ids from the fitted model rather than assuming 0..N-1 exist:
# the fit is stochastic, and a run that produces fewer topics would otherwise
# raise KeyError.  Topic -1, if present, is skipped just as range(2) skipped it.
topic_ids = sorted(topic_id for topic_id in topic_words if topic_id != -1)[:2]
# Slicing caps each topic at 10 words without failing on a shorter word list.
topiclist = [
    [entry[0] for entry in topic_words[topic_id][:10]]
    for topic_id in topic_ids
]

In [28]:
# One row per topic, one column per top-word rank (built from topiclist).
nontarget_df = pd.DataFrame(topiclist)

In [29]:
# Display the topic/top-word table for the non-target source tweets.
nontarget_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,banksy,jesuischarlie,banksys,charliehebdo,powerful,isnt,tribute,response,not,attack
1,paris,charliehebdo,police,suspect,break,attack,french,kill,shoot,report


In [30]:
# Bar charts of the top 10 words for up to 10 topics of the non-target fit.
nontarget_fig = topic_model.visualize_barchart(
    top_n_topics=10,
    n_words=10,
)
nontarget_fig

## Rumor Reply tweets 

[report](https://docs.google.com/spreadsheets/d/1iWx_pUEnssY3wUK11iJSiqdn7iZQAGxn0dofwTdludY/edit?usp=sharing)

In [31]:
# Keep the rows flagged as reply tweets (src_reply == 'reply'), then show the
# number of missing values in each column.
reply = combined_data.loc[combined_data["src_reply"] == "reply"]
reply.isna().sum()

tweet_id        0
user_id         0
tweet           0
cleaned_text    0
targetlabel     0
date            0
rumorlabel      0
sentiment       0
src_reply       0
dtype: int64

In [32]:
# Split the cleaned reply text into two plain Python lists by targetlabel
# (1 -> target_docs, 0 -> nontarget_docs); this rebinds the names used for the
# source tweets earlier.
target_docs = reply.loc[reply["targetlabel"] == 1, "cleaned_text"].tolist()
nontarget_docs = reply.loc[reply["targetlabel"] == 0, "cleaned_text"].tolist()

In [33]:
# Report how many reply documents fall into each group.
print(f"# of target tweets: {len(target_docs)}")
print(f"# of non-target tweets: {len(nontarget_docs)}")

# of target tweets: 2361
# of non-target tweets: 3818


In [34]:
# NOTE(review): BERTopic is already imported in the install cell at the top of
# the notebook; this repeat import is redundant but harmless.
from bertopic import BERTopic

# Fresh topic model for the reply tweets; nr_topics="auto" lets BERTopic decide
# how far to reduce the number of topics.
topic_model = BERTopic(
    language="english",
    nr_topics="auto",
    calculate_probabilities=True,
    verbose=True,
)

### Topics of target tweets

In [35]:
# Fit the topic model on the target reply tweets, keeping the returned topic
# assignments and (because calculate_probabilities=True) topic probabilities.
topics, probs = topic_model.fit_transform(target_docs)

Batches:   0%|          | 0/74 [00:00<?, ?it/s]

2022-12-22 15:22:37,303 - BERTopic - Transformed documents to Embeddings
2022-12-22 15:22:51,901 - BERTopic - Reduced dimensionality
2022-12-22 15:22:52,368 - BERTopic - Clustered reduced embeddings
2022-12-22 15:22:54,046 - BERTopic - Reduced number of topics from 42 to 13


In [36]:
# Topic sizes for the target-reply fit, ordered by topic id.
topic_model.get_topic_freq().sort_values(by="Topic")

Unnamed: 0,Topic,Count
0,-1,900
1,0,900
2,1,194
3,2,112
4,3,87
5,4,33
6,5,31
7,6,26
8,7,24
9,8,18


In [37]:
# Build `topiclist`: one inner list of top words per topic, for up to the first
# 12 topics.  get_topics() is used here as {topic_id: [(word, score), ...]}.
topic_words = topic_model.get_topics()
# Read the topic ids from the fitted model rather than assuming 0..11 exist:
# nr_topics="auto" plus a stochastic fit means the topic count varies per run,
# and a run with fewer topics would otherwise raise KeyError.  Topic -1 (listed
# separately in the frequency table) is skipped, just as range(12) skipped it.
topic_ids = sorted(topic_id for topic_id in topic_words if topic_id != -1)[:12]
# Slicing caps each topic at 10 words without failing on a shorter word list.
topiclist = [
    [entry[0] for entry in topic_words[topic_id][:10]]
    for topic_id in topic_ids
]

In [38]:
# One row per topic, one column per top-word rank (built from topiclist);
# rebinds the `df` created for the source tweets.
df = pd.DataFrame(topiclist)

In [39]:
# Display the topic/top-word table for the target reply tweets.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,not,kill,religion,people,muslims,islam,no,terrorist,peace,say
1,france,french,le,police,de,paris,les,la,pas,je
2,israel,jews,gaza,never,hamas,kill,not,people,anti,dont
3,charliehebdo,hebdo,charlie,attack,paris,police,dead,french,mt,satirical
4,que,de,la,por,el,se,como,los,en,lo
5,tweet,twitter,understand,fake,victim,not,check,mam,reply,information
6,kosher,supermarket,store,grocery,jewish,synagogue,market,mean,bury,jesuisjuif
7,trkiyede,ss,kii,kaide,koavi,tmgeneral,var,israilli,aviv,el
8,banksy,banksys,account,gogh,lucille,clerc,fan,van,not,draw
9,het,niet,dit,ik,dat,belga,zou,bij,uit,toch


In [40]:
# Bar charts of the top 10 words for up to 10 topics of the target-reply fit.
fig = topic_model.visualize_barchart(
    top_n_topics=10,
    n_words=10,
)
fig

### Topics of non-target tweets

In [41]:
# Refit the same topic_model instance, this time on the non-target reply
# tweets; `topics` and `probs` are overwritten with the new results.
# NOTE(review): after this call topic_model reflects this fit only, so the
# target-reply topics can no longer be queried from it — confirm that is intended.
topics, probs = topic_model.fit_transform(nontarget_docs)

Batches:   0%|          | 0/120 [00:00<?, ?it/s]

2022-12-22 15:30:02,566 - BERTopic - Transformed documents to Embeddings
2022-12-22 15:30:27,836 - BERTopic - Reduced dimensionality
2022-12-22 15:30:29,410 - BERTopic - Clustered reduced embeddings
2022-12-22 15:30:33,999 - BERTopic - Reduced number of topics from 90 to 61


In [42]:
# Sizes of the first 11 topics by topic id for the non-target-reply fit.
topic_model.get_topic_freq().sort_values(by="Topic").head(11)

Unnamed: 0,Topic,Count
0,-1,1439
1,0,755
2,1,117
3,2,96
4,3,83
5,4,60
6,5,51
7,6,48
8,7,46
9,8,40


In [43]:
# Build `topiclist`: one inner list of top words per topic, for up to the first
# 10 topics.  get_topics() is used here as {topic_id: [(word, score), ...]}.
topic_words = topic_model.get_topics()
# Read the topic ids from the fitted model rather than assuming 0..9 exist:
# nr_topics="auto" plus a stochastic fit means the topic count varies per run,
# and a run with fewer topics would otherwise raise KeyError.  Topic -1 (listed
# separately in the frequency table) is skipped, just as range(10) skipped it.
topic_ids = sorted(topic_id for topic_id in topic_words if topic_id != -1)[:10]
# Slicing caps each topic at 10 words without failing on a shorter word list.
topiclist = [
    [entry[0] for entry in topic_words[topic_id][:10]]
    for topic_id in topic_ids
]

In [44]:
# One row per topic, one column per top-word rank (built from topiclist);
# rebinds the `nontarget_df` created for the source tweets.
nontarget_df = pd.DataFrame(topiclist)

In [45]:
# Display the topic/top-word table for the non-target reply tweets.
nontarget_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,paris,french,france,charliehebdo,break,tweet,news,charlie,kill,police
1,islam,muslim,muslims,allah,nothing,islamic,muhammad,im,billion,know
2,hostages,hostage,pray,women,hope,two,situations,terrorists,storm,news
3,dont,say,doubt,word,understand,get,offend,not,relevance,no
4,religion,peace,strike,peaceful,god,prophet,piece,agenda,facts,believe
5,irony,think,ignorant,brainwash,process,talk,mean,ironic,morons,lot
6,gun,police,cop,policeman,officer,man,policemen,shoot,carry,policewoman
7,oh,wow,smh,dear,wth,hi,kk,duhh,damm,gh
8,safe,fear,stay,crisis,tragedy,missile,cuban,afraid,necessary,risk
9,source,please,follow,oblige,nicki,understatement,sourceon,philly,stinkin,followfriday


In [46]:
# Bar charts of the top 10 words for up to 10 topics of the non-target-reply fit.
nontarget_fig = topic_model.visualize_barchart(
    top_n_topics=10,
    n_words=10,
)
nontarget_fig