In [1]:
# all imports here
import networkx as nx
import matplotlib.pyplot as plt
import os
import json
import pandas as pd
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:

all_text_data = []

stop_words = set(stopwords.words('english'))


def process_tweet(tweet):
    # Split the tweet into words
    words = str(tweet).split()
    
    # List comprehension to filter out stop words and short words
    filtered_words = [
        word for word in words 
        if word.lower() not in stop_words 
        and len(word) > 2 
        and not word.startswith('http')
        and word.isnumeric() == False
    ]
    
    # Join the filtered words back into a string
    processed = ' '.join(filtered_words)
    
    return processed

In [3]:
 

base_directory = "CrisisLexT26-v1.0\CrisisLexT26"

text_data=pd.DataFrame()

for folder_name in os.listdir(base_directory):
    folder_path = os.path.join(base_directory, folder_name)
    if os.path.isdir(folder_path):
        json_file_path = os.path.join(folder_path, f"{folder_name}-event_description.json")
        
        #get event description. compare accuracy later
        with open(json_file_path, 'r', encoding='utf-8') as f:
            event_data = json.load(f)
        
        
        tweets_csv_path = os.path.join(folder_path, f"{folder_name}-tweets_labeled.csv")
                
        df = pd.read_csv(tweets_csv_path)
                
        # Filter relevant tweets
        df = df[df[' Informativeness'].isin(['Related - but not informative', 'Related and informative'])]

        text_data = df[' Tweet Text'].apply(process_tweet).values

        all_text_data.extend(text_data)





In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from nltk.tokenize import word_tokenize
from gensim.models import LdaModel
      

tokens = [word_tokenize(doc) for doc in all_text_data]
tokens = [[word for word in doc if word not in stop_words] for doc in tokens]

# Create Gensim dictionary
dictionary = Dictionary(tokens)

# Convert your tokenized texts to a bag-of-words format
corpus = [dictionary.doc2bow(text) for text in tokens]

vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, stop_words='english', ngram_range=(1, 2))
text_tfidf = vectorizer.fit_transform(all_text_data)


lda_gensim = LdaModel(corpus, num_topics=26, id2word=dictionary, passes=15)

lda = LatentDirichletAllocation(random_state=42)
lda.fit(text_tfidf)

# Print topics
for index, topic in lda_gensim.print_topics(-1):
    print(f"Topic #{index + 1}")
    print(topic)



Topic #1
0.312*"West" + 0.108*"building" + 0.048*"say" + 0.027*"officials" + 0.017*"stop" + 0.015*"Lord" + 0.014*"across" + 0.011*"Sunday" + 0.010*"Friday" + 0.010*"Times"
Topic #2
0.081*"video" + 0.050*"deadly" + 0.043*"survivors" + 0.031*"North" + 0.025*"Latest" + 0.017*"weather" + 0.017*"YouTube" + 0.016*"LIVE" + 0.014*"Disaster" + 0.013*"Mayor"
Topic #3
0.079*"“" + 0.056*"AP" + 0.043*"”" + 0.034*"police" + 0.029*"man" + 0.025*"time" + 0.024*"VIDEO" + 0.021*"UPDATE" + 0.021*"morning" + 0.016*"hope"
Topic #4
0.078*"help" + 0.039*"need" + 0.037*"relief" + 0.034*"Death" + 0.032*"know" + 0.029*"may" + 0.026*"see" + 0.025*"please" + 0.022*"$" + 0.019*"still"
Topic #5
0.212*"?" + 0.075*"injured" + 0.065*"affected" + 0.049*"week" + 0.031*"tragedy" + 0.024*"City" + 0.023*"town" + 0.017*"people" + 0.017*"today" + 0.015*"back"
Topic #6
0.060*"Video" + 0.033*"cause" + 0.027*"Photo" + 0.027*"storm" + 0.026*"youtube" + 0.022*"Que" + 0.020*"--" + 0.019*"via" + 0.016*"change" + 0.015*"HuffingtonPo

In [5]:
from sklearn.decomposition import NMF

n_topics = 26

# Apply NMF
nmf = NMF(n_components=n_topics, random_state=42, l1_ratio=.5).fit(text_tfidf)

# Print the topics
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf.components_):
    print(f"Topic #{topic_idx + 1}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]))



Topic #1:
derailment train derailment metro north metro north train nyc nyc train dead north train
Topic #2:
costa costa rica rica terremoto costa tsunami terremoto alerta alerta tsunami quake hits costa
Topic #3:
bangladesh collapse building building collapse bangladesh building factory factory collapse bangladesh factory garment collapse bangladesh
Topic #4:
brazil nightclub nightclub brazil praying affected sad hear sad brazil tragedy tragedy praying hear nightclub ddlovato
Topic #5:
helicopter glasgow helicopter crash crash pub glasgow helicopter clutha police crash glasgow police helicopter
Topic #6:
typhoon haiyan philippines typhoon haiyan bopha typhoon bopha pablo typhoon pablo super super typhoon
Topic #7:
tren santiago accidente compostela santiago compostela accidente tren españa del tren santiago tren españa
Topic #8:
colorado flooding springs colorado springs floods colorado floods wildfire colorado flooding wildfires colorado wildfire
Topic #9:
texas explosion plant ferti

In [6]:
from gensim.models.coherencemodel import CoherenceModel

# Extract top words for each NMF topic
top_words = []
feature_names = vectorizer.get_feature_names_out()
for topic in nmf.components_:
    top_word_indices = topic.argsort()[-10:][::-1]  # You can change 10 to any other number
    top_words.append([feature_names[i] for i in top_word_indices])

# Compute coherence for NMF topics
texts = tokens
nmf_topics = [[dictionary.token2id[word] for word in topic if word in dictionary.token2id] for topic in top_words]

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_gensim, texts=tokens, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score for LDA: ', coherence_lda)

coherence_model_nmf = CoherenceModel(topics=nmf_topics, texts=tokens, dictionary=dictionary, coherence='c_v')
coherence_nmf = coherence_model_nmf.get_coherence()
print('\nCoherence Score for NMF: ', coherence_nmf)



Coherence Score for LDA:  0.5055515340333058

Coherence Score for NMF:  0.5557482579334588
