In [1]:
import os
import json
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import itertools

def apply_keygraph(text_data, min_df=5):
    """Build a weighted term co-occurrence graph from a collection of documents.

    Terms are extracted with ``CountVectorizer``. Two distinct terms are linked
    when they occur together in at least one document; the edge weight is the
    matching entry of ``X.T @ X`` (``X`` being the document-term count matrix),
    i.e. the sum over documents of the product of the two terms' counts.

    Parameters
    ----------
    text_data : iterable of str
        One string per document.
    min_df : int or float, default 5
        Forwarded to ``CountVectorizer(min_df=...)``.

    Returns
    -------
    networkx.Graph
        Undirected graph whose nodes are terms and whose edges carry an
        integer ``weight`` attribute. Terms that never co-occur with another
        term are not added as nodes. An empty graph is returned when
        ``text_data`` contains no documents.

    Raises
    ------
    ValueError
        Propagated from ``CountVectorizer.fit_transform`` (for example when
        no term survives the ``min_df`` pruning).
    """
    documents = list(text_data)
    co_occurrence_graph = nx.Graph()
    if not documents:
        return co_occurrence_graph

    vectorizer = CountVectorizer(min_df=min_df)
    # Keep the matrix sparse: densifying it makes the product below a full
    # vocabulary x vocabulary array, which does not scale to a large corpus.
    term_document_matrix = vectorizer.fit_transform(documents)
    features = vectorizer.get_feature_names_out()

    # Term-by-term co-occurrence counts; sorted indices give a deterministic
    # row-major iteration order (and therefore node insertion order).
    co_occurrences = (term_document_matrix.T @ term_document_matrix).tocsr()
    co_occurrences.sort_indices()
    pairs = co_occurrences.tocoo()

    for i, j, weight in zip(pairs.row, pairs.col, pairs.data):
        # The matrix is symmetric: visiting only i < j skips the diagonal
        # (self-loops) and avoids adding every undirected edge twice.
        if i < j and weight > 0:
            co_occurrence_graph.add_edge(features[i], features[j], weight=int(weight))

    return co_occurrence_graph

# Collect the text of every crisis-related tweet across all event folders.
all_text_data = []

# Joined with os.path.join so the path works on every OS and no backslash
# ends up inside a regular string literal.
base_directory = os.path.join("CrisisLexT26-v1.0", "CrisisLexT26")

# Informativeness labels that mark a tweet as related to the event.
related_labels = ['Related - but not informative', 'Related and informative']

# sorted() makes the folder order (and so the document order) deterministic;
# os.listdir returns entries in arbitrary order.
for folder_name in sorted(os.listdir(base_directory)):
    folder_path = os.path.join(base_directory, folder_name)
    if not os.path.isdir(folder_path):
        continue

    # Each event folder is expected to hold "<folder>-tweets_labeled.csv".
    tweets_csv_path = os.path.join(folder_path, f"{folder_name}-tweets_labeled.csv")
    tweets = pd.read_csv(tweets_csv_path)

    # The column names are used with a leading space, as they are read from
    # the CSV header.
    related_tweets = tweets[tweets[' Informativeness'].isin(related_labels)]

    # Drop missing tweet text: NaN entries are not valid documents for the
    # vectorizer used in apply_keygraph.
    all_text_data.extend(related_tweets[' Tweet Text'].dropna().astype(str))

# Build the co-occurrence graph over all collected tweets.
G = apply_keygraph(all_text_data)

# Draw the graph; a fixed seed keeps the spring layout reproducible across runs.
plt.figure(figsize=(10, 6))
pos = nx.spring_layout(G, seed=42)
nx.draw_networkx_nodes(G, pos, node_size=3000)
nx.draw_networkx_edges(G, pos)
nx.draw_networkx_labels(G, pos, font_size=10)
plt.title("Keygraph based on Co-occurrence of Words")
plt.show()


In [1]:
# Depends on `apply_keygraph` and `all_text_data` from the cell above: that cell
# must have been executed in the current kernel session, otherwise this line
# raises NameError.
# NOTE(review): this repeats the call whose result is already stored in `G`
# above — confirm a second graph build is actually needed.
topics_or_keywords = apply_keygraph(all_text_data)


NameError: name 'apply_keygraph' is not defined