<a href="https://colab.research.google.com/github/vinniedean/collab/blob/main/SFO_Art_Audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/openai/whisper.git


In [None]:
import whisper

# Load the model, for example, the "small" model
model = whisper.load_model("small")


In [None]:
# Specify the path to your audio file
audio_path = "/content/International_Main_Hall_4.m4a"

# Transcribe the audio
result = model.transcribe(audio_path)

# Print the transcription
print(result["text"])


In [None]:
!pip install pymupdf nltk gensim matplotlib seaborn


In [None]:
import fitz  # PyMuPDF
import nltk
import gensim
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')


In [None]:
from google.colab import drive
drive.mount('/content/drive')


In [9]:
pdf_path = '/content/Rupert Garcia.pdf'  # Update this path
doc = fitz.open(pdf_path)
text = ""
for page in doc:
    text += page.get_text()


In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('wordnet')

# Preprocess function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

cleaned_text = preprocess_text(text)


In [None]:
from gensim import corpora, models

# Tokenize cleaned text
tokens = [word for word in cleaned_text.split()]

# Create a dictionary and corpus for topic modeling
dictionary = corpora.Dictionary([tokens])
corpus = [dictionary.doc2bow(token) for token in [tokens]]

# Using LDA for topic modeling
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Print topics
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)


In [None]:
from collections import Counter
import seaborn as sns

word_counts = Counter(tokens)
most_common_words = word_counts.most_common(10)

words = [word[0] for word in most_common_words]
counts = [word[1] for word in most_common_words]

plt.figure(figsize=(10, 5))
sns.barplot(x=words, y=counts)
plt.title('Most Common Words')
plt.show()


In [None]:
!pip install networkx matplotlib


In [14]:
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations
from collections import Counter


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

# Assuming `cleaned_text` is your preprocessed text data
sentences = sent_tokenize(cleaned_text)
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]


In [16]:
# Define the window size (sentence-wise co-occurrence)
window_size = 2  # This is just illustrative; we're using sentences, so it's not used here

# Create a list of all word pairs within the same sentence
word_pairs = []
for sentence in tokenized_sentences:
    # Using combinations to create pairs of words in the same sentence
    word_pairs.extend(list(combinations(sentence, 2)))

# Count the occurrences of each pair to understand the strength of connections
pair_counts = Counter(word_pairs)


In [17]:
# Initialize a graph
G = nx.Graph()

# Add edges between words with weights
for pair, count in pair_counts.items():
    word1, word2 = pair
    G.add_edge(word1, word2, weight=count)


In [None]:
plt.figure(figsize=(12, 12))

# Layout for our nodes
pos = nx.spring_layout(G, k=0.1)

# Drawing the graph
nx.draw(G, pos, with_labels=True, node_color='skyblue', edge_color='k', linewidths=1, font_size=20, node_size=10, alpha=0.7)

# Considering edge weights
edge_weights = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_weights)

plt.title("Word Co-occurrence Network")
plt.show()


In [26]:
# Calculate degree centrality
degrees = dict(G.degree())

# Sort nodes by degree centrality
sorted_degrees = sorted(degrees.items(), key=lambda x: x[1], reverse=True)

# Let's say we're interested in the top 5 nodes
key_nodes = [node[0] for node in sorted_degrees[:5]]
print("Key nodes:", key_nodes)


Key nodes: ['bird', 'technology', 'work', 'symbolizes', 'natural']


In [27]:
# Choose a key node. For demonstration, let's use the first key node identified
key_node = key_nodes[0]

# Extract a subgraph centered around the key node
# This example uses ego_graph to get all nodes connected directly to the key node
subgraph = nx.ego_graph(G, key_node)

# For a larger radius (more steps away), you can use the radius parameter
# subgraph = nx.ego_graph(G, key_node, radius=2)


In [None]:
plt.figure(figsize=(8, 8))
pos = nx.spring_layout(subgraph, k=0.5)
nx.draw(subgraph, pos, with_labels=True, node_color='lightblue', edge_color='gray', linewidths=1, font_size=12, node_size=500, alpha=0.7)
plt.title(f"Subgraph Centered Around Node: {key_node}")
plt.show()
