In [1]:
import tensorflow as tf

# Ask TensorFlow which physical GPU devices it can see and report the result.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    # An empty list means no GPU device was detected.
    print("GPU is NOT available.")
else:
    print(f"GPU is ready to be used: {gpus[0].name}")


GPU is NOT available.


In [3]:
import tensorflow as tf
# Report the TensorFlow build in use, then the GPU devices it detects
# (an empty list means none were found).
print(
    "TensorFlow version:",
    tf.__version__,
)
print(
    "GPUs available:",
    tf.config.list_physical_devices('GPU'),
)


TensorFlow version: 2.14.0
GPUs available: []


In [2]:
import tensorflow as tf
from tensorflow.python.client import device_lib

# Dump every local compute device TensorFlow's device library reports.
print("Available devices:")
print(
    device_lib.list_local_devices()
)

Available devices:
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3403532797827082677
xla_global_id: -1
]


In [3]:
import tensorflow as tf
# Print the installed TensorFlow version string.
# NOTE(review): the output recorded for this cell is 2.18.0, while another
# version-check cell in this notebook recorded 2.14.0 -- presumably the cells
# were run under different environments; re-run from a fresh kernel to confirm.
print(tf.__version__)


2.18.0


In [1]:
# Import necessary libraries
import re
import nltk
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score
from scipy.sparse import lil_matrix, csr_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import train_test_split, KFold

# For inline plots
%matplotlib inline

# Download NLTK resources.
# 'punkt' serves older NLTK releases; recent releases load the tokenizer models
# used by word_tokenize from 'punkt_tab' instead and raise LookupError when
# only 'punkt' is present, so both are fetched. Already-installed packages are
# reported as up-to-date and not downloaded again.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Read the entire corpus file into memory as a single string.
filename = "text8.txt"  # Replace with your file path
with open(filename, mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Clean text
def clean_text(txt):
    """Normalise raw text before tokenisation.

    Steps, applied in order:
      1. lower-case the text;
      2. replace each run of non-ASCII characters with a single space;
      3. delete digits;
      4. delete punctuation, including underscores.

    Args:
        txt: The raw text as a ``str``.

    Returns:
        The cleaned text. Whitespace is left untouched, so word boundaries
        that were marked by spaces or newlines survive.
    """
    txt = txt.lower()
    txt = re.sub(r'[^\x00-\x7F]+', ' ', txt)  # Remove non-ASCII characters
    txt = re.sub(r'\d+', '', txt)            # Remove digits
    # `\w` matches '_' as a word character, so `[^\w\s]` alone would leave
    # underscores in the text; strip them explicitly.
    txt = re.sub(r'[^\w\s]|_', '', txt)      # Remove punctuation
    return txt

# Clean the loaded corpus and show its first 200 characters as a sanity check.
text = clean_text(text)
print(
    f"Text after cleaning (sample): {text[:200]}"
)


Text after cleaning (sample):  anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term 


In [3]:
# Split the cleaned corpus into word tokens.
tokens = word_tokenize(text)

# Drop English stopwords and whitespace-only tokens.
stop_words = set(nltk.corpus.stopwords.words('english'))
tokens = [
    tok
    for tok in tokens
    if not (tok in stop_words or tok.strip() == "")
]
print(f"Number of tokens after cleaning and stopword removal: {len(tokens)}")


Number of tokens after cleaning and stopword removal: 10888361


In [4]:
# Tally how often each token occurs, ordered from most to least frequent.
freq = Counter(tokens)
most_common = freq.most_common()

# Keep only the `vocab_size` most frequent words and give each one an integer
# id equal to its frequency rank (0 = most frequent).
vocab_size = 5000
vocab_list = [entry[0] for entry in most_common[:vocab_size]]
vocab = dict(zip(vocab_list, range(len(vocab_list))))

print(f"Vocabulary size: {len(vocab)}")
print(f"Sample vocabulary: {vocab_list[:10]}")


Vocabulary size: 5000
Sample vocabulary: ['one', 'zero', 'nine', 'two', 'eight', 'five', 'three', 'four', 'six', 'seven']


In [5]:
# Filter tokens based on vocabulary.
# Membership is tested against the `vocab` dict (constant-time hash lookup)
# rather than the `vocab_list` list: scanning a 5000-element list once per
# token is prohibitively slow on a corpus of millions of tokens. `vocab` has
# exactly the words of `vocab_list` as its keys, so the result is the same.
filtered_tokens = [t for t in tokens if t in vocab]

# Train-validation split.
# shuffle=False keeps the tokens in corpus order. The default (shuffle=True)
# would scramble the sequence, and the window-based co-occurrence counts
# computed from `train_tokens` are only meaningful if neighbouring tokens are
# the ones that were adjacent in the original text. With shuffling disabled the
# first 80% of the tokens form the training set and the last 20% the
# validation set.
train_tokens, val_tokens = train_test_split(filtered_tokens, test_size=0.2, shuffle=False)
print(f"Training tokens: {len(train_tokens)}, Validation tokens: {len(val_tokens)}")


KeyboardInterrupt: 

In [None]:
# Build co-occurrence matrix
def build_cooccurrence_matrix(token_sequence, index_of, window):
    """Count symmetric-window co-occurrences of the tokens in a sequence.

    Entry ``[a, b]`` of the result is the number of positions ``i != j`` with
    ``|i - j| <= window`` where ``token_sequence[i]`` maps to row ``a`` and
    ``token_sequence[j]`` maps to column ``b``.

    Args:
        token_sequence: Ordered sequence of tokens.
        index_of: Mapping from token to its row/column index; must contain
            every token in ``token_sequence``.
        window: Number of neighbours considered on each side of a token.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape
        ``(len(index_of), len(index_of))`` holding integer counts.
    """
    size = len(index_of)
    ids = np.fromiter(
        (index_of[tok] for tok in token_sequence),
        dtype=np.int64,
        count=len(token_sequence),
    )
    counts = csr_matrix((size, size), dtype=np.int64)

    for offset in range(1, window + 1):
        left = ids[:-offset]
        right = ids[offset:]
        if left.size == 0:
            # The sequence is shorter than this offset; larger ones are too.
            break
        # One entry per (position, position + offset) pair; the constructor
        # sums duplicate (row, col) coordinates into a single count.
        forward = csr_matrix(
            (np.ones(left.size, dtype=np.int64), (left, right)),
            shape=(size, size),
        )
        # Each pair is seen once from either end, so add the transpose too.
        counts = counts + forward + forward.T

    return csr_matrix(counts)


window_size = 2
train_vocab = sorted(set(train_tokens))
word_index = {word: idx for idx, word in enumerate(train_vocab)}

# Vectorised construction: updating a lil_matrix one element at a time inside
# a Python loop over every (token, context) pair is extremely slow on a corpus
# of this size.
co_matrix_sparse = build_cooccurrence_matrix(train_tokens, word_index, window_size)
print(f"Non-zero entries in the co-occurrence matrix: {co_matrix_sparse.nnz}")


In [None]:
# Scale every row of the co-occurrence matrix to unit L2 norm, then project
# the rows onto 10 components with truncated SVD.
normalized_matrix = normalize(co_matrix_sparse, axis=1, norm='l2')
svd = TruncatedSVD(random_state=42, n_components=10)
reduced_matrix = svd.fit_transform(normalized_matrix)

print(
    f"Reduced matrix shape: {reduced_matrix.shape}"
)


In [None]:
# Silhouette analysis: fit K-Means for every candidate K and record the
# silhouette score of the resulting clustering.
k_values = range(2, 50)
silhouette_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    # fit() returns the estimator itself, so the labels can be read directly.
    labels = kmeans.fit(reduced_matrix).labels_
    score = silhouette_score(reduced_matrix, labels)
    silhouette_scores.append(score)
    print(f"K={k}, Silhouette Score={score:.4f}")

# Silhouette score as a function of K.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, silhouette_scores, marker='o')
ax.set_title("Silhouette Scores for Different K")
ax.set_xlabel("Number of Clusters (K)")
ax.set_ylabel("Silhouette Score")
ax.grid()
plt.show()


In [None]:
# Elbow method to find optimal K
def find_elbow_k(ks, inertias):
    """Locate the elbow of an inertia-versus-K curve.

    Both axes are rescaled to [0, 1]; the elbow is the point with the largest
    perpendicular distance from the straight line joining the first and last
    points of the curve.

    Args:
        ks: Candidate cluster counts, in the order the inertias were computed.
        inertias: K-Means inertia for each entry of ``ks``.

    Returns:
        The element of ``ks`` at the elbow. When the curve has fewer than three
        points, or is flat along either axis, no elbow exists and the first
        element of ``ks`` is returned.

    Raises:
        ValueError: If ``ks`` is empty or the two inputs differ in length.
    """
    k_list = list(ks)
    y = np.asarray(inertias, dtype=float)
    if not k_list or len(k_list) != y.size:
        raise ValueError("ks and inertias must be non-empty and of equal length")

    x = np.asarray(k_list, dtype=float)
    k_span = x[-1] - x[0]
    inertia_span = y.max() - y.min()
    if len(k_list) < 3 or k_span == 0 or inertia_span == 0:
        return k_list[0]

    # Rescale so that neither axis dominates the distance computation.
    x = (x - x[0]) / k_span
    y = (y - y.min()) / inertia_span

    # Perpendicular distance of every point from the first-to-last chord.
    dx, dy = x[-1] - x[0], y[-1] - y[0]
    distance = np.abs(dy * (x - x[0]) - dx * (y - y[0])) / np.hypot(dx, dy)
    return k_list[int(np.argmax(distance))]


inertia_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(reduced_matrix)
    inertia_scores.append(kmeans.inertia_)

# Plot inertia scores (Elbow graph)
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia_scores, marker='o')
plt.title("Elbow Graph for K-Means")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia")
plt.grid()
plt.show()

# Find optimal K.
# Inertia keeps falling as K grows, so taking the K with the *minimum* inertia
# would in practice always select the largest K tried; the elbow is the point
# where the curve bends instead.
optimal_k = find_elbow_k(k_values, inertia_scores)
print(f"Optimal number of clusters (Elbow Method): {optimal_k}")


In [None]:
# Cross-validation silhouette scores
def silhouette_scores_cv(X, max_k=10, n_folds=5):
    """Average held-out silhouette score for K = 2 .. ``max_k``.

    For every K the rows of ``X`` are split into ``n_folds`` shuffled folds
    (fixed seed). K-Means is fitted on the training part of each fold, the
    held-out rows are assigned to the learned centroids, and the cosine
    silhouette score of that assignment is recorded. One line per K is printed.

    Args:
        X: Array of samples, indexable by an integer index array.
        max_k: Largest number of clusters to evaluate (inclusive).
        n_folds: Number of cross-validation folds.

    Returns:
        A list with one mean score per K, in the order K = 2 .. ``max_k``.
        A K whose folds are all degenerate gets ``nan``.
    """
    k_values = range(2, max_k + 1)
    avg_scores = []

    # The folds depend only on n_folds and the fixed seed, so build them once
    # and reuse the same splits for every K.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    folds = list(kf.split(X))

    for k in k_values:
        fold_scores = []

        for train_idx, test_idx in folds:
            # n_init is pinned to match the other K-Means runs in this
            # notebook and to avoid depending on the library's default.
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            kmeans.fit(X[train_idx])
            labels = kmeans.predict(X[test_idx])

            # silhouette_score needs between 2 and n_samples - 1 distinct
            # labels; record such a fold as missing rather than aborting.
            n_labels = len(np.unique(labels))
            if n_labels < 2 or n_labels > len(test_idx) - 1:
                fold_scores.append(np.nan)
                continue

            score = silhouette_score(X[test_idx], labels, metric="cosine")
            fold_scores.append(score)

        if np.all(np.isnan(fold_scores)):
            mean_score = np.nan
        else:
            mean_score = np.nanmean(fold_scores)
        avg_scores.append(mean_score)
        print(f"K={k}, Avg Silhouette Score: {mean_score:.4f}")

    return avg_scores

# Run the cross-validated silhouette analysis for K = 2 .. 20 and plot it.
cv_scores = silhouette_scores_cv(reduced_matrix, max_k=20)

# One score is returned per K starting at K = 2, so the x-axis is derived from
# the number of scores.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(range(2, 2 + len(cv_scores)), cv_scores, marker='o')
ax.set_title("Cross-Validation Silhouette Scores")
ax.set_xlabel("Number of Clusters (K)")
ax.set_ylabel("Silhouette Score")
ax.grid()
plt.show()
