In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer # term frequency inverse document frequency - create a bag of words
from sklearn.decomposition import LatentDirichletAllocation # LDA 

import numpy as np
from scipy import sparse
import pandas as pd

import random

In [2]:
text_file = open("corpus.txt", "r")
data_samples = text_file.readlines()
random.shuffle(data_samples) # ml algorithm sees the data in a random order for best results
print(len(data_samples))

1210


In [4]:
split_percentage = 90 # we want 90% of our data to be within the training list
X_train_document, X_test_document = np.split(data_samples, [int(len(data_samples)*(split_percentage/100))])

In [5]:
len(X_train_document)

1089

In [6]:
tfidf_vectorizer = TfidfVectorizer(max_df = 0.95, # removes terms with document frequency higher than 95% of the documents
                                 min_df = 100, # the number of times a term has to appear within the document to be counted in the frequency 
                                 stop_words = 'english') # removes stop words using an english list
# better results achievable with pre-processing of the data

In [7]:
tfidf = tfidf_vectorizer.fit_transform(data_samples) # bag of words has been created

In [8]:
tfidf = tfidf.toarray()
l, _ = tfidf.shape

X_train, X_test = np.split(tfidf, [int(l*(split_percentage/100))])

X_train = sparse.csr_matrix(X_train)
X_test = sparse.csr_matrix(X_test)

print(X_train.shape)
print(X_test.shape)


(1089, 294)
(121, 294)


In [9]:
df = pd.DataFrame.sparse.from_spmatrix(X_train) 
df.columns = tfidf_vectorizer.get_feature_names_out()
df

Unnamed: 0,10,access,account,actions,add,adding,additional,address,algorithm,algorithms,...,versions,view,volume,volumes,want,way,web,windows,work,write
0,0.0,0.290029,0.23287,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.043385,0.000000,0.000000,0.000000,0.00000,0.000000
1,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.085307,0.048659,...,0.0,0.037095,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
2,0.0,0.012118,0.00000,0.000000,0.0,0.000000,0.015232,0.000000,0.000000,0.000000,...,0.0,0.000000,0.361815,0.537923,0.000000,0.038615,0.000000,0.000000,0.00000,0.072679
3,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.013281,0.014104,0.000000,0.000000,0.013736,0.014506,0.00000,0.000000
4,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1084,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.075328,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.213753,0.000000,0.000000,0.000000,0.00000,0.000000
1085,0.0,0.030181,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.130450,0.049607,...,0.0,0.000000,0.090112,0.000000,0.000000,0.000000,0.000000,0.000000,0.04671,0.000000
1086,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.200226,0.228421,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
1087,0.0,0.018776,0.00000,0.025097,0.0,0.031551,0.023601,0.029694,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000


In [10]:
topics = 3 # how many topics do we want?
model = LatentDirichletAllocation(n_components = topics)

In [11]:
model.fit(X_train) # unsupervised model

In [12]:
tf_feature_names = tfidf_vectorizer.get_feature_names_out()

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             
        for i in topic.argsort()[:-n_top_words - 1:-1]])
        
        print(message)
    return 
    
print_top_words(model, tf_feature_names, 10)

Topic #0: bucket s3 object choose amazon access aws objects console storage
Topic #1: sagemaker model training job amazon algorithm notebook data following metrics
Topic #2: instance instances ec2 amazon volume ami launch linux command aws


In [14]:
test_sample = 1 
p = model.transform(X_test[test_sample]) # lists the probabilities of three topics of the document
print(p)

[[0.48117706 0.05862501 0.46019793]]


In [15]:
t = p.argmax() 
print("Topic #{}".format(t)) # returns the topic of the sample

Topic #0


In [16]:
print(X_test_document[test_sample])

amazon s3 is a repository for internet data. amazon s3 provides access to reliable, fast, and inexpensive data storage infrastructure. it is designed to make web-scale computing easier by enabling you to store and retrieve any amount of data, at any time, from within amazon ec2 or anywhere on the web. amazon s3 stores data objects redundantly on multiple devices across multiple facilities and allows concurrent read or write access to these data objects by many separate clients or application threads. you can use the redundant data stored in amazon s3 to recover quickly and reliably from instance or application failures.  amazon ec2 uses amazon s3 for storing amazon machine images (amis). you use amis for launching ec2 instances. in case of instance failure, you can use the stored ami to immediately launch another instance, thereby allowing for fast recovery and business continuity. amazon ec2 also uses amazon s3 to store snapshots (backup copies) of the data volumes. you can use snapsh