In [1]:
import emojis
import nltk
import string
from nltk.corpus import stopwords
from gensim import corpora, models
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import CoherenceModel
from wordcloud import WordCloud

In [2]:
# Fetch the NLTK data packages that the preprocessing code in this notebook
# relies on. Packages that are already present and current are not fetched
# again (the downloader reports them as "already up-to-date").
# 'stopwords': word lists read through stopwords.words('english').
nltk.download('stopwords')
# 'punkt': tokenizer models needed by word_tokenize.
nltk.download('punkt')
# 'wordnet': lexical database backing WordNetLemmatizer. As the last
# expression of the cell, this call's return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
def preprocess_text1(text):
    """Turn raw text into cleaned, lemmatized word tokens for keyword generation.

    Processing steps:
      1. Emoji characters are rewritten to their textual ``:alias:`` form by
         ``emojis.decode``.
      2. The text is split into tokens with NLTK's ``word_tokenize``.
      3. English stopwords (matched case-insensitively) and every token that
         is not purely alphabetic are discarded.
      4. The surviving tokens are lowercased and lemmatized with WordNet.

    Args:
        text: The input string to preprocess.

    Returns:
        A list of lowercased, lemmatized tokens in their original order.
    """
    # NOTE(review): decode() converts emoji into alias text (e.g. ":smile:")
    # rather than deleting them, so an alphabetic alias name may survive the
    # filter below as an ordinary word -- confirm whether that is intended.
    text = emojis.decode(text)

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    # isalpha() already rejects punctuation, digits and mixed tokens, so no
    # separate punctuation test is required. Tokens are lowercased here
    # because the WordNet lemmatizer looks words up in lowercase: a
    # capitalized token such as "Dogs" would otherwise come back unchanged
    # and later be counted separately from "dog".
    kept = [
        token.lower()
        for token in tokens
        if token.isalpha() and token.lower() not in stop_words
    ]

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in kept]



In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

def generate_keywords_standard(summary, num_keywords=6):
    """Pick the highest-scoring words of ``summary`` as keywords.

    The summary is preprocessed with ``preprocess_text1``, re-joined into a
    single string and scored with a ``TfidfVectorizer`` fitted on that one
    document. Because only a single document is fitted, every term receives
    the same IDF weight, so the ranking is effectively by (normalized) term
    frequency.

    Args:
        summary: The text to extract keywords from.
        num_keywords: Maximum number of keywords to return. ``None`` returns
            every scored term; zero or a negative value returns an empty list.

    Returns:
        A list of at most ``num_keywords`` terms, highest score first. The
        list is empty when no usable term remains after preprocessing.
    """
    print("Generating Keywords...")

    # A negative count would otherwise slice "all but the last k" terms.
    if num_keywords is not None and num_keywords <= 0:
        return []

    words = preprocess_text1(summary)
    if not words:
        # Empty or all-stopword input: nothing to score.
        return []
    processed_text = " ".join(words)

    vectorizer = TfidfVectorizer()
    try:
        # Fit and transform in one pass over the single-document corpus.
        tfidf_matrix = vectorizer.fit_transform([processed_text])
    except ValueError:
        # The vectorizer raises ValueError on an empty vocabulary, e.g. when
        # every remaining word is rejected by its own token pattern.
        return []

    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray().flatten()

    # Indices of the highest scores first, truncated to the requested count.
    top_indices = scores.argsort()[::-1][:num_keywords]
    return [feature_names[i] for i in top_indices]


In [5]:
import time
import psutil

def main():
    """Run keyword generation on a sample paragraph and report basic metrics.

    Prints the elapsed time of the keyword-generation call, the system-wide
    CPU utilization over that call, the system-wide memory utilization, and
    the generated keywords.
    """
    # Sample big paragraph
    paragraph = """
    Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed non risus. Suspendisse lectus tortor, dignissim sit amet, adipiscing nec, ultricies sed, dolor. Cras elementum ultrices diam. Maecenas ligula massa, varius a, semper congue, euismod non, mi. Proin porttitor, orci nec nonummy molestie, enim est eleifend mi, non fermentum diam nisl sit amet erat. Duis semper. Duis arcu massa, scelerisque vitae, consequat in, pretium a, enim. Pellentesque congue. Ut in risus volutpat libero pharetra tempor. Cras vestibulum bibendum augue. Praesent egestas leo in pede. Praesent blandit odio eu enim. Pellentesque sed dui ut augue blandit sodales. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Aliquam nibh. Mauris ac mauris sed pede pellentesque fermentum. Maecenas adipiscing ante non diam sodales...
    """

    # Prime the CPU counter: with no interval, cpu_percent() reports usage
    # since the previous call, so this call marks the start of the measured
    # window and its own return value is discarded.
    psutil.cpu_percent(interval=None)

    # perf_counter() is monotonic and intended for measuring elapsed time.
    start_time = time.perf_counter()

    # Generate keywords
    keywords = generate_keywords_standard(paragraph)

    time_taken = time.perf_counter() - start_time

    # System-wide CPU utilization since the priming call above, i.e. over
    # the keyword-generation window.
    cpu_usage = psutil.cpu_percent(interval=None)

    # Print time taken
    print("Time taken:", time_taken, "seconds")

    # Print CPU usage
    print("CPU usage:", cpu_usage, "%")

    # Print memory usage (percentage of system memory in use, not just this
    # process)
    memory_usage = psutil.virtual_memory().percent
    print("Memory usage:", memory_usage, "%")

    # Display keywords
    print("Keywords:", keywords)

if __name__ == "__main__":
    main()


Generating Keywords...
Time taken: 1.6271331310272217 seconds
CPU usage: 68.2 %
Memory usage: 7.4 %
Keywords: ['sed', 'non', 'enim', 'diam', 'adipiscing', 'pellentesque']
