## Imports

In [None]:
from collections import Counter
from functools import lru_cache

import nltk
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# NLTK data packages fetched for the tokenizer, stop-word list, lemmatizer and
# POS tagger used by the cells below.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger_eng',
)

# nltk.download() reports failure through its boolean return value instead of
# raising, so collect the packages that could not be fetched and surface them
# here rather than failing later with a LookupError deep inside a row-wise apply.
failed_downloads = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_downloads:
    print(f"NLTK resources that could not be downloaded: {failed_downloads}")

# Cell output: True when every package is available.
not failed_downloads

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

## Data loading

In [None]:
# Load the input table; the cells below read its 'text' column.
# The path is relative, so it is resolved against the kernel's working directory.
# NOTE(review): the df.head() preview in this notebook shows an 'Unnamed: 0'
# column, which suggests the CSV was written with its index — consider
# index_col=0 here, but confirm with downstream consumers first.
file_name = "train_with_summaries.csv"  # File name is predefined
df = pd.read_csv(file_name)

## Method1 (Counter + NLTK stopwords)

In [None]:
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loading it only once."""
    return frozenset(stopwords.words('english'))


def extract_keywords(text, n=5):
    """Return the ``n`` most frequent content words of ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic or that are English stop words are discarded.

    Args:
        text: Document to analyse. Non-string values (for example the float
            NaN pandas uses for missing cells) yield an empty list.
        n: Maximum number of keywords to return.

    Returns:
        A list of at most ``n`` lower-cased words ordered by descending
        frequency; equally frequent words keep their order of first appearance.
    """
    if not isinstance(text, str):
        # Missing cells have no .lower(); treat them as "no keywords" instead of crashing.
        return []

    stop_words = _english_stop_words()  # Cached: not re-read from the corpus for every row
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    word_counts = Counter(
        token for token in tokens if token.isalpha() and token not in stop_words
    )  # Count only alphabetic, non-stop-word tokens
    return [word for word, _ in word_counts.most_common(n)]  # Return top n keywords

In [None]:
# Frequency-based keywords for every story: compute the top 5 once,
# then take the first three of each list as the top-3 column.
df['keywords_counter_top5'] = df['text'].apply(extract_keywords)
df['keywords_counter_top3'] = [top5[:3] for top5 in df['keywords_counter_top5']]

## Method2 (TF-IDF matrix + NLTK stopwords + lemmatization + name filtering)

In [None]:
# First letter of a Treebank tag -> WordNet POS code understood by the lemmatizer.
_WORDNET_POS_BY_TREEBANK_PREFIX = {
    'J': 'a',  # Adjective
    'V': 'v',  # Verb
    'N': 'n',  # Noun
    'R': 'r',  # Adverb
}


def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the WordNet POS code used for lemmatization.

    Only the first character of the tag is inspected; any tag that is not an
    adjective, verb, noun or adverb tag falls back to noun ('n').
    """
    return _WORDNET_POS_BY_TREEBANK_PREFIX.get(treebank_tag[:1], 'n')

In [None]:
@lru_cache(maxsize=1)
def _tfidf_preprocessing_resources():
    """Build the English stop-word set and the lemmatizer once, for reuse on every row."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text_for_tfidf(text, only_nouns=False):
    """Reduce ``text`` to a space-separated string of lemmas for TF-IDF.

    Tokens are POS-tagged in their original casing and kept only if they are
    purely alphabetic and do not start with an upper-case letter. The
    upper-case rule is the name filter; it drops every capitalised token,
    which also includes sentence-initial words. Surviving tokens are
    lower-cased, stripped of English stop words and lemmatized using their
    POS tag.

    Args:
        text: Raw document. Non-string values (for example the float NaN
            pandas uses for missing cells) yield an empty string.
        only_nouns: When True, additionally keep only tokens whose tag starts
            with 'NN'.

    Returns:
        The retained lemmas joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        # Missing cells cannot be tokenized; an empty document keeps the column usable.
        return ""

    stop_words, lemmatizer = _tfidf_preprocessing_resources()

    lemmatized_words = []
    # Tag before lower-casing: the capitalisation check below needs the original form.
    for word, tag in pos_tag(word_tokenize(text)):
        # isalpha() is checked first, so word[0] is never evaluated on an empty token.
        if not word.isalpha() or word[0].isupper():
            continue
        if only_nouns and not tag.startswith('NN'):
            continue
        lowered = word.lower()
        if lowered in stop_words:
            continue
        lemmatized_words.append(lemmatizer.lemmatize(lowered, pos=get_wordnet_pos(tag)))

    return " ".join(lemmatized_words)

In [None]:
def compute_tfidf(df, column_name):
    """Fit a TF-IDF model on one text column of ``df``.

    Args:
        df: DataFrame holding the documents.
        column_name: Name of the column whose values are the documents.

    Returns:
        A ``(tfidf_matrix, feature_names)`` tuple: the result of
        ``fit_transform`` on the column and the vectorizer's feature names.
    """
    documents = df[column_name]
    # The vectorizer applies its own built-in English stop-word list.
    model = TfidfVectorizer(stop_words='english')
    weights = model.fit_transform(documents)
    return weights, model.get_feature_names_out()

In [None]:
def extract_keywords_from_tfidf(tfidf_matrix, feature_names, doc_index, n=3):
    """Return the ``n`` highest-scoring terms of one document.

    Only terms with a positive score in the document's row are considered, so
    a document with fewer than ``n`` scored terms yields a shorter list
    instead of being padded with zero-score vocabulary words that never occur
    in it. Working on the stored entries also avoids densifying and sorting
    the whole vocabulary for every document.

    Args:
        tfidf_matrix: 2-D SciPy sparse matrix of scores, one row per document.
        feature_names: Sequence mapping column index to term.
        doc_index: Row position of the document in ``tfidf_matrix``.
        n: Maximum number of keywords to return.

    Returns:
        A list of at most ``n`` terms ordered by descending score; terms with
        equal scores are ordered by ascending column index.
    """
    row = tfidf_matrix[doc_index].tocoo()  # Stored (column, score) entries of this document
    scored_columns = [(col, score) for col, score in zip(row.col, row.data) if score > 0]
    # Explicit tie-break on the column index keeps the ordering deterministic.
    scored_columns.sort(key=lambda pair: (-pair[1], pair[0]))
    return [feature_names[col] for col, _ in scored_columns[:n]]  # Return top n keywords

In [None]:
# Name of each preprocessed-text column mapped to the `only_nouns` flag used to build it
columns_to_process = {
    'td_idf': False,  # All parts of speech
    'td_idf_only_nouns': True  # Nouns only
}

# Build the preprocessed text columns from the raw stories
for column, only_nouns in columns_to_process.items():
    df[column] = df['text'].apply(preprocess_text_for_tfidf, only_nouns=only_nouns)

# Fit one TF-IDF model per preprocessed column
tfidf_data = {}
for column in columns_to_process:
    tfidf_data[column] = compute_tfidf(df, column)

# Attach the top-5 and top-3 keyword lists of every TF-IDF variant to the DataFrame
n_documents = len(df)
for column, (tfidf_matrix, feature_names) in tfidf_data.items():
    keyword_column_5 = f"keywords_{column}_top5"
    keyword_column_3 = f"keywords_{column}_top3"

    # Matrix rows are addressed by position, hence range(len(df)) and not the index labels
    top5_per_document = [
        extract_keywords_from_tfidf(tfidf_matrix, feature_names, i, n=5)
        for i in range(n_documents)
    ]
    df[keyword_column_5] = top5_per_document

    # The top 3 keywords are the first three of the top 5
    df[keyword_column_3] = [keywords[:3] for keywords in top5_per_document]

In [None]:
# Write the DataFrame (input columns plus every derived text/keyword column) to CSV.
# index=False leaves the DataFrame index out of the file.
# NOTE(review): the keyword columns hold Python lists, which end up in the CSV as
# their string form — consumers presumably need to parse them back; confirm.
output_file = "stories_summaries_keywords.csv"
df.to_csv(output_file, index=False)

In [None]:
# Preview the first rows to sanity-check the keyword columns added above.
df.head()

Unnamed: 0,text,summary,keywords_counter_top5,keywords_counter_top3,td_idf,td_idf_only_nouns,keywords_td_idf_top5,keywords_td_idf_top3,keywords_td_idf_only_nouns_top5,keywords_td_idf_only_nouns_top3
0,"One day, a little girl named Lily found a need...","Lily, a little girl, and her mother successful...","[lily, needle, mom, shirt, share]","[lily, needle, mom]",day little girl name find needle room know dif...,day girl needle room share needle mom button s...,"[needle, shirt, share, sew, difficult]","[needle, shirt, share]","[shirt, needle, share, button, mom]","[shirt, needle, share]"
1,"Once upon a time, there was a little car named...","A happy little car named Beep, fueled by good ...","[beep, fuel, leaves, go, play]","[beep, fuel, leaves]",upon time little car name love go fast play su...,time car sun car fuel fuel day park tree tree ...,"[fuel, leaf, fall, healthy, tree]","[fuel, leaf, fall]","[fuel, leaf, tree, car, drove]","[fuel, leaf, tree]"
2,"One day, a little fish named Fin was swimming ...",A little fish named Fin befriends a crab who i...,"[fin, feel, crab, sun, little]","[fin, feel, crab]",day little fish name swim near shore saw big c...,day fish shore crab friend fish crab crab feel...,"[crab, fine, feel, sun, fish]","[crab, fine, feel]","[crab, fish, sun, shore, feel]","[crab, fish, sun]"
3,"Once upon a time, in a land full of trees, the...","A small, weak cherry tree finds happiness and ...","[cherry, tree, trees, little, wind]","[cherry, tree, trees]",upon time land full tree little cherry tree ch...,time land tree tree cherry tree friend tree ch...,"[cherry, tree, wind, land, spring]","[cherry, tree, wind]","[cherry, tree, wind, land, tickle]","[cherry, tree, wind]"
4,"Once upon a time, there was a little girl name...","A young girl named Lily, along with her cat an...","[lily, cobweb, lived, cat, dog]","[lily, cobweb, lived]",upon time little girl name like pretend popula...,time girl princess castle friend cat dog day c...,"[cobweb, spider, cat, dog, castle]","[cobweb, spider, cat]","[cobweb, spider, cat, dog, castle]","[cobweb, spider, cat]"
