In [32]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
%%time
df = pd.DataFrame()
for i in range(1, 25):
    df = pd.concat([df, pd.read_csv(f'data/arxiv-lemmatized_preprocessed_NLTK_{i}.csv')])
df.processed_text = df.processed_text.astype(str)
agg_df = df.groupby('categories')['processed_text'].agg(lambda x: ' '.join(x)).reset_index()


CPU times: user 16.8 s, sys: 1.14 s, total: 18 s
Wall time: 18 s


In [34]:
# Function to extract keywords with scoring using TF-IDF
def extract_keywords_with_scores(text):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    dense = tfidf_matrix.todense().tolist()[0]
    
    # Create a list of tuples with (word, score)
    keyword_scores = list(zip(feature_names, dense))
    
    # Sort the list by TF-IDF score in descending order
    keyword_scores = sorted(keyword_scores, key=lambda x: x[1], reverse=True)

    # Extract the top 10 keywords with their scores
    top_keywords = keyword_scores[:10]
    
    return top_keywords

# 

In [35]:
%%time
agg_df['keywords_with_scores'] = agg_df['processed_text'].progress_apply(extract_keywords_with_scores)

# Display the DataFrame with keywords and scores


100%|████████████████████████████████████████████████████████████████████| 149/149 [01:06<00:00,  2.25it/s]

CPU times: user 1min 4s, sys: 1.72 s, total: 1min 6s
Wall time: 1min 6s





In [36]:
%%time
df_exploded = agg_df.drop(columns = 'processed_text').explode('keywords_with_scores')

# Create separate 'keyword' and 'score' columns
df_exploded[['keyword', 'score']] = pd.DataFrame(df_exploded['keywords_with_scores'].tolist(), index=df_exploded.index)

# Drop the original 'keywords_with_scores' column
df_exploded = df_exploded.drop('keywords_with_scores', axis=1).reset_index(drop = True)
df_exploded.to_csv('data/master_TFIDF_scoring.csv', index = False)

CPU times: user 10.4 ms, sys: 1.87 ms, total: 12.3 ms
Wall time: 10.9 ms
