To get the results for the spaCy-preprocessed words instead, replace every `_nltk` suffix in this notebook with `_spacy`.

In [1]:
import ast

import numpy as np
import pandas as pd
import spacy

In [2]:
# Read the pre-processed headline features and keep only the four text
# columns that the rest of the notebook works with.
spacy_df = pd.read_csv('data/features_nltk.csv')[
    ['headline', 'headline_cleaned', 'tokenized_text_nltk', 'lemmatized_text_nltk']
]
spacy_df.head()

Unnamed: 0,headline,headline_cleaned,tokenized_text_nltk,lemmatized_text_nltk
0,former versace store clerk sues over secret 'b...,former versace store clerk sues over secret bl...,"['former', 'versace', 'store', 'clerk', 'sues'...","['former', 'versace', 'store', 'clerk', 'sue',..."
1,the 'roseanne' revival catches up to our thorn...,the roseanne revival catches up to our thorny ...,"['the', 'roseanne', 'revival', 'catches', 'up'...","['roseanne', 'revival', 'catch', 'thorny', 'po..."
2,mom starting to fear son's web series closest ...,mom starting to fear sons web series closest t...,"['mom', 'starting', 'to', 'fear', 'sons', 'web...","['mom', 'starting', 'fear', 'son', 'web', 'ser..."
3,"boehner just wants wife to listen, not come up...",boehner just wants wife to listen not come up ...,"['boehner', 'just', 'wants', 'wife', 'to', 'li...","['boehner', 'just', 'want', 'wife', 'listen', ..."
4,j.k. rowling wishes snape happy birthday in th...,jk rowling wishes snape happy birthday in the ...,"['jk', 'rowling', 'wishes', 'snape', 'happy', ...","['jk', 'rowling', 'wish', 'snape', 'happy', 'b..."


## Word2Vec

In [3]:
from gensim.models import Word2Vec

In [4]:
def _parse_token_list(cell):
    """Turn the CSV text form of a token list back into a real Python list.

    The token columns come back from the CSV as strings such as
    "['former', 'versace']". `ast.literal_eval` parses that literal exactly,
    so tokens that contain quotes, commas, brackets or spaces survive intact
    (stripping those characters and splitting on whitespace would mangle them).

    Values that are not strings are returned unchanged, which makes it safe to
    re-run this cell after the columns have already been converted.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


for token_column in ('tokenized_text_nltk', 'lemmatized_text_nltk'):
    spacy_df[token_column] = spacy_df[token_column].apply(_parse_token_list)

In [5]:
# One list of tokens per headline, in DataFrame row order.
tokenized_sentences = spacy_df['tokenized_text_nltk'].tolist()
lemmatized_sentences = spacy_df['lemmatized_text_nltk'].tolist()

print(tokenized_sentences[0])


# Train Word2Vec models
# A fixed seed together with a single worker thread keeps the learned vectors
# repeatable between runs; multi-threaded training is order-dependent.
# NOTE(review): gensim also mentions PYTHONHASHSEED for fully deterministic
# runs - confirm whether that matters for this project.
# min_count=1 keeps every token, including those that occur only once.
w2v_tokenized = Word2Vec(tokenized_sentences, min_count=1, vector_size=100, seed=42, workers=1)
w2v_lemmatized = Word2Vec(lemmatized_sentences, min_count=1, vector_size=100, seed=42, workers=1)

['former', 'versace', 'store', 'clerk', 'sues', 'over', 'secret', 'black', 'code', 'for', 'minority', 'shoppers']


In [6]:
# Sanity check: each sentence should now be a Python list of tokens, not a string.
print(type(tokenized_sentences[0]))

<class 'list'>


In [7]:
# Function to get word embeddings for a sentence
def get_word2vec_embeddings(model, sentence):
    """Look up one embedding vector per word of `sentence`.

    Parameters
    ----------
    model : object exposing `model.wv`
        `model.wv[word]` must return the vector for `word` and raise
        `KeyError` for unknown words (as a trained gensim Word2Vec does).
    sentence : str or iterable of str
        Either a whitespace-separated string, or an already tokenized
        sequence of words (the form the notebook keeps its sentences in).

    Returns
    -------
    list
        One entry per word, in order. Unknown words get an all-zero list
        whose length is the model's vector size.
    """
    # Accept both raw strings and pre-tokenized sentences.
    words = sentence.split() if isinstance(sentence, str) else list(sentence)
    # Size the fallback vector from the model instead of assuming 100
    # dimensions; 100 remains the fallback when the model does not say.
    vector_size = getattr(model.wv, 'vector_size', 100)

    embeddings = []
    for word in words:
        try:
            embeddings.append(model.wv[word])
        except KeyError:
            embeddings.append([0] * vector_size)  # Default vector if word not found
    return embeddings


In [8]:
# Get word embeddings for tokenized and lemmatized sentences (for each word in the sentence)
# Every token is handed to the lookup helper on its own, so a row ends up as
# a list (one entry per token) of single-vector lists - hence the three-level
# nesting visible when the frame is displayed.
for source_column, target_column, w2v_model in (
    ('tokenized_text_nltk', 'tokenized_word2vec', w2v_tokenized),
    ('lemmatized_text_nltk', 'lemmatized_word2vec', w2v_lemmatized),
):
    spacy_df[target_column] = spacy_df[source_column].apply(
        lambda tokens, model=w2v_model: [
            get_word2vec_embeddings(model, token) for token in tokens
        ]
    )

In [9]:
# Preview the frame, now including the two Word2Vec embedding columns.
spacy_df.head()

Unnamed: 0,headline,headline_cleaned,tokenized_text_nltk,lemmatized_text_nltk,tokenized_word2vec,lemmatized_word2vec
0,former versace store clerk sues over secret 'b...,former versace store clerk sues over secret bl...,"[former, versace, store, clerk, sues, over, se...","[former, versace, store, clerk, sue, secret, b...","[[[-0.44300553, 0.5421461, 0.17735429, 0.12578...","[[[-0.3604961, 0.42469707, 0.24075222, 0.05691..."
1,the 'roseanne' revival catches up to our thorn...,the roseanne revival catches up to our thorny ...,"[the, roseanne, revival, catches, up, to, our,...","[roseanne, revival, catch, thorny, political, ...","[[[-0.34196183, 0.79046404, 0.36153653, 0.2158...","[[[-0.025584826, 0.013280583, 0.00817118, 0.00..."
2,mom starting to fear son's web series closest ...,mom starting to fear sons web series closest t...,"[mom, starting, to, fear, sons, web, series, c...","[mom, starting, fear, son, web, series, closes...","[[[-0.43636551, 0.51213676, 0.15836011, 0.1524...","[[[-0.5373482, 0.63286495, 0.34453112, 0.14433..."
3,"boehner just wants wife to listen, not come up...",boehner just wants wife to listen not come up ...,"[boehner, just, wants, wife, to, listen, not, ...","[boehner, just, want, wife, listen, not, come,...","[[[-0.103065215, 0.11661247, 0.057135496, 0.02...","[[[-0.09294353, 0.09803749, 0.053146668, 0.020..."
4,j.k. rowling wishes snape happy birthday in th...,jk rowling wishes snape happy birthday in the ...,"[jk, rowling, wishes, snape, happy, birthday, ...","[jk, rowling, wish, snape, happy, birthday, mo...","[[[-0.025916466, 0.023986552, 0.016223146, 0.0...","[[[-0.027325345, 0.032817025, 0.018137066, 0.0..."


In [10]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.matutils import corpus2dense
import numpy as np

In [11]:
# Create a dictionary
# Build one gensim Dictionary per preprocessing variant
# (tokenized first, then lemmatized).
tokenized_dict, lemmatized_dict = (
    Dictionary(sentences)
    for sentences in (tokenized_sentences, lemmatized_sentences)
)

In [12]:
# Convert to bag-of-words format
# Run every sentence through its dictionary's doc2bow to get the
# bag-of-words form that LdaModel is trained on.
tokenized_corpus = list(map(tokenized_dict.doc2bow, tokenized_sentences))
lemmatized_corpus = list(map(lemmatized_dict.doc2bow, lemmatized_sentences))

In [13]:
# Train LDA models
# Fit one 10-topic LDA model per preprocessing variant.
# random_state pins the model's random number generator so the learned topics
# (and the topic vectors saved later) are the same on every run.
lda_tokenized = LdaModel(tokenized_corpus, num_topics=10, id2word=tokenized_dict, random_state=42)
lda_lemmatized = LdaModel(lemmatized_corpus, num_topics=10, id2word=lemmatized_dict, random_state=42)

In [14]:
# Get topic distributions for each document
# Ask each model for the topic mixture of every document in its own corpus.
# Each document comes back as (topic_id, probability) pairs; not every
# topic id is necessarily present for a given document.
tokenized_topics, lemmatized_topics = (
    lda_model.get_document_topics(bow_corpus)
    for lda_model, bow_corpus in (
        (lda_tokenized, tokenized_corpus),
        (lda_lemmatized, lemmatized_corpus),
    )
)

In [15]:
# Inspect the (topic_id, probability) pairs returned for the first headline.
tokenized_topics[0]

[(0, 0.14148138),
 (1, 0.09167796),
 (4, 0.5092524),
 (6, 0.08440458),
 (8, 0.12791975)]

In [16]:
# Convert topic distributions to dense vectors
def get_dense_topics(topics, num_topics=10):
    """Expand sparse per-document topic lists into fixed-length vectors.

    Parameters
    ----------
    topics : iterable
        One entry per document; each entry is an iterable of
        (topic_id, probability) pairs.
    num_topics : int, default 10
        Length of every output vector. It must match the number of topics the
        model was trained with; the default matches the 10-topic models used
        in this notebook.

    Returns
    -------
    list of list
        One vector per document. Position `topic_id` holds that topic's
        probability; topics missing from a document's list stay 0.
    """
    dense_topics = []
    for doc_topics in topics:
        topic_vector = [0] * num_topics
        for topic_id, prob in doc_topics:
            topic_vector[topic_id] = prob
        dense_topics.append(topic_vector)
    return dense_topics

In [17]:
# Expand the sparse topic lists into fixed-length vectors.
tokenized_dense_topics = get_dense_topics(tokenized_topics)
lemmatized_dense_topics = get_dense_topics(lemmatized_topics)

# Show the first headline's vector as a quick check.
print(tokenized_dense_topics[0])

# Store one topic vector per row in the DataFrame.
for lda_column, dense_vectors in (
    ('Tokenized_lda_topics', tokenized_dense_topics),
    ('Lemmatized_lda_topics', lemmatized_dense_topics),
):
    spacy_df[lda_column] = dense_vectors


[0.14152764, 0.09167853, 0, 0, 0.5092569, 0, 0.08435021, 0, 0.12792276, 0]


In [89]:
# Write the frame (text columns plus Word2Vec and LDA features) to CSV,
# leaving out the pandas index.
spacy_df.to_csv('data/embeddings_nltk.csv', index=False)

Please do not push this CSV file to GitHub: the files are large, and writing one locally takes about 5-6 minutes.

## CountVectorizer and TfidfVectorizer

Owen

In [28]:
# Cell 1: Import CountVectorizer and TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [20]:
# Cell 2: Convert lists of tokens back to strings for vectorizers
# The vectorizers expect plain strings, so glue each token list back
# together with single spaces.
tokenized_texts = list(map(' '.join, tokenized_sentences))
lemmatized_texts = list(map(' '.join, lemmatized_sentences))

In [22]:
# Cell 3: CountVectorizer features
# Each variant gets its own vectorizer, capped at 1000 features to avoid
# overly sparse matrices.

# Tokenized text: fit, transform and record the feature names.
count_vec_tokenized = CountVectorizer(max_features=1000)
count_tokenized = count_vec_tokenized.fit_transform(tokenized_texts)
tokenized_count_features = count_vec_tokenized.get_feature_names_out()

# Lemmatized text: same steps.
count_vec_lemmatized = CountVectorizer(max_features=1000)
count_lemmatized = count_vec_lemmatized.fit_transform(lemmatized_texts)
lemmatized_count_features = count_vec_lemmatized.get_feature_names_out()

# Report (documents, features) for both matrices.
print(f"CountVectorizer features shape (tokenized): {count_tokenized.shape}")
print(f"CountVectorizer features shape (lemmatized): {count_lemmatized.shape}")


CountVectorizer features shape (tokenized): (26709, 1000)
CountVectorizer features shape (lemmatized): (26709, 1000)


In [23]:

# Cell 4: TfidfVectorizer features
# Each variant gets its own vectorizer, capped at 1000 features to avoid
# overly sparse matrices.

# Tokenized text: fit, transform and record the feature names.
tfidf_vec_tokenized = TfidfVectorizer(max_features=1000)
tfidf_tokenized = tfidf_vec_tokenized.fit_transform(tokenized_texts)
tokenized_tfidf_features = tfidf_vec_tokenized.get_feature_names_out()

# Lemmatized text: same steps.
tfidf_vec_lemmatized = TfidfVectorizer(max_features=1000)
tfidf_lemmatized = tfidf_vec_lemmatized.fit_transform(lemmatized_texts)
lemmatized_tfidf_features = tfidf_vec_lemmatized.get_feature_names_out()

# Report (documents, features) for both matrices.
print(f"TfidfVectorizer features shape (tokenized): {tfidf_tokenized.shape}")
print(f"TfidfVectorizer features shape (lemmatized): {tfidf_lemmatized.shape}")


TfidfVectorizer features shape (tokenized): (26709, 1000)
TfidfVectorizer features shape (lemmatized): (26709, 1000)


In [24]:

# Cell 5: Create function to extract top features
# Function to extract top features from each document
def get_top_features(matrix, feature_names, top_n=10):
    """Return the `top_n` highest-valued features of every row of `matrix`.

    Parameters
    ----------
    matrix : sparse matrix
        Rows are documents, columns are features. Each row must support
        `.toarray()` (e.g. the output of a scikit-learn vectorizer).
    feature_names : sequence
        Name of each column; indexed by column position.
    top_n : int, default 10
        How many features to keep per document. 0 yields empty lists.

    Returns
    -------
    list of list of tuple
        For each document, `(feature_name, value)` pairs ordered from highest
        to lowest value. A document with fewer than `top_n` non-zero features
        is padded with zero-valued features.

    Raises
    ------
    ValueError
        If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    top_features_per_doc = []

    for i in range(matrix.shape[0]):
        # Dense 1-D view of document i's feature values
        doc_features = matrix[i].toarray().flatten()
        # Reverse the ascending argsort first and truncate afterwards:
        # slicing with [-top_n:] would return *every* feature when top_n == 0.
        top_indices = doc_features.argsort()[::-1][:top_n]
        # Pair each selected column with its name and value
        top_features = [(feature_names[idx], doc_features[idx]) for idx in top_indices]
        top_features_per_doc.append(top_features)

    return top_features_per_doc

In [25]:

# Cell 6: Top-5 features per headline
# Count-based rankings, stored straight away as DataFrame columns.
tokenized_count_top = get_top_features(count_tokenized, tokenized_count_features, 5)
lemmatized_count_top = get_top_features(count_lemmatized, lemmatized_count_features, 5)
spacy_df['tokenized_count_top'] = tokenized_count_top
spacy_df['lemmatized_count_top'] = lemmatized_count_top

# TF-IDF-based rankings, handled the same way.
tokenized_tfidf_top = get_top_features(tfidf_tokenized, tokenized_tfidf_features, 5)
lemmatized_tfidf_top = get_top_features(tfidf_lemmatized, lemmatized_tfidf_features, 5)
spacy_df['tokenized_tfidf_top'] = tokenized_tfidf_top
spacy_df['lemmatized_tfidf_top'] = lemmatized_tfidf_top

In [26]:
# Cell 7: Dense feature vectors as DataFrame columns
# Only the first 100 columns of each sparse matrix are densified, to keep the
# DataFrame from growing too large.
# NOTE(review): "first 100" follows the vectorizer's column order, not
# feature importance - confirm this subset is the one you want.

# Count features
tokenized_count_dense = count_tokenized[:, :100].toarray()
spacy_df['tokenized_count_features'] = tokenized_count_dense.tolist()
lemmatized_count_dense = count_lemmatized[:, :100].toarray()
spacy_df['lemmatized_count_features'] = lemmatized_count_dense.tolist()

# TF-IDF features
tokenized_tfidf_dense = tfidf_tokenized[:, :100].toarray()
spacy_df['tokenized_tfidf_features'] = tokenized_tfidf_dense.tolist()
lemmatized_tfidf_dense = tfidf_lemmatized[:, :100].toarray()
spacy_df['lemmatized_tfidf_features'] = lemmatized_tfidf_dense.tolist()


In [27]:
# Cell 8: Show the top TF-IDF features of the first five headlines
for row_idx in range(5):
    headline_text = spacy_df['headline'].iloc[row_idx]
    top_tfidf = spacy_df['tokenized_tfidf_top'].iloc[row_idx]
    print(f"Headline: {headline_text}")
    print(f"Top TFIDF features (tokenized): {top_tfidf}")
    print("\n")


Headline: former versace store clerk sues over secret 'black code' for minority shoppers
Top TFIDF features (tokenized): [('store', 0.5083117545808206), ('secret', 0.4641108821678654), ('former', 0.4520097563458886), ('black', 0.3928886613364516), ('over', 0.34817736616156264)]


Headline: the 'roseanne' revival catches up to our thorny political mood, for better and worse
Top TFIDF features (tokenized): [('worse', 0.4915417386795098), ('political', 0.43993415731592855), ('better', 0.41459558156287946), ('our', 0.3925549641699866), ('up', 0.29638310190796713)]


Headline: mom starting to fear son's web series closest thing she will have to grandchild
Top TFIDF features (tokenized): [('starting', 0.4073715548705109), ('fear', 0.4061818705229094), ('series', 0.3899743530784772), ('thing', 0.344629484832939), ('mom', 0.32718931745424296)]


Headline: boehner just wants wife to listen, not come up with alternative debt-reduction ideas
Top TFIDF features (tokenized): [('ideas', 0.4591788451

In [29]:
# Cell 9: Save the updated dataframe
# Writes every column currently in spacy_df to a separate CSV file, leaving
# out the pandas index.
spacy_df.to_csv('data/embeddings_count_tfidf.csv', index=False) 