To get the results for the spaCy-preprocessed words, replace every `_nltk` in this notebook with `_spacy`.

In [72]:
import pandas as pd
import numpy as np
import spacy

In [73]:
# Load the preprocessed headlines and keep only the columns used in this notebook.
# The explicit .copy() makes spacy_df an independent frame rather than a slice of
# the loaded one, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning.
spacy_df = pd.read_csv('data/features_nltk.csv')
spacy_df = spacy_df[['headline', 'headline_cleaned', 'tokenized_text_nltk', 'lemmatized_text_nltk']].copy()
spacy_df.head()

Unnamed: 0,headline,headline_cleaned,tokenized_text_nltk,lemmatized_text_nltk
0,former versace store clerk sues over secret 'b...,former versace store clerk sues over secret bl...,"['former', 'versace', 'store', 'clerk', 'sues'...","['former', 'versace', 'store', 'clerk', 'sue',..."
1,the 'roseanne' revival catches up to our thorn...,the roseanne revival catches up to our thorny ...,"['the', 'roseanne', 'revival', 'catches', 'up'...","['roseanne', 'revival', 'catch', 'thorny', 'po..."
2,mom starting to fear son's web series closest ...,mom starting to fear sons web series closest t...,"['mom', 'starting', 'to', 'fear', 'sons', 'web...","['mom', 'starting', 'fear', 'son', 'web', 'ser..."
3,"boehner just wants wife to listen, not come up...",boehner just wants wife to listen not come up ...,"['boehner', 'just', 'wants', 'wife', 'to', 'li...","['boehner', 'just', 'want', 'wife', 'listen', ..."
4,j.k. rowling wishes snape happy birthday in th...,jk rowling wishes snape happy birthday in the ...,"['jk', 'rowling', 'wishes', 'snape', 'happy', ...","['jk', 'rowling', 'wish', 'snape', 'happy', 'b..."


## Word2Vec

In [74]:
from gensim.models import Word2Vec

In [75]:
def _parse_token_list(cell):
    """Turn a stringified token list such as "['a', 'b']" into ['a', 'b'].

    The CSV stores each token list as its string representation. Brackets,
    single quotes and commas are stripped and the remainder is split on
    whitespace. A cell that is already a list is returned unchanged, so this
    notebook cell can be re-run without raising AttributeError.
    """
    if isinstance(cell, list):
        return cell
    for char in "[]',":
        cell = cell.replace(char, '')
    return cell.split()


spacy_df['tokenized_text_nltk'] = spacy_df['tokenized_text_nltk'].apply(_parse_token_list)
spacy_df['lemmatized_text_nltk'] = spacy_df['lemmatized_text_nltk'].apply(_parse_token_list)

In [76]:
# Pull the token lists out of the frame: one list of words per headline.
tokenized_sentences = spacy_df['tokenized_text_nltk'].to_list()
lemmatized_sentences = spacy_df['lemmatized_text_nltk'].to_list()

# Quick look at the first headline's tokens.
print(tokenized_sentences[0])


# Fit one Word2Vec model per preprocessing variant.
# min_count=1 keeps every word in the vocabulary; vectors have 100 dimensions.
w2v_tokenized = Word2Vec(sentences=tokenized_sentences, vector_size=100, min_count=1)
w2v_lemmatized = Word2Vec(sentences=lemmatized_sentences, vector_size=100, min_count=1)

['former', 'versace', 'store', 'clerk', 'sues', 'over', 'secret', 'black', 'code', 'for', 'minority', 'shoppers']


In [77]:
# Print the type of the first tokenized sentence to confirm it is a list, not a string.
print(type(tokenized_sentences[0]))

<class 'list'>


In [78]:
def get_word2vec_embeddings(model, sentence):
    """Look up the Word2Vec vector of every word in a sentence.

    Parameters
    ----------
    model : object exposing ``model.wv[word]`` lookups (e.g. a trained
        gensim ``Word2Vec`` model). A missing word must raise ``KeyError``.
    sentence : str or iterable of str
        Either a whitespace-separated string or an already tokenized
        sequence of words.

    Returns
    -------
    list
        One entry per word, in order: the model's vector for known words,
        or a list of zeros for words the model does not know.
    """
    # Accept both raw strings and pre-tokenized sentences.
    words = sentence.split() if isinstance(sentence, str) else list(sentence)

    # Size the fallback vector from the model instead of assuming 100 dimensions;
    # 100 remains the default when the vector store does not expose vector_size.
    dim = getattr(model.wv, 'vector_size', 100)

    embeddings = []
    for word in words:
        try:
            embeddings.append(model.wv[word])
        except KeyError:
            embeddings.append([0] * dim)  # Default vector if word not found
    return embeddings


In [79]:
def _embed_each_token(tokens, model):
    """Call get_word2vec_embeddings once per token and collect the results.

    NOTE(review): each token is passed on its own, so every entry is the list
    returned for a single word and the column ends up nested three levels deep
    (headline -> token -> list -> vector). Confirm downstream code relies on
    this shape before flattening it.
    """
    return [get_word2vec_embeddings(model, token) for token in tokens]


# Embed every word of every headline, once per preprocessing variant.
spacy_df['tokenized_word2vec'] = spacy_df['tokenized_text_nltk'].apply(_embed_each_token, args=(w2v_tokenized,))
spacy_df['lemmatized_word2vec'] = spacy_df['lemmatized_text_nltk'].apply(_embed_each_token, args=(w2v_lemmatized,))

In [80]:
spacy_df.head()

Unnamed: 0,headline,headline_cleaned,tokenized_text_nltk,lemmatized_text_nltk,tokenized_word2vec,lemmatized_word2vec
0,former versace store clerk sues over secret 'b...,former versace store clerk sues over secret bl...,"[former, versace, store, clerk, sues, over, se...","[former, versace, store, clerk, sue, secret, b...","[[[-0.4229426, 0.55270875, 0.19162792, 0.13248...","[[[-0.31757945, 0.4251927, 0.18504636, 0.11165..."
1,the 'roseanne' revival catches up to our thorn...,the roseanne revival catches up to our thorny ...,"[the, roseanne, revival, catches, up, to, our,...","[roseanne, revival, catch, thorny, political, ...","[[[-0.52555096, 0.723542, 0.41971695, 0.204297...","[[[-0.02185243, 0.012011358, 0.005177774, 0.00..."
2,mom starting to fear son's web series closest ...,mom starting to fear sons web series closest t...,"[mom, starting, to, fear, sons, web, series, c...","[mom, starting, fear, son, web, series, closes...","[[[-0.44208562, 0.54864866, 0.16279277, 0.1702...","[[[-0.46234107, 0.64025134, 0.25795487, 0.2200..."
3,"boehner just wants wife to listen, not come up...",boehner just wants wife to listen not come up ...,"[boehner, just, wants, wife, to, listen, not, ...","[boehner, just, want, wife, listen, not, come,...","[[[-0.10652897, 0.12577043, 0.057611495, 0.029...","[[[-0.0803288, 0.09589496, 0.03889328, 0.03054..."
4,j.k. rowling wishes snape happy birthday in th...,jk rowling wishes snape happy birthday in the ...,"[jk, rowling, wishes, snape, happy, birthday, ...","[jk, rowling, wish, snape, happy, birthday, mo...","[[[-0.032667488, 0.035985168, 0.01892513, 0.01...","[[[-0.017622098, 0.023935243, 0.010494243, 0.0..."


In [81]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.matutils import corpus2dense
import numpy as np

In [82]:
# Build one gensim Dictionary (vocabulary) per preprocessing variant
tokenized_dict = Dictionary(documents=tokenized_sentences)
lemmatized_dict = Dictionary(documents=lemmatized_sentences)

In [83]:
# Encode every headline as a bag-of-words using the matching dictionary
tokenized_corpus = list(map(tokenized_dict.doc2bow, tokenized_sentences))
lemmatized_corpus = list(map(lemmatized_dict.doc2bow, lemmatized_sentences))

In [84]:
# Train one 10-topic LDA model per preprocessing variant.
# random_state pins the seed so that a fresh Restart & Run All trains the same
# topics; without it the model is seeded randomly on every run.
lda_tokenized = LdaModel(tokenized_corpus, num_topics=10, id2word=tokenized_dict, random_state=42)
lda_lemmatized = LdaModel(lemmatized_corpus, num_topics=10, id2word=lemmatized_dict, random_state=42)

In [85]:
# Ask each LDA model for the per-document topic distribution of its own corpus
tokenized_topics = lda_tokenized.get_document_topics(bow=tokenized_corpus)
lemmatized_topics = lda_lemmatized.get_document_topics(bow=lemmatized_corpus)

In [86]:
# Show the first headline's topic distribution as (topic_id, probability) pairs.
tokenized_topics[0]

[(3, 0.6944189), (9, 0.23896416)]

In [87]:
def get_dense_topics(topics, num_topics=10):
    """Convert sparse topic distributions into fixed-length dense vectors.

    Parameters
    ----------
    topics : iterable
        One entry per document; each entry is an iterable of
        ``(topic_id, probability)`` pairs.
    num_topics : int, default 10
        Length of every output vector. It must match the number of topics of
        the model that produced ``topics``; a ``topic_id`` outside
        ``range(num_topics)`` raises ``IndexError``.

    Returns
    -------
    list of list
        One vector per document, where position ``topic_id`` holds that
        topic's probability and topics absent from the input stay 0.
    """
    dense_topics = []
    for doc_topics in topics:
        topic_vector = [0] * num_topics
        for topic_id, prob in doc_topics:
            topic_vector[topic_id] = prob
        dense_topics.append(topic_vector)
    return dense_topics

In [88]:
# Expand the sparse (topic_id, probability) pairs into fixed-length vectors
tokenized_dense_topics = get_dense_topics(topics=tokenized_topics)
lemmatized_dense_topics = get_dense_topics(topics=lemmatized_topics)

# Quick look at the first headline's dense vector
print(tokenized_dense_topics[0])

# Store one dense topic vector per row in the DataFrame
spacy_df['Tokenized_lda_topics'], spacy_df['Lemmatized_lda_topics'] = (
    tokenized_dense_topics,
    lemmatized_dense_topics,
)


[0, 0, 0, 0.6943857, 0, 0, 0, 0, 0, 0.23899733]


In [89]:
# Write the frame, including the Word2Vec and LDA columns, to disk.
# index=False leaves the row index out of the file.
spacy_df.to_csv('data/embeddings_nltk.csv', index=False)

Please do not push this CSV file to GitHub. The files are large and take 5-6 minutes to save locally as CSV.