In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Sample corpus (replace this with your own text data)
c = { \
'Lincoln1865':
'With malice toward none, with charity for all ...' +
'let us strive on to finish the work we are in ... ' +
'to do all which may achieve and cherish a just and lasting peace, ' +
'among ourselves, and with all nations.',
'TrumpMay26':
'There is NO WAY (ZERO!) that Mail-In Ballots ' +
'will be anything less than substantially fraudulent.',
'Wikipedia':
'In 1998, Oregon became the first state in the US ' +
'to conduct all voting exclusively by mail.',
'FortuneMay26':
'Over the last two decades, about 0.00006% of total ' +
'vote-by-mail votes cast were fraudulent.',
'TheHillApr07':
'Trump voted by mail in the Florida primary.',
'KingJamesBible':
'Wherefore laying aside all malice, and all guile, and ' +
'hypocrisies, and envies, and all evil speakings',
}

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit and transform the text data
X = vectorizer.fit_transform(c)

# Create a DataFrame from the term-document matrix
tdm_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Print the DataFrame
print(tdm_df)


   fortunemay26  kingjamesbible  lincoln1865  thehillapr07  trumpmay26  \
0             0               0            1             0           0   
1             0               0            0             0           1   
2             0               0            0             0           0   
3             1               0            0             0           0   
4             0               0            0             1           0   
5             0               1            0             0           0   

   wikipedia  
0          0  
1          0  
2          1  
3          0  
4          0  
5          0  


In [23]:
import pandas as pd
from scipy.sparse import lil_matrix
d = {}
for j, dok in enumerate(c.keys()):
  tokens = [w.lemma_ for w in nlp(c[dok])
      if not w.is_stop and w.pos_ != 'PUNCT']
  for t in tokens:
    d[t] = d.setdefault(t, [])
    d[t] += [j]
A = lil_matrix((len(d.keys()), len(c.keys())), dtype=int)
for i, t in enumerate(d.keys()):
    for j in d[t]:
      A[i, j] = 1
Adf = pd.DataFrame(A.toarray(), index=d.keys(), columns=c.keys()); Adf

Unnamed: 0,Lincoln1865,TrumpMay26,Wikipedia,FortuneMay26,TheHillApr07,KingJamesBible
malice,1,0,0,0,0,1
charity,1,0,0,0,0,0
let,1,0,0,0,0,0
strive,1,0,0,0,0,0
finish,1,0,0,0,0,0
work,1,0,0,0,0,0
achieve,1,0,0,0,0,0
cherish,1,0,0,0,0,0
lasting,1,0,0,0,0,0
peace,1,0,0,0,0,0


In [17]:
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
def tokensfromdoc(doc):
  d = nlp(doc)
  matches = matcher(d)
  for match_id, start, end in matches:
    term = Span(d, start, end, label='myterms')
    d.ents = list(d.ents) + [term]
  tokens = [w.lemma_ for w in d
            # no pronouns
            if w.pos_ != 'PRON' \
            # no punctuations
            and w.pos_ != 'PUNCT' \
            # not Beginning of a named entity
            and w.ent_iob_ != 'B' \
            # not Inside a named entity
            and w.ent_iob_ != 'I' \
            # not a stop word
            and not w.is_stop]
  tokens += [de.text.rstrip().replace(' ', '_') for de in d.ents]

  return tokens
def dictokens(corpora):
  d = {}
  for j, dok in enumerate(corpora.keys()):
    for t in tokensfromdoc(corpora[dok]):
      d[t] = d.setdefault(t, [])
      d[t] += [j]
  return d
def tdmatrix(d, corpora):
  A = lil_matrix((len(d.keys()), len(corpora.keys())), dtype=int)
  for i, t in enumerate(d.keys()):
    for j in d[t]:
      A[i, j] = 1
  return A
d = dictokens(c)
A = tdmatrix(d, c)
Adf = pd.DataFrame(A.toarray(), index=d.keys(), columns=c.keys())

In [18]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Load the spaCy language model (make sure you have spaCy installed)
nlp = spacy.load("en_core_web_sm")

# Custom tokenizer function that uses spaCy for lemmatization
def custom_tokenizer(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]
    return tokens

# Create a CountVectorizer object with the custom tokenizer
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, stop_words='english')

# Fit and transform the text data
X = vectorizer.fit_transform(c)

# Create a DataFrame from the term-document matrix
tdm_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Print the DataFrame
print(tdm_df)




   fortunemay26  kingjamesbible  lincoln1865  thehillapr07  trumpmay26  \
0             0               0            1             0           0   
1             0               0            0             0           1   
2             0               0            0             0           0   
3             1               0            0             0           0   
4             0               0            0             1           0   
5             0               1            0             0           0   

   wikipedia  
0          0  
1          0  
2          1  
3          0  
4          0  
5          0  




In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of dimensions for LSA (set to 3 in this case)
num_dimensions = 3

# Perform LSA on the term-document matrix
lsa = TruncatedSVD(n_components=num_dimensions)
lsa_result = lsa.fit_transform(X)

# Get the document and word representations
document_representations = lsa_result[:len(c)]
word_representations = lsa_result[len(c):]

# Print the LSA vector representation of the word "vote"
word_index = vectorizer.get_feature_names_out().index("fortunemay26")
lsa_vote_representation = word_representations[word_index]
print("LSA vector representation of 'vote':", lsa_vote_representation)


In [None]:
# Find the index of the word "vote" in the feature names
word_to_find = "fortunemay26"
feature_names = vectorizer.get_feature_names_out()
word_index = next((index for index, word in enumerate(feature_names) if word == word_to_find), -1)

# Check if the word was found and get its LSA representation
if word_index != -1:
    lsa_vote_representation = word_representations[word_index]
    print("LSA vector representation of 'vote':", lsa_vote_representation)
else:
    print("Word 'vote' not found in feature names.")
