In [1]:
import numpy as np
import pandas as pd

data=pd.read_csv('abcnews-date-text.csv')
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


# Bag-of-Words (BoW)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Bag-of-Words (BoW) using CountVectorizer
vectorizer = CountVectorizer()
x_bow = vectorizer.fit_transform(data['headline_text'])
feature_names_bow = vectorizer.get_feature_names_out()

print("="*125)
print("\nFeature Names (BoW):\n", feature_names_bow)
print("="*125)
print("Bag-of-Words Matrix:\n", x_bow.toarray())


Feature Names (BoW):
 ['000' '03' '0388' ... 'zurich' 'zvonareva' 'zyl']
Bag-of-Words Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# TF-IDF

In [3]:
# # TF-IDF using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
x_tfidf = tfidf_vectorizer.fit_transform(data['headline_text'])
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()

print("="*125)
print("\nFeature Names (TF-IDF):\n", feature_names_tfidf)
print("="*125)
print("\nTF-IDF Matrix:\n", x_tfidf.toarray())


Feature Names (TF-IDF):
 ['000' '03' '0388' ... 'zurich' 'zvonareva' 'zyl']

TF-IDF Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# N-grams:

In [4]:
import nltk
from nltk import ngrams
from nltk.tokenize import sent_tokenize

# N-grams
n = 1
text = data['headline_text'].iloc[0]  
tokens = nltk.word_tokenize(text)
unigrams = list(ngrams(tokens, n))

print("="*125)
print(f"\nGenerated {n}-grams:\n", unigrams)



Generated 1-grams:
 [('aba',), ('decides',), ('against',), ('community',), ('broadcasting',), ('licence',)]


In [5]:

# N-grams
n = 2
text = data['headline_text'].iloc[0]  
tokens = nltk.word_tokenize(text)
bigrams = list(ngrams(tokens, n))

print("="*125)
print(f"\nGenerated {n}-grams:\n", bigrams)



Generated 2-grams:
 [('aba', 'decides'), ('decides', 'against'), ('against', 'community'), ('community', 'broadcasting'), ('broadcasting', 'licence')]


In [6]:
# N-grams
n = 3
text = data['headline_text'].iloc[0]  
tokens = nltk.word_tokenize(text)
trigrams = list(ngrams(tokens, n))

print("="*125)
print(f"\nGenerated {n}-grams:\n", trigrams)



Generated 3-grams:
 [('aba', 'decides', 'against'), ('decides', 'against', 'community'), ('against', 'community', 'broadcasting'), ('community', 'broadcasting', 'licence')]


# One-Hot Encoding

In [7]:
from sklearn.preprocessing import OneHotEncoder

# One-Hot Encoding
text_for_onehot = ['search for missing angler called off', 'two badly burned in ballarat car explosion',
                   'govts closer to ship sinking agreement', 'howard meets megawati at apec']
tokens_onehot = [word for sent in text_for_onehot for word in sent.lower().split()]
vocabulary_onehot = list(set(tokens_onehot))

encoder = OneHotEncoder(categories=[vocabulary_onehot], sparse=False)
one_hot_encoded = []

for sent in text_for_onehot:
    sent_encoded = []
    for word in sent.lower().split():
        word_index = vocabulary_onehot.index(word)
        word_vector = np.zeros(len(vocabulary_onehot))
        word_vector[word_index] = 1
        sent_encoded.append(word_vector)
    one_hot_encoded.append(sent_encoded)
    
    
print("="*125)
print("\nOne-Hot Encoded Matrix:")
for sent in one_hot_encoded:
    print(sent)



One-Hot Encoded Matrix:
[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.]), array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0.])]
[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0.]), array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
  

# Word2Vec

In [8]:
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument

# Tokenize the headlines
tokenized_headlines = [word_tokenize(sentence.lower()) for sentence in data['headline_text']]

# Word2Vec
word2vec_model = Word2Vec(sentences=tokenized_headlines, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.save("word2vec_model")

# Example usage of Word2Vec model
vector_harrased = word2vec_model.wv['harassed']
vector_ambulance = word2vec_model.wv['ambulance']
similarity = word2vec_model.wv.similarity('chemical', 'arrested')

print("="*125)
print(f"Vector for 'harrased':\n\n\t {vector_harrased}\n\n")
print('=' * 125)
print(f"Vector for 'ambulance':\n\n\t {vector_ambulance}\n\n")
print('=' * 125)
print(f"Similarity between 'Chemical' and 'arrested':\t{similarity}\n")


Vector for 'harrased':

	 [-1.25479409e-02  1.43873235e-02  6.83394819e-03 -8.43549613e-03
 -2.88833049e-04 -8.27173237e-03 -4.06600484e-05  2.17473470e-02
 -3.28645017e-03 -2.42309156e-03  1.51883741e-03 -1.73710417e-02
  2.55823089e-03  4.57047764e-03  4.97877086e-03 -4.88871150e-03
  9.53605678e-03 -9.14497126e-04 -6.80655008e-03 -1.51521424e-02
 -2.78057624e-03 -9.81673412e-03 -5.70116378e-03 -4.29448020e-03
 -1.26733258e-03  8.96376371e-03 -7.74835935e-03 -7.73743680e-03
  1.01984502e-03  1.05377240e-02  1.23139226e-03  5.66085288e-03
 -2.35751766e-04 -3.42668639e-03 -8.23798683e-03  4.21335874e-03
  5.64261898e-03 -2.94797472e-03 -1.57702044e-02 -1.60490796e-02
  4.23522986e-04 -9.47563630e-03 -1.47837764e-02 -3.59171862e-03
 -9.47497902e-04 -3.27237276e-03  7.07197294e-04 -6.35047443e-03
 -2.07854202e-03  1.34857814e-03  1.13158906e-02 -9.88053903e-03
 -1.76063110e-03  9.77788214e-03 -4.16142540e-03 -1.83997001e-03
  5.19989012e-03 -1.24334581e-02 -9.29252710e-03  3.54603818e-03

# Doc2Vec

In [9]:
from gensim.models import Doc2Vec

#sample text
documents=['death toll continues to climb in south korean subway',
           'funds to go to cadell upgrade',
           'man charged over cooma murder']

#Tokenize & tag documents
tagged_data=[TaggedDocument(words=word_tokenize(doc.lower()),
                            tags=[str(i)]) for i,doc in enumerate(documents)]
print('=' * 125)
print("Tagged data:\n\n",tagged_data)
print('=' * 125)
#Train Doc2vec model
model=Doc2Vec(vector_size=100,window=2,min_count=1,workers=5,epochs=20)
model.build_vocab(tagged_data)
model.train(tagged_data,total_examples=model.corpus_count,epochs=model.epochs)


vector_doc_1=model.infer_vector(word_tokenize("man charged over cooma murder"))
                                              
#find the most similar document
similar_doc=model.dv.most_similar(positive=[vector_doc_1])
print('=' * 125)
print(f"vector for 'man charged over cooma murder':\n\n{vector_doc_1}")
print('=' * 125)
print(f"Most similar document:\n\n{similar_doc}")


Tagged data:

 [TaggedDocument(words=['death', 'toll', 'continues', 'to', 'climb', 'in', 'south', 'korean', 'subway'], tags=['0']), TaggedDocument(words=['funds', 'to', 'go', 'to', 'cadell', 'upgrade'], tags=['1']), TaggedDocument(words=['man', 'charged', 'over', 'cooma', 'murder'], tags=['2'])]
vector for 'man charged over cooma murder':

[ 3.02760839e-03  3.62603250e-03  4.74082446e-03 -7.72489177e-04
 -7.22767552e-04  4.34913952e-03  1.84735365e-03  2.54158303e-03
 -3.45497672e-03 -2.46411818e-03  1.21556615e-04  3.97946872e-03
  3.33664147e-03  6.42365660e-04 -2.89805001e-03  3.55820230e-04
  4.91429633e-03  4.82639298e-03  1.42807909e-03 -2.81690550e-03
  3.78943840e-03  1.54533668e-03  4.84450208e-03  2.68553663e-03
  3.80434655e-03  7.92035658e-04 -6.26541616e-04 -2.73808814e-03
 -9.25975735e-04 -3.97102861e-03  7.59969000e-04  1.60412851e-03
  3.47616011e-03  4.06145072e-03 -2.20453949e-03 -9.78054362e-04
  4.05522529e-03 -3.31065315e-03  1.36917655e-03  4.17039124e-03
  9.2224