You are part of a team developing a text classification system for a news aggregator 
platform. The platform aims to categorize news articles into different topics automatically. 
The dataset contains news articles along with their corresponding topics. Perform only the 
Feature extraction techniques.

In [None]:
Data Exploration

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('news.csv')

# Explore the first few rows to understand the data
print(df.head())

# Check the unique topics/categories
unique_topics = df['headline_text'].unique()
print("Unique Topics: ", unique_topics)

# Check the distribution of articles across topics
topic_distribution = df['headline_text'].value_counts()
print("Topic Distribution: \n", topic_distribution)

   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers
Unique Topics:  ['aba decides against community broadcasting licence'
 'act fire witnesses must be aware of defamation'
 'a g calls for infrastructure protection summit' ...
 'carnaby cockatoo revival program'
 'collier convinced colin barnett has overwhelming support'
 'cowboys offer indigenous kids north queensland hope for future']
Topic Distribution: 
 national rural news                                               808
abc sport                                                         718
abc weather                                                       714
abc business news and market 

In [None]:
Bag-of-Words(BOW)Model:

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
text=['aba decides against community broadcasting licence','act fire witnesses must be aware of defamation',
      'ambitious olsson wins triple jump']
#Initialize the vector
vectorizer=CountVectorizer()
#transform
x=vectorizer.fit_transform(text)
#get feature names(words)
feature_names=vectorizer.get_feature_names_out()
#Display BOW-Matrix
print('Feature names:\n',feature_names)
print()
print('BOW Matrix:\n')
print(x.toarray())

Feature names:
 ['aba' 'act' 'against' 'ambitious' 'aware' 'be' 'broadcasting' 'community'
 'decides' 'defamation' 'fire' 'jump' 'licence' 'must' 'of' 'olsson'
 'triple' 'wins' 'witnesses']

BOW Matrix:

[[1 0 1 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0]
 [0 1 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 1]
 [0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0]]


# TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
text=['aba decides against community broadcasting licence','act fire witnesses must be aware of defamation',
      'ambitious olsson wins triple jump']
 
tfidf=TfidfVectorizer()
x_tfidf=tfidf.fit_transform(text)
features_names_tfidf=tfidf.get_feature_names_out()
print('Feature Names(TF-IDF):\n',features_names_tfidf)
print()
print('TF-IDF Matrix:\n',x_tfidf.toarray())

Feature Names(TF-IDF):
 ['aba' 'act' 'against' 'ambitious' 'aware' 'be' 'broadcasting' 'community'
 'decides' 'defamation' 'fire' 'jump' 'licence' 'must' 'of' 'olsson'
 'triple' 'wins' 'witnesses']

TF-IDF Matrix:
 [[0.40824829 0.         0.40824829 0.         0.         0.
  0.40824829 0.40824829 0.40824829 0.         0.         0.
  0.40824829 0.         0.         0.         0.         0.
  0.        ]
 [0.         0.35355339 0.         0.         0.35355339 0.35355339
  0.         0.         0.         0.35355339 0.35355339 0.
  0.         0.35355339 0.35355339 0.         0.         0.
  0.35355339]
 [0.         0.         0.         0.4472136  0.         0.
  0.         0.         0.         0.         0.         0.4472136
  0.         0.         0.         0.4472136  0.4472136  0.4472136
  0.        ]]


In [7]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
text=input('Enter a statement:\n\t')
sent_token=sent_tokenize(text)
#print('Sentences:\n',sent_token)
for sent in sent_token:
 print(sent)
 print(len(sent))
 
print()
tfidf=TfidfVectorizer()
x_tfidf=tfidf.fit_transform(sent_token)
features_names_tfidf=tfidf.get_feature_names_out()
print('Feature Names(TF-IDF):\n',features_names_tfidf)
print()
print('TF-IDF Matrix:\n',x_tfidf.toarray())

Enter a statement:
	commonwealth bank cuts fixed home loan rates community urged to help homeless youth
commonwealth bank cuts fixed home loan rates community urged to help homeless youth
83

Feature Names(TF-IDF):
 ['bank' 'commonwealth' 'community' 'cuts' 'fixed' 'help' 'home' 'homeless'
 'loan' 'rates' 'to' 'urged' 'youth']

TF-IDF Matrix:
 [[0.2773501 0.2773501 0.2773501 0.2773501 0.2773501 0.2773501 0.2773501
  0.2773501 0.2773501 0.2773501 0.2773501 0.2773501 0.2773501]]


# N-grams:

In [12]:
import nltk
from nltk import ngrams
text='big hopes for launceston cycling championship'
tokens=nltk.word_tokenize(text)
#Generate bigrams
n=2
bigrams=list(ngrams(tokens,n))
print(f"Original text:{text}")
print()
print(f"Generated {n}-grams:")
for gram in bigrams:
 print(gram)


Original text:big hopes for launceston cycling championship

Generated 2-grams:
('big', 'hopes')
('hopes', 'for')
('for', 'launceston')
('launceston', 'cycling')
('cycling', 'championship')


In [3]:
import nltk
from nltk import ngrams
text='aba decides against community broadcasting licence'
tokens=nltk.word_tokenize(text)
#Generate n-grams
n=3
ngrams=list(ngrams(tokens,n))
print(f"Original text:{text}")
print()
print(f"Generated {n}-grams:")
for gram in ngrams:
 print(gram)

Original text:aba decides against community broadcasting licence

Generated 3-grams:
('aba', 'decides', 'against')
('decides', 'against', 'community')
('against', 'community', 'broadcasting')
('community', 'broadcasting', 'licence')


In [4]:
import nltk
from nltk import ngrams
text='big hopes for launceston cycling championship'
tokens=nltk.word_tokenize(text)
#Generate n-grams
n=4
ngrams=list(ngrams(tokens,n))
print(f"Original text:{text}")
print()
print(f"Generated {n}-grams:")
for gram in ngrams:
 print(gram)


Original text:big hopes for launceston cycling championship

Generated 4-grams:
('big', 'hopes', 'for', 'launceston')
('hopes', 'for', 'launceston', 'cycling')
('for', 'launceston', 'cycling', 'championship')


In [5]:
#generate n-grams with list comprehension
n=2
text='big hopes for launceston cycling championship'
tokens=nltk.word_tokenize(text)
ngrams=[tokens[i:i+n] for i in range(len(tokens)-n+1)]
print(f"Original text:{text}")
print()
print(f"Generated {n}-grams:")
for gram in ngrams:
 print(gram)


Original text:big hopes for launceston cycling championship

Generated 2-grams:
['big', 'hopes']
['hopes', 'for']
['for', 'launceston']
['launceston', 'cycling']
['cycling', 'championship']


# onehot encoding:

In [11]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
text=['aba decides against community broadcasting licence','act fire witnesses must be aware of defamation',
      'ambitious olsson wins triple jump']
#step 1- tokens
tokens=[word for sent in text for word in sent.lower().split()]
#step 2- vocabulary
vocabulary=list(set(tokens)) # unique words in the text
#initialize encoder
encoder=OneHotEncoder(categories=[vocabulary],sparse=False)
#Perform the one-hot encoding
one_hot_encoded=[]
for sent in text:
    sent_encoded=[]
    for word in sent.lower().split():
        word_index=vocabulary.index(word)
        word_vector=np.zeros(len(vocabulary))
        word_vector[word_index]=1
        sent_encoded.append(word_vector)
        one_hot_encoded.append(sent_encoded)
 
for sent in one_hot_encoded:
    print(sent)

[array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.])]
[array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
   

In [12]:
sent[0]

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.])

In [13]:
sent[1]


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0.])

In [14]:
len(sent)


5

In [15]:
type(sent)


list

# Word2vec:

In [16]:
!pip install gensim

Collecting FuzzyTM>=0.4.0 (from gensim)
  Downloading FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Downloading pyFUME-0.2.25-py3-none-any.whl (67 kB)
     ---------------------------------------- 0.0/67.1 kB ? eta -:--:--
     ---------------------------------------- 67.1/67.1 kB 3.8 MB/s eta 0:00:00
Collecting simpful (from pyfume->FuzzyTM>=0.4.0->gensim)
  Obtaining dependency information for simpful from https://files.pythonhosted.org/packages/8d/93/8448d3f1aa9d2911b8cba2602aaa1af85eb31a26d28b7b737f1fa5b40c02/simpful-2.11.1-py3-none-any.whl.metadata
  Downloading simpful-2.11.1-py3-none-any.whl.metadata (4.8 kB)
Collecting fst-pso (from pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting miniful (from fst-pso->pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading miniful-0.0.6.tar.gz (2.8 kB)
  Preparing metadata 

In [19]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [23]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
#sample text
text=['freedom records net profit for third successive',
 'funds allocated for domestic violence victims',
 'funds allocated for youth at risk']
tokenized_text=[word_tokenize(sentence.lower()) for sentence in text]
#Train Word2vec model
model=Word2Vec(sentences=tokenized_text,vector_size=20,window=5,min_count=1,workers=4)
#Find word vectors
vector_youth=model.wv['youth']
vector_risk=model.wv['risk']
#similarity b/w words
similarity=model.wv.similarity('funds','successive')
print(f"Vector for 'youth':{vector_youth}")
print('='*100)
print(f"Vector for 'risk':{vector_risk}")
print('='*100)
print(f"Similarity b/w 'word' and 'successive':{similarity}")

Vector for 'youth':[-0.04309844  0.01832869  0.02594942  0.02870969  0.03733459 -0.03083838
  0.00552807  0.03023641 -0.01420025 -0.03086761 -0.00205112 -0.04184474
 -0.02800006  0.03552269  0.0167627   0.03612835  0.03400124  0.03765371
 -0.01894577 -0.00280903]
Vector for 'risk':[-0.00788826  0.00160686 -0.02070315 -0.03841344 -0.00754004  0.01234897
 -0.00444013  0.02766831 -0.01371489  0.01130033  0.02727897  0.04172977
 -0.0072687  -0.04604071  0.02185276  0.00285892  0.03720954 -0.00406641
 -0.01319207 -0.04376505]
Similarity b/w 'word' and 'successive':0.02916492521762848


# Doc2Vec:


In [1]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
#sample text
documents=['freedom records net profit for third successive',
 'funds allocated for domestic violence victims',
 'funds allocated for youth at risk']
#Tokenize & tag documents
tagged_data=[TaggedDocument(words=word_tokenize(doc.lower()),
 tags=[str(i)]) for i,doc in enumerate(documents)]
print(tagged_data)

[TaggedDocument(words=['freedom', 'records', 'net', 'profit', 'for', 'third', 'successive'], tags=['0']), TaggedDocument(words=['funds', 'allocated', 'for', 'domestic', 'violence', 'victims'], tags=['1']), TaggedDocument(words=['funds', 'allocated', 'for', 'youth', 'at', 'risk'], tags=['2'])]


In [2]:
#Train Doc2vec model
model=Doc2Vec(vector_size=100,window=2,min_count=1,workers=5,epochs=20)
model.build_vocab(tagged_data)
model.train(tagged_data,total_examples=model.corpus_count,epochs=model.epochs)

In [3]:
vector_doc_1=model.infer_vector(word_tokenize("freedom records net profit for third successive"))

In [4]:
vector_doc_1

array([-4.9496763e-03,  3.1143485e-03,  2.7254957e-03, -1.7791928e-03,
        4.0919718e-04, -4.1887192e-03, -1.6309367e-03, -1.6367417e-03,
        3.1562506e-03, -1.8598560e-03,  3.1855521e-03, -4.3522506e-03,
       -1.1124995e-03, -1.0317671e-03,  5.0594512e-04, -2.1773055e-03,
       -3.8218382e-03,  1.7934815e-03, -2.1344903e-03, -2.7294210e-03,
       -5.0259515e-04,  3.1099967e-03, -2.4377510e-03, -2.2734059e-03,
       -9.1357324e-05,  2.9990466e-03, -3.7868416e-03, -1.6672349e-03,
        3.5287319e-03, -4.2792954e-03, -2.7764883e-04, -1.7303550e-03,
        2.6221753e-03, -1.8597767e-03, -8.8109425e-04, -4.2428751e-03,
        2.8178556e-04,  7.5170200e-04,  2.7657275e-03,  2.7228573e-03,
       -4.6466233e-04, -2.6687635e-03, -4.0061614e-03,  7.3689659e-04,
        2.5591089e-03,  4.0506478e-03, -2.2260456e-03,  3.3218581e-03,
        3.7739768e-03,  2.1049415e-03, -1.9437169e-03,  2.3372011e-06,
        9.5767906e-04, -1.5174360e-03, -7.9578266e-04, -2.5370012e-03,
      

In [5]:
#find the most similar document
similar_doc=model.docvecs.most_similar(positive=[vector_doc_1])
print(f"vector for 'freedom records net profit for third successive':{vector_doc_1}")
print()
print(f"Most similar document:{similar_doc}")

vector for 'freedom records net profit for third successive':[-4.9496763e-03  3.1143485e-03  2.7254957e-03 -1.7791928e-03
  4.0919718e-04 -4.1887192e-03 -1.6309367e-03 -1.6367417e-03
  3.1562506e-03 -1.8598560e-03  3.1855521e-03 -4.3522506e-03
 -1.1124995e-03 -1.0317671e-03  5.0594512e-04 -2.1773055e-03
 -3.8218382e-03  1.7934815e-03 -2.1344903e-03 -2.7294210e-03
 -5.0259515e-04  3.1099967e-03 -2.4377510e-03 -2.2734059e-03
 -9.1357324e-05  2.9990466e-03 -3.7868416e-03 -1.6672349e-03
  3.5287319e-03 -4.2792954e-03 -2.7764883e-04 -1.7303550e-03
  2.6221753e-03 -1.8597767e-03 -8.8109425e-04 -4.2428751e-03
  2.8178556e-04  7.5170200e-04  2.7657275e-03  2.7228573e-03
 -4.6466233e-04 -2.6687635e-03 -4.0061614e-03  7.3689659e-04
  2.5591089e-03  4.0506478e-03 -2.2260456e-03  3.3218581e-03
  3.7739768e-03  2.1049415e-03 -1.9437169e-03  2.3372011e-06
  9.5767906e-04 -1.5174360e-03 -7.9578266e-04 -2.5370012e-03
 -4.3664537e-03  1.2438516e-03 -9.1554981e-04  2.0929915e-03
  3.2901191e-03 -1.88682

  similar_doc=model.docvecs.most_similar(positive=[vector_doc_1])
