In [4]:
!pip install datasets
import datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [1]:
# Mount Google Drive at /content/drive so that later cells can read files
# stored under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pickle

# Path of the pickled dataset on the mounted Google Drive
file_path = '/content/drive/MyDrive/1_Data/ds_clean.pkl'

# Load the dataset.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load files that come from a source you trust.
with open(file_path, 'rb') as f:
    ds_clean = pickle.load(f)

# Sanity check: print the available splits, their columns, and row counts
print(ds_clean)

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})


Now we will use `TfidfVectorizer` to turn the documents into TF-IDF feature vectors.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Collect the raw document text of every split up front.
train_texts = ds_clean['train']['document']
val_texts = ds_clean['validation']['document']
test_texts = ds_clean['test']['document']

# Learn the vocabulary and IDF weights from the training documents only;
# the vocabulary is capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)

# Project validation and test documents with the already-fitted vectorizer
# (transform only, so nothing is re-learned from these splits).
X_val_tfidf = tfidf_vectorizer.transform(val_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

In [9]:
# Show the vocabulary learned by the fitted vectorizer
# (the printed array is abbreviated with "..." in the middle).
print(tfidf_vectorizer.get_feature_names_out())

['10' '100' '1000' ... 'zone' 'zoo' 'zuckerberg']


In [10]:
# Densify only the rows we actually want to look at. Converting the whole
# training matrix (44972 documents x up to 5000 features, float64 by default)
# would allocate roughly 1.8 GB just to print five rows.
# `.todense()` is kept (rather than `.toarray()`) so that `dense_matrix[0]`
# is still a 1 x n_features matrix row, as the inspection code below expects.
# NOTE: `dense_matrix` now holds just these 5 preview rows; densify other
# slices of `X_train_tfidf` on demand instead of the full matrix.
dense_matrix = X_train_tfidf[:5].todense()
print(dense_matrix)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.03918711 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [11]:
# Vocabulary terms, aligned with the columns of the TF-IDF matrix.
feature_names = tfidf_vectorizer.get_feature_names_out()
# First training document as a single matrix row (1 x n_features).
first_row = dense_matrix[0]
print("Tokenized Words and Their TF-IDF Scores:")
# `.tolist()` on the matrix row yields a nested list; its only element is the
# flat list of per-term scores for this document.
row_scores = first_row.tolist()[0]
# Pair every term with its score and keep only the terms with a positive weight.
weighted_terms = (
    (term, score) for term, score in zip(feature_names, row_scores) if score > 0
)
for term, score in weighted_terms:
    print(f"{term}: {score}")


Tokenized Words and Their TF-IDF Scores:
2009: 0.05269408097296444
according: 0.026422745823109262
added: 0.10249371727040586
after: 0.019789281083892194
again: 0.0364517024019076
all: 0.02085176860074119
am: 0.037602204729206744
americans: 0.0496715036670486
and: 0.050732117939452086
are: 0.03735913615107948
at: 0.05281387435089893
back: 0.027355235018514987
be: 0.05441881003582824
because: 0.04924173944560003
before: 0.024561000567680995
ben: 0.06747354724505507
best: 0.03834767757957508
but: 0.03654311324662927
by: 0.017880743859797505
chairman: 0.05583021997113871
color: 0.07078954435808603
comments: 0.04913504240593295
compared: 0.05542737573024871
consumer: 0.07103871224209025
continue: 0.039788521531848094
created: 0.046723928000837404
current: 0.045872169385360435
data: 0.04615488443721107
december: 0.051029781089627725
department: 0.03850783779676591
don: 0.03843901147618452
down: 0.028744608056027562
dropped: 0.05728814881555412
during: 0.02660743773917064
each: 0.03579311957

First, we will try a logistic regression model on our dataset.