In [1]:
import json
import pandas as pd

In [6]:


# Load the newline-delimited JSON dataset (one JSON object per line).
# The records are collected as plain dicts and normalised in a single call:
# building one DataFrame per line and concatenating all of them afterwards
# is far slower and produces the same table.
records = []
with open('News_Category_Dataset_v3.json', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a stray newline at the end of the
            # file); json.loads would raise on an empty string.
            continue
        records.append(json.loads(line))

# Flatten the JSON records into a tabular format using pandas
df = pd.json_normalize(records)

# Export the tabular data to a CSV file
df.to_csv('data.csv', index=False)


In [2]:
# Reload the exported CSV; from here on `data` names this DataFrame.
data=pd.read_csv("data.csv")
# Preview the first rows.
data.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",23-09-2022
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,23-09-2022
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,23-09-2022
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,23-09-2022
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,22-09-2022


In [3]:
# Double brackets select a one-column DataFrame rather than a Series.
description=data[["short_description"]]
description.head()

Unnamed: 0,short_description
0,Health experts said it is too early to predict...
1,He was subdued by passengers and crew when he ...
2,"""Until you have a dog you don't understand wha..."
3,"""Accidentally put grown-up toothpaste on my to..."
4,Amy Cooper accused investment firm Franklin Te...


In [8]:
import re

def clean_text(text):
    """Normalise a string for bag-of-words processing.

    The text is lowercased, digit runs are deleted, every character that is
    neither a word character nor whitespace is deleted (underscores are
    deleted too), and whitespace runs are collapsed to single spaces with no
    leading or trailing space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; it may be empty.
    """
    # Convert text to lowercase
    text = text.lower()

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation. `\w` also matches "_", so the underscore has to be
    # listed explicitly or it would survive this step.
    text = re.sub(r'[^\w\s]|_', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def preprocess_text(text):
    """Coerce a raw cell value to a string and clean it with ``clean_text``.

    Missing values (``None`` or a float NaN) are mapped to an empty string.
    Without this guard ``str()`` would turn a NaN into the literal word
    "nan", which would then be cleaned and passed on as if it were real text.

    Args:
        text: Any value; non-strings are converted with ``str()``.

    Returns:
        The cleaned string, or ``''`` for a missing value.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert input to string
    text = str(text)

    # Clean the text
    return clean_text(text)

In [9]:
# Store a cleaned copy of every short description in a new column; the raw column is left as it is.
data["short_description_clean"] = data["short_description"].apply(preprocess_text)

In [14]:
# Pull the cleaned column out as a standalone Series and preview ten rows.
short_description_clean=data["short_description_clean"]
short_description_clean.head(10)

0    health experts said it is too early to predict...
1    he was subdued by passengers and crew when he ...
2    until you have a dog you dont understand what ...
3    accidentally put grownup toothpaste on my todd...
4    amy cooper accused investment firm franklin te...
5    the yearold woman was seen working at the sout...
6    whos that behind you an anchor for new yorks p...
7    more than half a million people remained witho...
8    in mija director isabel castro combined music ...
9    white house officials say the crux of the pres...
Name: short_description_clean, dtype: object

In [18]:
# Tokenization setup: import the word tokenizer and fetch the "punkt" resource.
# NOTE(review): newer NLTK releases reportedly load "punkt_tab" instead of "punkt" — confirm against the installed version.
from nltk.tokenize import word_tokenize
import nltk
nltk.download("punkt")


[nltk_data] Downloading package punkt to /Users/mandali/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [19]:
# Tokenize every cleaned description; each row becomes a list of word tokens.
tokens=short_description_clean.apply(word_tokenize)

In [20]:
# Preview the first token lists.
tokens.head()

0    [health, experts, said, it, is, too, early, to...
1    [he, was, subdued, by, passengers, and, crew, ...
2    [until, you, have, a, dog, you, dont, understa...
3    [accidentally, put, grownup, toothpaste, on, m...
4    [amy, cooper, accused, investment, firm, frank...
Name: short_description_clean, dtype: object

In [22]:
# Fetch the NLTK stop-word corpus used by the stop-word removal step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mandali/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [26]:
from nltk.corpus import stopwords

# Build the English stop-word collection as a set, so the per-token
# membership test in remove_stop_words is a hash lookup.
stop_words = set(stopwords.words('english'))

def remove_stop_words(tokens, stop_words):
    """Return the tokens whose lowercase form is not a stop word.

    Args:
        tokens: Iterable of string tokens.
        stop_words: Container of lowercase stop words to drop.

    Returns:
        A new list holding the surviving tokens, in their original order and
        original casing.
    """
    kept = []
    for word in tokens:
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return kept


In [27]:
# Stop-word removal: rebuild the English stop-word set, then filter every token list.
# NOTE(review): the cleaning step strips apostrophes, so a contraction such as
# "dont" is presumably never matched by the stop-word list (it survives in the
# previewed output) — confirm whether that is intended.
stop_words = set(stopwords.words('english'))
short_desc_filtered = tokens.apply(remove_stop_words, args=(stop_words,))

In [28]:
# Preview ten filtered token lists.
short_desc_filtered.head(10)

0    [health, experts, said, early, predict, whethe...
1    [subdued, passengers, crew, fled, back, aircra...
2                [dog, dont, understand, could, eaten]
3    [accidentally, put, grownup, toothpaste, toddl...
4    [amy, cooper, accused, investment, firm, frank...
5    [yearold, woman, seen, working, south, carolin...
6    [whos, behind, anchor, new, yorks, pix, asked,...
7    [half, million, people, remained, without, wat...
8    [mija, director, isabel, castro, combined, mus...
9    [white, house, officials, say, crux, president...
Name: short_description_clean, dtype: object

In [30]:
# Fetch the WordNet corpus ahead of the lemmatization step.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/mandali/nltk_data...


True

In [32]:
# Fetch the "omw-1.4" package as well (presumably needed alongside WordNet — confirm for the installed NLTK version).
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/mandali/nltk_data...


True

In [33]:
from nltk.stem import WordNetLemmatizer

# One shared lemmatizer instance, reused for every row.
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Lemmatize each token with the module-level ``lemmatizer``.

    ``lemmatize`` is called with its default arguments for every token.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with one lemmatized entry per input token, in order.
    """
    return list(map(lemmatizer.lemmatize, tokens))

# Lemmatize every filtered token list; the result is a Series of lists
# sharing the index of short_desc_filtered.
short_desc_processed = short_desc_filtered.apply(lemmatize_tokens)


In [34]:
# Preview ten lemmatized token lists.
short_desc_processed.head(10)

0    [health, expert, said, early, predict, whether...
1    [subdued, passenger, crew, fled, back, aircraf...
2                [dog, dont, understand, could, eaten]
3    [accidentally, put, grownup, toothpaste, toddl...
4    [amy, cooper, accused, investment, firm, frank...
5    [yearold, woman, seen, working, south, carolin...
6    [who, behind, anchor, new, york, pix, asked, j...
7    [half, million, people, remained, without, wat...
8    [mija, director, isabel, castro, combined, mus...
9    [white, house, official, say, crux, president,...
Name: short_description_clean, dtype: object

In [36]:
# Add the lemmatized token lists to the main frame as a new column.
data["short_description_processed"] = short_desc_processed

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

# Create CountVectorizer object
vectorizer = CountVectorizer()

# Fit and transform the processed short descriptions.
# astype(str) turns each token list into its string form, which is what the
# vectorizer receives as the document text.
short_desc_vectors = vectorizer.fit_transform(data["short_description_processed"].astype(str))

# Keep the matrix sparse. The previous version called .todense() only to read
# the shape; for this corpus that means materialising a
# 209527 x 82124 array (roughly 17 billion cells), almost all zeros.
# The sparse matrix exposes the same .shape directly. If a dense view is ever
# needed, densify a small row slice (e.g. short_desc_vectors[:5].toarray()).
print("Shape of short description vectors:", short_desc_vectors.shape)


Shape of short description vectors: (209527, 82124)


In [5]:
import numpy as np

In [8]:
from bs4 import BeautifulSoup
import requests