In [1]:
import json
import pandas as pd
import re

file_path = 'news.article.json'

# Read the raw bytes in one go and let the json module decode and parse them
with open(file_path, mode='rb') as file:
    articles = json.loads(file.read())

In [2]:
# Build a tabular view of the loaded article records
df = pd.DataFrame(data=articles)

# Preview the first seven rows to inspect the available columns
df.head(n=7)

Unnamed: 0,articleBody,dateModified,scrapedDate,source,title
0,"Sanjay Raut, a member of the Shiv Sena (UBT) p...",{'$date': '2023-10-25T06:35:50.000Z'},{'$date': '2023-10-27T13:12:18.339Z'},https://www.thehansindia.com/,Shiv Sena MP Sanjay Raut Responds To 'Hamas' R...
1,"Kozhikode (Kerala) [India], October 27 (ANI): ...",,{'$date': '2023-10-27T13:12:45.595Z'},https://www.aninews.in/,At IUML's pro-Palestine rally in Kerala Tharoo...
2,"Mumbai, Oct 24 (PTI) Maharashtra Chief Ministe...",{'$date': '2023-10-25T02:14:27.000Z'},{'$date': '2023-10-27T13:12:18.339Z'},https://thefederal.com/,Uddhav buried Bal Thackeray's 'Hindutva' for p...
3,"Sensex, Nifty rebound over 1 pc after six sess...",,{'$date': '2023-10-27T13:12:41.618Z'},https://english.varthabharati.in/,"New Bills replacing IPC, CrPC, Evidence Act wi..."
4,"October 26, 2023 08:15 pm | Updated 08:38 pm I...",{'$date': '2023-10-26T14:45:24.000Z'},{'$date': '2023-10-27T13:12:45.595Z'},https://www.thehindu.com/,"Israel biggest terrorist nation in the world, ..."
5,Eight former officers of the Indian Navy have ...,{'$date': '2023-10-26T11:22:00.000Z'},{'$date': '2023-10-27T13:12:47.852Z'},https://english.jagran.com/,Eight Ex-Indian Navy Officers Get Death Penalt...
6,"October 26, 2023 07:21 pm | Updated October 27...",{'$date': '2023-10-26T13:51:52.000Z'},{'$date': '2023-10-27T13:12:45.595Z'},https://www.thehindu.com/,Israel’s response to Hamas terrorist attack di...


In [3]:
# Discard the columns that are not needed further on.
# errors='ignore' keeps this cell re-runnable: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['dateModified', 'source'], errors='ignore')
df.shape

(37421, 3)

In [4]:
# Count the missing values in each column
df.isna().sum(axis=0)

articleBody    0
scrapedDate    0
title          0
dtype: int64

In [5]:
import nltk
import re

# Fetch NLTK's 'punkt' resource; per the log below, NLTK reports the package
# as already up to date when it is present locally.
# NOTE(review): no visible cell uses nltk afterwards — confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\BASHA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Peek at the first two rows of the trimmed frame
df.head(n=2)

Unnamed: 0,articleBody,scrapedDate,title
0,"Sanjay Raut, a member of the Shiv Sena (UBT) p...",{'$date': '2023-10-27T13:12:18.339Z'},Shiv Sena MP Sanjay Raut Responds To 'Hamas' R...
1,"Kozhikode (Kerala) [India], October 27 (ANI): ...",{'$date': '2023-10-27T13:12:45.595Z'},At IUML's pro-Palestine rally in Kerala Tharoo...


In [7]:
def clean_text(text):
    """Strip punctuation from ``text`` and normalise its whitespace.

    The substitutions run in order: first every character that is neither a
    word character nor whitespace is deleted, then each run of whitespace is
    squeezed into a single space. Leading and trailing whitespace is trimmed
    from the result.

    NOTE(review): punctuation is deleted rather than replaced by a space, so
    tokens joined by it are fused (e.g. "pro-Palestine" -> "proPalestine").

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # (pattern, replacement) pairs, applied top to bottom
    substitutions = (
        (r'[^\w\s]', ''),  # delete anything that is not a word char or whitespace
        (r'\s+', ' '),     # collapse whitespace runs into one space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
    
# Run the same cleaner over both free-text columns
for text_column in ('articleBody', 'title'):
    df[text_column] = df[text_column].apply(clean_text)

In [8]:
# Inspect the first two rows again now that the text columns are cleaned
df.head(n=2)

Unnamed: 0,articleBody,scrapedDate,title
0,Sanjay Raut a member of the Shiv Sena UBT part...,{'$date': '2023-10-27T13:12:18.339Z'},Shiv Sena MP Sanjay Raut Responds To Hamas Rem...
1,Kozhikode Kerala India October 27 ANI Pointing...,{'$date': '2023-10-27T13:12:45.595Z'},At IUMLs proPalestine rally in Kerala Tharoor ...


In [9]:
def extract_date(date_dict):
    """Convert a scraped date value into a pandas timestamp.

    Parameters
    ----------
    date_dict : dict or datetime-like
        Normally a one-key mapping of the form ``{'$date': '<date string>'}``,
        as found in the raw ``scrapedDate`` column. Anything that is not a
        dict (an already-converted timestamp, a plain date string, a missing
        value) is handed to ``pd.to_datetime`` as-is, so re-running the
        conversion on an already-converted column does not fail.

    Returns
    -------
    pandas.Timestamp
        The parsed timestamp. Missing inputs come back as a missing value
        (``NaT`` or ``None``) rather than raising.

    Raises
    ------
    KeyError
        If a dict is passed that has no ``'$date'`` key.
    """
    if isinstance(date_dict, dict):
        # Unwrap the {'$date': ...} envelope before parsing.
        return pd.to_datetime(date_dict['$date'])
    # Not wrapped: let pandas parse (or pass through) the value directly.
    return pd.to_datetime(date_dict)

# Parse every 'scrapedDate' entry, then write the parsed values back
scraped_timestamps = df['scrapedDate'].apply(extract_date)
df['scrapedDate'] = scraped_timestamps

In [10]:
# Check the first three rows after the date conversion
df.head(n=3)

Unnamed: 0,articleBody,scrapedDate,title
0,Sanjay Raut a member of the Shiv Sena UBT part...,2023-10-27 13:12:18.339000+00:00,Shiv Sena MP Sanjay Raut Responds To Hamas Rem...
1,Kozhikode Kerala India October 27 ANI Pointing...,2023-10-27 13:12:45.595000+00:00,At IUMLs proPalestine rally in Kerala Tharoor ...
2,Mumbai Oct 24 PTI Maharashtra Chief Minister E...,2023-10-27 13:12:18.339000+00:00,Uddhav buried Bal Thackerays Hindutva for powe...


In [13]:
# Map the original column names to their shorter replacements
new_column_names = dict(scrapedDate='date', articleBody='desc')
df = df.rename(new_column_names, axis='columns')

df.head(n=5)

Unnamed: 0,desc,date,title
0,Sanjay Raut a member of the Shiv Sena UBT part...,2023-10-27 13:12:18.339000+00:00,Shiv Sena MP Sanjay Raut Responds To Hamas Rem...
1,Kozhikode Kerala India October 27 ANI Pointing...,2023-10-27 13:12:45.595000+00:00,At IUMLs proPalestine rally in Kerala Tharoor ...
2,Mumbai Oct 24 PTI Maharashtra Chief Minister E...,2023-10-27 13:12:18.339000+00:00,Uddhav buried Bal Thackerays Hindutva for powe...
3,Sensex Nifty rebound over 1 pc after six sessi...,2023-10-27 13:12:41.618000+00:00,New Bills replacing IPC CrPC Evidence Act will...
4,October 26 2023 0815 pm Updated 0838 pm IST Ko...,2023-10-27 13:12:45.595000+00:00,Israel biggest terrorist nation in the world s...


In [14]:
# Show the whole frame; the rendered output below is abbreviated to the first and last rows.
df

Unnamed: 0,desc,date,title
0,Sanjay Raut a member of the Shiv Sena UBT part...,2023-10-27 13:12:18.339000+00:00,Shiv Sena MP Sanjay Raut Responds To Hamas Rem...
1,Kozhikode Kerala India October 27 ANI Pointing...,2023-10-27 13:12:45.595000+00:00,At IUMLs proPalestine rally in Kerala Tharoor ...
2,Mumbai Oct 24 PTI Maharashtra Chief Minister E...,2023-10-27 13:12:18.339000+00:00,Uddhav buried Bal Thackerays Hindutva for powe...
3,Sensex Nifty rebound over 1 pc after six sessi...,2023-10-27 13:12:41.618000+00:00,New Bills replacing IPC CrPC Evidence Act will...
4,October 26 2023 0815 pm Updated 0838 pm IST Ko...,2023-10-27 13:12:45.595000+00:00,Israel biggest terrorist nation in the world s...
...,...,...,...
37416,Lebanese media are reporting renewed IDF strik...,2024-03-31 03:10:17.646000+00:00,Lebanese media reports renewed IDF strikes in ...
37417,Amid escalating tensions and concerns over pot...,2024-03-31 03:09:38.331000+00:00,US approves additional bombs warplanes sales t...
37418,Haaretzcom the online English edition of Haare...,2024-03-31 03:09:46.683000+00:00,Israel Publishes Video of Islamic Jihad Terror...
37419,The UN secretary general António Guterres has ...,2024-03-31 03:09:50.586000+00:00,United Nations secretary general condemns expl...


In [15]:
import pickle

In [16]:
# Serialize the prepared DataFrame to disk through an explicit Pickler
with open('articles.pickle', mode='wb') as file:
    pickle.Pickler(file).dump(df)