# Natural Language Processing for NEWS Articles

## INDEX
* Basic Imports
* Data Preprocessing
* Named Entity Recognition(NER)
* Nouns Phrases
* Tags
* Sentiment
* Saving Data

### Basic Imports

In [61]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
from textblob import TextBlob
import os
import re

In [2]:
os.getcwd()

'd:\\Programming\\Research\\news_project_iit_guwahati\\NLP'

In [62]:
folder_path = './Data/Data/allglobal_spider'

file_list = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

combined_df = pd.DataFrame()

for file in file_list:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    combined_df = pd.concat([combined_df, df], ignore_index=True)

print(combined_df)

        ByClass                                        Date_Time  \
0      Politics          By Zhang Changyue  |  2022/12/2 0:43:14   
1      Politics                  By Xinhua  |  2022/12/4 0:22:02   
2      Politics              By Xu Keyue  |  2022/11/30 12:16:43   
3      Politics  By Xinhua - Global Times  |  2022/12/1 19:09:29   
4      Politics                 By Xinhua  |  2022/12/1 18:56:22   
...         ...                                              ...   
21276  Military                By Wang Qi  |  2023/3/13 19:53:10   
21277  Military            By Liu Xuanzun  |  2023/3/14 20:13:21   
21278  Military            By Liu Xuanzun  |  2023/3/13 20:07:50   
21279  Military            By Liu Xuanzun  |  2023/3/19 18:29:47   
21280   Economy              By Hu Weijia  |  2023/3/21 23:10:25   

                                                Headline  \
0      Tokyo urged to heed rational voices on China a...   
1           Chinese people continue to mourn Jiang Zemin   
2  

In [63]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21281 entries, 0 to 21280
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ByClass       21281 non-null  object
 1   Date_Time     21281 non-null  object
 2   Headline      21281 non-null  object
 3   Article_Lead  21281 non-null  object
 4   Pic           21281 non-null  object
 5   Article_Body  21281 non-null  object
dtypes: object(6)
memory usage: 997.7+ KB


In [64]:
combined_df.duplicated().sum()

19397

In [65]:
combined_df.drop_duplicates(inplace=True)

In [66]:
combined_df.dropna(inplace=True)

In [67]:
combined_df.duplicated().sum()

0

In [68]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1884 entries, 0 to 21280
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ByClass       1884 non-null   object
 1   Date_Time     1884 non-null   object
 2   Headline      1884 non-null   object
 3   Article_Lead  1884 non-null   object
 4   Pic           1884 non-null   object
 5   Article_Body  1884 non-null   object
dtypes: object(6)
memory usage: 103.0+ KB


In [69]:
NER = spacy.load("en_core_web_sm")

### Named Entity Recognition(NER)

In [70]:
df = combined_df

In [71]:
def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Tokenize
    doc = NER(text)
    # Remove stopwords and lemmatize
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    # Join tokens back into text
    text = ' '.join(tokens)
    return text

# Apply preprocessing to text data
df['Article_Body'] = df['Article_Body'].apply(preprocess)

# Define function to perform NER
def get_entities(text):
    doc = NER(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Apply NER to preprocessed text data
df['entities'] = df['Article_Body'].apply(get_entities)


In [50]:
df = pd.read_csv('processed_data_with_ner_mjs.csv')

In [74]:
type(df['entities'][0])

list

In [77]:
def extract_entities_by_type(entities, entity_type):
    return [ent[0] for ent in entities if ent[1] == entity_type]

entities_list = ['GPE','DATE','PERSON','NORP','FAC','ORG','LOC','PRODUCT','EVENT''WORK_OF_ART','LAW','LANGUAGE','TIME','PERCENT','MONEY','QUANTITY','ORDINAL','CARDINAL']

for entity in entities_list:
    df[f"{entity}"] = df['entities'].apply(lambda x: extract_entities_by_type(x, f"{entity}"))


In [78]:
df

Unnamed: 0,ByClass,Date_Time,Headline,Article_Lead,Pic,Article_Body,entities,GPE,DATE,PERSON,...,PRODUCT,EVENTWORK_OF_ART,LAW,LANGUAGE,TIME,PERCENT,MONEY,QUANTITY,ORDINAL,CARDINAL
0,Politics,By Zhang Changyue | 2022/12/2 0:43:14,Tokyo urged to heed rational voices on China a...,"Japan on Thursday expressed ""severe concerns"" ...",https://www.globaltimes.cn/Portals/0/attachmen...,japan thursday express severe concern chinar...,"[(japan, GPE), (thursday, DATE), (chinarussia,...","[japan, chinarussia, china, japan, china, japa...","[thursday, thursday, early month, thursday, ye...",[chao],...,[],[],[],[],[],[],[],[],[second],[]
1,Politics,By Xinhua | 2022/12/4 0:22:02,Chinese people continue to mourn Jiang Zemin,"For days, people throughout China are in mourn...",https://www.globaltimes.cn/Portals/0/attachmen...,day people china mourning leader jiang zemin...,"[(china, GPE), (jiang zemin, PERSON), (wednesd...","[china, shanghai, tianjin, tianjin, yanan, yan...","[wednesday, past year, january]","[jiang zemin, jiang deathwang, jiang zemin, fe...",...,[],[],[],[],"[age minute, kilometer]",[],[],[],[],[]
2,Politics,By Xu Keyue | 2022/11/30 12:16:43,Japan warned of being 'biggest destabilizing f...,"Considering Japan, a country with the pacifist...",https://www.globaltimes.cn/Portals/0/attachmen...,consider japan country pacifist constitution...,"[(japan, GPE), (chinese, NORP), (japanese, NOR...","[japan, taiwan, japan, japan, united states, j...","[monday, tuesday, year, year, recent year, tue...","[yasukazu hamada, meetingjapan long cap, joe b...",...,[],[],[],[],[],[],"[trillion yen, trillion yen]",[],[],[]
3,Politics,By Xinhua - Global Times | 2022/12/1 19:09:29,Jiang Zemin's remains transferred to Beijing,The remains of Comrade Jiang Zemin were transf...,https://www.globaltimes.cn/Portals/0/attachmen...,remain comrade jiang zemin transfer beijing ...,"[(jiang zemin, PERSON), (beijing, GPE), (shang...","[beijing, shanghai, jiang, shanghai, jiang, sh...","[thursday, wednesday, december, december, wedn...","[jiang zemin, li keqiang li zhanshu wang yang ...",...,[],[],[],[],"[noon, morning]",[],[],[],[],[half]
4,Politics,By Xinhua | 2022/12/1 18:56:22,Memorial meeting for Comrade Jiang Zemin to be...,A memorial meeting for Comrade Jiang Zemin wil...,https://www.globaltimes.cn/images/nopic.gif,memorial meeting comrade jiang zemin hold gr...,"[(jiang zemin hold great, PERSON), (beijing, G...",[beijing],[thursday],"[jiang zemin hold great, jiang zemins]",...,[],[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21197,Economy,By GT staff reporters | 2023/3/21 18:53:11,International capitals likely to seek 'safe ha...,The unfolding bank crisis in the US and Europe...,https://www.globaltimes.cn/Portals/0/attachmen...,unfold bank crisis europe highly controversi...,"[(europe, LOC), (tuesday, DATE), (asia, LOC), ...","[hong kong, singapore, singapore, switzerland,...","[tuesday, tuesday, friday, tuesday, monday, mo...",[],...,[],[],[],[],[],[],[billion swiss franc],[],[],[billion]
21198,Economy,By Yin Yeping | 2023/3/21 22:04:34,"Australian coal, timber trade with China resum...",Australian coal trade with China is set for a ...,https://www.globaltimes.cn/Portals/0/attachmen...,australian coal trade china set recovery exp...,"[(australian, NORP), (china, GPE), (february, ...","[china, australia, china, china, china, hong]","[february, half year, month, february, monday,...",[ton coke],...,[],[],[],[],[],[],[],[million ton],[],[]
21199,Economy,By Global Times | 2023/3/21 22:38:24,China's overnight funding rate soars to 2-year...,"China's overnight funding rate, a measure of m...",https://www.globaltimes.cn/Portals/0/attachmen...,china overnight funding rate measure market ...,"[(china, GPE), (overnight, TIME), (february, D...","[china, china, china]","[february, february, february, friday]","[zhou maohua, zhou saidchina, zhou saidzhou]",...,[],[],[],[],[overnight],[],[billion yuan],[],[],[]
21273,Military,By Liu Xuanzun | 2023/3/20 23:21:19,Peacefully intended China-Cambodia joint exerc...,The Golden Dragon-2023 joint military exercise...,https://www.globaltimes.cn/Portals/0/attachmen...,golden dragon joint military exercise china ...,"[(china, GPE), (cambodia, GPE), (monday, DATE)...","[china, cambodia, cambodia, cambodia, china, c...","[monday, sunday, monday, april, monday, monday]","[chen xiangmiao, warplanes chen]",...,[],[],[],[],[],[],[],[],[fifth],[]


In [27]:
df.to_csv('processed_data_with_ner_mjs.csv', index=False)

### Noun Phrases 

In [28]:
df["Noun Phrases"] = df["Article_Body"].apply(lambda x : TextBlob(str(x)).noun_phrases)

### Tags

In [29]:
df["Tags"] = df["Article_Body"].apply(lambda x : TextBlob(str(x)).tags)

### Sentiment

In [30]:
df["Sentiment"] = df["Article_Body"].apply(lambda x : TextBlob(str(x)).sentiment)

### Saving Data

In [17]:
data.head()

Unnamed: 0,ByClass,DateTime,Headline,Article_Lead,Date,Article_Body,NER,Noun Phrases,Tags,Sentiment
0,\t\t\t\tarmed forces armynews\t\t\t,2020-10-28 10:22,\t\t\t\t\t\tpla army debuts occupational quali...,recently the pla army launched its first occup...,2020-10-28 10:22:27,by li hao and han chengbeijing oct 28 recentl...,"(by, li, hao, and, han, chengbeijing, oct, 28,...","[li hao, pla army plaa, occupational qualifica...","[(by, IN), (li, JJ), (hao, NN), (and, CC), (ha...","(0.14375000000000002, 0.30069444444444443)"
1,\t\t\t\tarmed forces armynews\t\t\t,2020-11-18 19:24,\t\t\t\t\t\tarmy aviation brigade uses drones ...,a drone was used to guide a helicopter to cond...,2020-11-18 19:24:55,by gao junfeng and xiang xubeijing nov 18 a d...,"(by, gao, junfeng, and, xiang, xubeijing, nov,...","[gao junfeng, visual range bvr strike, livefir...","[(by, IN), (gao, JJ), (junfeng, NN), (and, CC)...","(-0.0021052631578947242, 0.44859649122807016)"
2,\t\t\t\tarmed forces armynews\t\t\t,2021-01-14 17:13,\t\t\t\t\t\tmilitary academy trains border def...,a total of 30 grassroots cadres and police off...,2021-01-14 17:13:40,by li qinghua and song peng urumqi jan 14 a t...,"(by, li, qinghua, and, song, peng, urumqi, jan...","[li qinghua, song peng urumqi jan, grassroots ...","[(by, IN), (li, JJ), (qinghua, NN), (and, CC),...","(0.14074074074074075, 0.46481481481481485)"
3,\t\t\t\tarmed forces armed police forcenews\...,2020-04-27 08:31,\t\t\t\t\t\tchinas top legislature reviews dra...,china on sunday drafted an amendment to the la...,2020-04-27 08:31:30,china on sunday drafted an amendment to the la...,"(china, on, sunday, drafted, an, amendment, to...","[police force, such things, emergency situatio...","[(china, NN), (on, IN), (sunday, NN), (drafted...","(0.0821127946127946, 0.28404882154882155)"
4,\t\t\t\tarmed forces armynews\t\t\t,2021-06-08 11:37,\t\t\t\t\t\tpla army academies release promoti...,recently the peoples liberation army pla relea...,2021-06-08 11:37:32,recently the peoples liberation army pla relea...,"(recently, the, peoples, liberation, army, pla...","[peoples liberation army pla, promotional vide...","[(recently, RB), (the, DT), (peoples, NNS), (l...","(0.04666666666666665, 0.23666666666666666)"


In [None]:
data.to_csv("processed_file_with_ner.csv")