# Accelerating Cleantech Advancements through NLP-Powered Text Mining and Knowledge Extraction
### Notebook 1: Data Cleaning and Preprocessing

Authors: Muhammed K. Ç., Karsanth P., Andrea V.


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import string
import re
import nltk
import spacy
from pathlib import Path
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

# String Operations

### Getting to Know the Data

In [None]:
# Read the dataset
# NOTE(review): the path is absolute and points at the filesystem root, so the
# CSV has to be placed there (or the path adjusted) before this cell can run.

file_path = Path("/cleantech_media_dataset_v1_20231109.csv")
df_x = pd.read_csv(file_path)

In [None]:
df = df_x.copy()

In [None]:
# Display basic information about the dataset
df.head()

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
0,1280,Qatar to Slash Emissions as LNG Expansion Adva...,2021-01-13,,"[""Qatar Petroleum ( QP) is targeting aggressiv...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
1,1281,India Launches Its First 700 MW PHWR,2021-01-15,,"[""• Nuclear Power Corp. of India Ltd. ( NPCIL)...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
2,1283,New Chapter for US-China Energy Trade,2021-01-20,,"[""New US President Joe Biden took office this ...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
3,1284,Japan: Slow Restarts Cast Doubt on 2030 Energy...,2021-01-22,,"[""The slow pace of Japanese reactor restarts c...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
4,1285,NYC Pension Funds to Divest Fossil Fuel Shares,2021-01-25,,"[""Two of New York City's largest pension funds...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...


In [None]:
# Display basic information about the dataset
df.shape

(9607, 7)

In [None]:
# Display basic information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9607 entries, 0 to 9606
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  9607 non-null   int64 
 1   title       9607 non-null   object
 2   date        9607 non-null   object
 3   author      31 non-null     object
 4   content     9607 non-null   object
 5   domain      9607 non-null   object
 6   url         9607 non-null   object
dtypes: int64(1), object(6)
memory usage: 525.5+ KB


In [None]:
# Count the missing values of every column once, then print one line per column
null_counts = df.isnull().sum()
for column_name, n_missing in null_counts.items():
    print(column_name, n_missing)

Unnamed: 0 0
title 0
date 0
author 9576
content 0
domain 0
url 0


In [None]:
# Drop unnecessary columns.
# The result is derived from the untouched raw frame (df_x) and no longer uses
# inplace=True, so re-running this cell yields the same three-column frame
# instead of raising KeyError because the columns are already gone.
df = df_x.drop(columns=["author", "url", "Unnamed: 0", "date"])

In [None]:
# Display information about the dataset after dropping columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9607 entries, 0 to 9606
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    9607 non-null   object
 1   content  9607 non-null   object
 2   domain   9607 non-null   object
dtypes: object(3)
memory usage: 225.3+ KB


In [None]:
# Explore the length distribution of the 'content' and 'title' columns
df["content"].str.len().describe()

count     9607.000000
mean      4905.994483
std       2833.833565
min       1188.000000
25%       2770.000000
50%       4228.000000
75%       6284.500000
max      19836.000000
Name: content, dtype: float64

Average 4905 chars long

In [None]:
df.title.str.len().describe()

count    9607.000000
mean       67.081503
std        20.741027
min        18.000000
25%        53.000000
50%        65.000000
75%        80.000000
max       200.000000
Name: title, dtype: float64

In [None]:
# Display the longest title.
# idxmax() returns the index *label* of the first longest title, so the lookup
# stays correct even when the frame does not have a default 0..n-1 index
# (np.argmax returns a position, which only coincides with the label by luck).
df.title.loc[df.title.str.len().idxmax()]

'ESB and Bord na Móna announce first solar farm to be developed as part of their Solar Co-Development Agreement – delivering up to 500MW of solar energy to the national grid – pv magazine International'

In [None]:
# Display the shortest title; idxmin() yields an index label, which is what
# .loc expects (np.argmin yields a position instead).
df.title.loc[df.title.str.len().idxmin()]

'A Bad Year for Gas'

In [None]:
# Display the longest article body; idxmax() yields an index label, which is
# what .loc expects (np.argmax yields a position instead).
df.content.loc[df.content.str.len().idxmax()]

'["By clicking `` Allow All \'\' you agree to the storing of cookies on your device to enhance site navigation, analyse site usage and support us in providing free open access scientific content. More info.", \'The electric vehicle battery market is growing rapidly as the demand for electric vehicles increases. According to some estimates, the global market for electric vehicle batteries is expected to reach $ 225.55 billion by 2027. This growth is driven by several factors, including increasing concerns about climate change, the falling cost of electric vehicle batteries, and government incentives and subsidies for electric vehicles.\', \'Electric vehicle batteries are a critical component of electric vehicles made from lithium-ion or other high-capacity cells. They provide the energy needed to propel the vehicle and perform other functions, such as powering the lights, the radio, and the heating and cooling system.\', \'Many companies produce electric vehicle batteries, including Tes

In [None]:
# Display the shortest article body; idxmin() yields an index label, which is
# what .loc expects (np.argmin yields a position instead).
df.content.loc[df.content.str.len().idxmin()]

"['Italy’ s Falck Renewables will sell its founding family’ s 60% holding in the company to an investment vehicle run by JPMorgan Chase & Co, in a deal valuing the company at nearly $ 3 billion.', 'Shares in Falck Renewables rose as much as 15% in Milan after adding more than 3% on Tuesday, when Bloomberg reported the family was weighing options for its stake.', 'Falck Renewables will sell the holding to JPMorgan’ s Infrastructure Investments Fund, according to a statement.', 'The price of 8.81 euros ( $ 10.25) per share, a premium of 29.2% to the 3-month volume weighted average share price, values the full company at just below $ 3 billion.', 'The agreement will trigger a mandatory cash tender offer at closing.', 'IIF will partner with Falck Renewables “ to accelerate its growth plan and reinforce its leadership position in the renewable energy sector, ” the Italian company said in the statement.', 'The Falcks are among Italy’ s oldest industrial dynasties, with a history in the steel

There are many irrelevant characters in the content column.

In [None]:
# Explore the presence of specific keywords related to crime in the 'content' column
df.content.str.contains("terror").sum()

24

In [None]:
df.content.str.contains("love").sum()

457

In [None]:
# Create a DataFrame to check for the presence of multiple crime-related keywords
crime_list = ["terror", "criminal", "police", "law", "drug", "bribery", "burglary", "homicide", "victim", "corruption"]

In [None]:
# One boolean column per keyword: True where the article content contains it.
# case=False makes the search case-insensitive. The previous positional
# re.IGNORECASE argument was bound to the `case` parameter (second positional
# parameter of str.contains), where its truthy value kept the search
# case-sensitive.
df_crime = pd.DataFrame(
    {crime: df.content.str.contains(crime, case=False) for crime in crime_list}
)

In [None]:
df_crime.head()

Unnamed: 0,terror,criminal,police,law,drug,bribery,burglary,homicide,victim,corruption
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False,False,False,False


In [None]:
# Display content with both 'terror' and 'law' keywords
selection = df_crime.query("terror & law")
len(selection)

10

In [None]:
selection.head()

Unnamed: 0,terror,criminal,police,law,drug,bribery,burglary,homicide,victim,corruption
1511,True,False,False,True,False,False,False,False,False,True
1582,True,False,False,True,False,False,False,False,False,False
1710,True,False,False,True,False,False,False,False,False,False
2058,True,False,False,True,False,False,False,False,False,False
2594,True,False,True,True,False,False,False,False,False,True


In [None]:
# Title and domain of the articles matched by the keyword selection,
# fetched with a single row/column .loc lookup
df.loc[selection.index, ["title", "domain"]]

Unnamed: 0,title,domain
1511,What Will Happen With Russia?,cleantechnica
1582,MidwayUSA's Record Solar Install Shows That Cl...,cleantechnica
1710,Unpacking The Iron Law of Projects: Black Swan...,cleantechnica
2058,European Countries Plan To Make The North Sea ...,cleantechnica
2594,The 10 best marathons in the Middle East,greenprophet
4782,Obscure Electric Truck Maker Named Mack Emerge...,cleantechnica
4851,Check Out This Ukraine Electric Vehicle Webinar,cleantechnica
4914,Corporate America Calls for Invoking 25th Amen...,naturalgasintel
5139,ConocoPhillips Evaluating ‘ Options’ After Wil...,naturalgasintel
5141,Federal Judge Voids OK for ConocoPhillips’ Wil...,naturalgasintel


# Preprocessing

### Punctuation

In [None]:
# Remove punctuation from 'title', 'content', and 'domain' columns.
# The translation table is built once (it was rebuilt for every row before)
# and applied through the vectorized string accessor.
punctuation_table = str.maketrans('', '', string.punctuation)
for col in ["title", "content", "domain"]:
    df[col] = df[col].str.translate(punctuation_table)

In [None]:
df.head()

Unnamed: 0,title,content,domain
0,Qatar to Slash Emissions as LNG Expansion Adva...,Qatar Petroleum QP is targeting aggressive cu...,energyintel
1,India Launches Its First 700 MW PHWR,• Nuclear Power Corp of India Ltd NPCIL synch...,energyintel
2,New Chapter for USChina Energy Trade,New US President Joe Biden took office this we...,energyintel
3,Japan Slow Restarts Cast Doubt on 2030 Energy ...,The slow pace of Japanese reactor restarts con...,energyintel
4,NYC Pension Funds to Divest Fossil Fuel Shares,Two of New York Citys largest pension funds sa...,energyintel


### Numbers Removal

In [None]:
# Remove numbers from 'title', 'content', and 'domain' columns.
# The translation table is built once (it was rebuilt for every row before)
# and applied through the vectorized string accessor.
digits_table = str.maketrans('', '', string.digits)
for col in ["title", "content", "domain"]:
    df[col] = df[col].str.translate(digits_table)

In [None]:
df.head()

Unnamed: 0,title,content,domain
0,Qatar to Slash Emissions as LNG Expansion Adva...,Qatar Petroleum QP is targeting aggressive cu...,energyintel
1,India Launches Its First MW PHWR,• Nuclear Power Corp of India Ltd NPCIL synch...,energyintel
2,New Chapter for USChina Energy Trade,New US President Joe Biden took office this we...,energyintel
3,Japan Slow Restarts Cast Doubt on Energy Plan,The slow pace of Japanese reactor restarts con...,energyintel
4,NYC Pension Funds to Divest Fossil Fuel Shares,Two of New York Citys largest pension funds sa...,energyintel


### Lowercasing

In [None]:
# Convert text to lowercase for 'title', 'content', and 'domain' columns.
# The columns are listed explicitly (instead of iterating df.columns) so the
# cell can still be re-run after list-valued token columns were added to df.
for col in ["title", "content", "domain"]:
    df[col] = df[col].str.lower()

In [None]:
df.head()

Unnamed: 0,title,content,domain
0,qatar to slash emissions as lng expansion adva...,qatar petroleum qp is targeting aggressive cu...,energyintel
1,india launches its first mw phwr,• nuclear power corp of india ltd npcil synch...,energyintel
2,new chapter for uschina energy trade,new us president joe biden took office this we...,energyintel
3,japan slow restarts cast doubt on energy plan,the slow pace of japanese reactor restarts con...,energyintel
4,nyc pension funds to divest fossil fuel shares,two of new york citys largest pension funds sa...,energyintel


### Removing weird chars and tabs

In [None]:
# Remove every character that is not an ASCII letter, a digit, or whitespace.
# The pattern is a raw string (a plain '\s' is an invalid escape sequence) and
# is handed to the vectorized accessor instead of being compiled per row.
# The columns are listed explicitly so the cell survives a re-run after
# list-valued token columns were added to df.
for col in ["title", "content", "domain"]:
    df[col] = df[col].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

In [None]:
df.head()

Unnamed: 0,title,content,domain
0,qatar to slash emissions as lng expansion adva...,qatar petroleum qp is targeting aggressive cu...,energyintel
1,india launches its first mw phwr,nuclear power corp of india ltd npcil synchr...,energyintel
2,new chapter for uschina energy trade,new us president joe biden took office this we...,energyintel
3,japan slow restarts cast doubt on energy plan,the slow pace of japanese reactor restarts con...,energyintel
4,nyc pension funds to divest fossil fuel shares,two of new york citys largest pension funds sa...,energyintel


In [None]:
# Collapse every run of whitespace (spaces, tabs, newlines) into one space and
# trim both ends. Stripping only the left side left internal runs of spaces in
# place, which later show up as separate ' ' entries in the token lists.
for col in ["title", "content", "domain"]:
    df[col] = df[col].str.replace(r"\s+", " ", regex=True).str.strip()


In [None]:
df.head()

Unnamed: 0,title,content,domain
0,qatar to slash emissions as lng expansion adva...,qatar petroleum qp is targeting aggressive cu...,energyintel
1,india launches its first mw phwr,nuclear power corp of india ltd npcil synchro...,energyintel
2,new chapter for uschina energy trade,new us president joe biden took office this we...,energyintel
3,japan slow restarts cast doubt on energy plan,the slow pace of japanese reactor restarts con...,energyintel
4,nyc pension funds to divest fossil fuel shares,two of new york citys largest pension funds sa...,energyintel


### Tokenize Words

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
def tokenize_words(text):
    """Split ``text`` into word tokens with the spaCy tokenizer.

    Only the tokenizer is run (``nlp.make_doc``); the remaining pipeline
    components are skipped because just the token texts are returned.
    Whitespace-only tokens are left out so that they do not travel on into
    the stopword, lemmatization and stemming steps as if they were words.

    Args:
        text: String to tokenize.

    Returns:
        List of token strings in document order.
    """
    doc = nlp.make_doc(text)
    return [token.text for token in doc if not token.is_space]


In [None]:
# Tokenize words using spaCy
%%time
df["token_title"] = df["title"].apply(lambda x: tokenize_words(x))

CPU times: user 43.3 s, sys: 119 ms, total: 43.5 s
Wall time: 43.7 s


In [None]:
%%time

df["token_words_content"] = df["content"].apply(lambda x: tokenize_words(x))

CPU times: user 12min 37s, sys: 24 s, total: 13min 1s
Wall time: 13min 5s


In [None]:
df["token_words_content"][1]

['nuclear',
 'power',
 'corp',
 'of',
 'india',
 'ltd',
 ' ',
 'npcil',
 'synchronized',
 'kakrapar',
 'in',
 'the',
 'western',
 'state',
 'of',
 'gujarat',
 'to',
 'the',
 'grid',
 'on',
 'jan',
 ' ',
 'making',
 'it',
 'the',
 'first',
 'of',
 'indias',
 ' ',
 'megawatt',
 'indigenously',
 'developed',
 'pressurized',
 'heavy',
 'water',
 'reactors',
 ' ',
 'phwrs',
 'to',
 'reach',
 'this',
 'milestone',
 ' ',
 'niw',
 'sep',
 'the',
 'news',
 'was',
 'tweeted',
 'by',
 'anil',
 'kakodkar',
 'former',
 'chairman',
 'of',
 'the',
 'department',
 'of',
 'atomic',
 'energy',
 'who',
 'said',
 'that',
 ' ',
 'more',
 'units',
 'of',
 'the',
 'same',
 'design',
 'will',
 'follow',
 'three',
 'of',
 'these',
 'are',
 'currently',
 'under',
 'construction',
 ' ',
 'another',
 'at',
 'kakrapar',
 'and',
 'two',
 'at',
 'npcils',
 'rajasthan',
 'plant',
 'they',
 'will',
 'be',
 'followed',
 'by',
 'two',
 'at',
 'the',
 'greenfield',
 'gorakhpur',
 'site',
 'in',
 'haryana',
 'and',
 'then

In [None]:
%%time

df["token_domain"] = df["domain"].apply(lambda x: tokenize_words(x))

CPU times: user 47.6 s, sys: 127 ms, total: 47.7 s
Wall time: 47.6 s


In [None]:
df.head()

Unnamed: 0,title,content,domain,token_title,token_words_content,token_domain
0,qatar to slash emissions as lng expansion adva...,qatar petroleum qp is targeting aggressive cu...,energyintel,"[qatar, to, slash, emissions, as, lng, expansi...","[qatar, petroleum, , qp, is, targeting, aggre...",[energyintel]
1,india launches its first mw phwr,nuclear power corp of india ltd npcil synchro...,energyintel,"[india, launches, its, first, , mw, phwr]","[nuclear, power, corp, of, india, ltd, , npci...",[energyintel]
2,new chapter for uschina energy trade,new us president joe biden took office this we...,energyintel,"[new, chapter, for, uschina, energy, trade]","[new, us, president, joe, biden, took, office,...",[energyintel]
3,japan slow restarts cast doubt on energy plan,the slow pace of japanese reactor restarts con...,energyintel,"[japan, slow, restarts, cast, doubt, on, , en...","[the, slow, pace, of, japanese, reactor, resta...",[energyintel]
4,nyc pension funds to divest fossil fuel shares,two of new york citys largest pension funds sa...,energyintel,"[nyc, pension, funds, to, divest, fossil, fuel...","[two, of, new, york, citys, largest, pension, ...",[energyintel]


### Tokenize Sentences

In [None]:
# Tokenize sentences using spaCy
def tokenize_sents(text):
    """Split ``text`` into sentences with the spaCy pipeline ``nlp``.

    Args:
        text: String to segment.

    Returns:
        List with the text of every sentence spaCy detects, in order.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(span.text)
    return sentences

# Sentence-tokenize the preprocessed article bodies
df["token_sents_content"] = df["content"].apply(tokenize_sents)

In [None]:
df.head()

Unnamed: 0,title,content,domain,token_title,token_words_content,token_domain,token_sents_content
0,qatar to slash emissions as lng expansion adva...,qatar petroleum qp is targeting aggressive cu...,energyintel,"[qatar, to, slash, emissions, as, lng, expansi...","[qatar, petroleum, , qp, is, targeting, aggre...",[energyintel],[qatar petroleum qp is targeting aggressive c...
1,india launches its first mw phwr,nuclear power corp of india ltd npcil synchro...,energyintel,"[india, launches, its, first, , mw, phwr]","[nuclear, power, corp, of, india, ltd, , npci...",[energyintel],[nuclear power corp of india ltd npcil synchr...
2,new chapter for uschina energy trade,new us president joe biden took office this we...,energyintel,"[new, chapter, for, uschina, energy, trade]","[new, us, president, joe, biden, took, office,...",[energyintel],[new us president joe biden took office this w...
3,japan slow restarts cast doubt on energy plan,the slow pace of japanese reactor restarts con...,energyintel,"[japan, slow, restarts, cast, doubt, on, , en...","[the, slow, pace, of, japanese, reactor, resta...",[energyintel],[the slow pace of japanese reactor restarts co...
4,nyc pension funds to divest fossil fuel shares,two of new york citys largest pension funds sa...,energyintel,"[nyc, pension, funds, to, divest, fossil, fuel...","[two, of, new, york, citys, largest, pension, ...",[energyintel],[two of new york citys largest pension funds s...


In [None]:
df.columns

Index(['title', 'content', 'domain', 'token_title', 'token_words_content',
       'token_domain', 'token_sents_content'],
      dtype='object')

### Stopwords

In [None]:
# Remove stopwords using spaCy
stop_words_spacy = nlp.Defaults.stop_words

In [None]:
def remove_stopwords(text):
    """Drop every entry of ``text`` that is a spaCy stopword.

    Args:
        text: Iterable of token strings (a token list, not a raw string).

    Returns:
        List of the entries that are not in ``stop_words_spacy``, in their
        original order.
    """
    kept = []
    for word in text:
        if word in stop_words_spacy:
            continue
        kept.append(word)
    return kept

In [None]:
%%time

df["token_title"] = df["token_title"].apply(lambda x: remove_stopwords(x))

CPU times: user 21.5 ms, sys: 28 µs, total: 21.6 ms
Wall time: 21.3 ms


In [None]:
%%time

df["token_domain"] = df["token_domain"].apply(lambda x: remove_stopwords(x))

CPU times: user 500 ms, sys: 2.13 ms, total: 502 ms
Wall time: 495 ms


In [None]:
%%time

df["token_words_content"] = df["token_words_content"].apply(lambda x: remove_stopwords(x))

CPU times: user 846 ms, sys: 16.9 ms, total: 863 ms
Wall time: 862 ms


In [None]:
%%time

df["token_sents_content"] = df["token_sents_content"].apply(lambda x: remove_stopwords(x))

CPU times: user 137 ms, sys: 2.05 ms, total: 139 ms
Wall time: 138 ms


### Tokenize Sentences without processing

We may need to preserve sentence meaning for future analysis, so here we tokenize sentences from the raw text without any preprocessing.

In [None]:
df_y = df_x.copy()

In [None]:
# Tokenize sentences without any processin
%%time

df["token_sents_content_without_process"] = df_y["content"].apply(lambda x: tokenize_sents(x))

CPU times: user 19min 59s, sys: 9 s, total: 20min 8s
Wall time: 20min 6s


#### Comparing sentence tokenization results with and without preprocessing

In [None]:
df["token_sents_content"][1]

['nuclear power corp of india ltd  npcil synchronized kakrapar in the western state of gujarat to the grid on jan  making it the first of indias  megawatt indigenously developed pressurized heavy water reactors  phwrs to reach this milestone  niw',
 'sep',
 'the news was tweeted by anil kakodkar former chairman of the department of atomic energy who said that  more units of the same design will follow three of these are currently under construction  another at kakrapar and two at npcils rajasthan plant they will be followed by two at the greenfield gorakhpur site in haryana and then a planned unit fleet at gorakhpur and three other sites kakrapar was five years past its  completion date achieving criticality in july   years after construction began commercial operations are slated to begin in march according to npcils website although that deadline will likely not be met indias nuclear suppliers should be feeling some relief over kakrapars startup although order flows will depend on ho

In [None]:
df["token_sents_content_without_process"][1]

['["• Nuclear Power Corp. of India Ltd. ( NPCIL) synchronized Kakrapar-3 in the western state of Gujarat to the grid on Jan. 10, making it the first of India\'s 700 megawatt indigenously developed pressurized heavy water reactors ( PHWRs) to reach this milestone ( NIW Sep.18\'20).',
 'The news was tweeted by Anil Kakodkar, former chairman of the Department of Atomic Energy, who said that 15 more units of the same design will follow.',
 "Three of these are currently under construction -- another at Kakrapar, and two at NPCIL's Rajasthan plant.",
 'They will be followed by two at the greenfield Gorakhpur site in Haryana, and then a planned 10-unit fleet at Gorakhpur and three other sites.',
 'Kakrapar-3 was five years past its 2015 completion date, achieving criticality in July 2020, 10 years after construction began.',
 "Commercial operations are slated to begin in March, according to NPCIL's website, although that deadline will likely not be met.",
 "India's nuclear suppliers should be

In [None]:
# Strip the '•' bullet character and surrounding whitespace from every
# sentence, and drop the sentences that are empty afterwards
df["token_sents_content_without_process"] = df["token_sents_content_without_process"].apply(
    lambda sentences: [
        cleaned
        for cleaned in (raw.replace('•', '').strip() for raw in sentences)
        if cleaned
    ]
)

In [None]:
# Chain of full copies: df -> df_t -> df_z -> df. After this cell all three
# names hold equal but independent frames.
# NOTE(review): nothing in the visible cells reads df_t or df_z afterwards;
# if no later cell needs them as checkpoints, these copies only cost memory.
df_t = df.copy()
df_z = df_t.copy()
df = df_z.copy()

In [None]:
df["token_sents_content_without_process"][1]

['[" Nuclear Power Corp. of India Ltd. ( NPCIL) synchronized Kakrapar-3 in the western state of Gujarat to the grid on Jan. 10, making it the first of India\'s 700 megawatt indigenously developed pressurized heavy water reactors ( PHWRs) to reach this milestone ( NIW Sep.18\'20).',
 'The news was tweeted by Anil Kakodkar, former chairman of the Department of Atomic Energy, who said that 15 more units of the same design will follow.',
 "Three of these are currently under construction -- another at Kakrapar, and two at NPCIL's Rajasthan plant.",
 'They will be followed by two at the greenfield Gorakhpur site in Haryana, and then a planned 10-unit fleet at Gorakhpur and three other sites.',
 'Kakrapar-3 was five years past its 2015 completion date, achieving criticality in July 2020, 10 years after construction began.',
 "Commercial operations are slated to begin in March, according to NPCIL's website, although that deadline will likely not be met.",
 "India's nuclear suppliers should be 

### Lemmatization

In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
def nltk_lemma(text):
    """Lemmatize every token with NLTK's ``WordNetLemmatizer``.

    No part-of-speech tag is passed to ``lemmatize``, so the lemmatizer's
    default is used for every token.

    Args:
        text: Iterable of token strings.

    Returns:
        List of lemmas, one per input token, in the same order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, text))

In [None]:
df.columns

Index(['title', 'content', 'domain', 'token_title', 'token_words_content',
       'token_domain', 'token_sents_content',
       'token_sents_content_without_process'],
      dtype='object')

In [None]:
# Lemmatization using NLTK
df["lemma_title"] = df["token_title"].apply(nltk_lemma)

In [None]:
df["lemma_content"] = df["token_words_content"].apply(nltk_lemma)

In [None]:
df.columns

Index(['title', 'content', 'domain', 'token_title', 'token_words_content',
       'token_domain', 'token_sents_content',
       'token_sents_content_without_process', 'lemma_title', 'lemma_content'],
      dtype='object')

In [None]:
df["lemma_content"][1]

['nuclear',
 'power',
 'corp',
 'india',
 'ltd',
 ' ',
 'npcil',
 'synchronized',
 'kakrapar',
 'western',
 'state',
 'gujarat',
 'grid',
 'jan',
 ' ',
 'making',
 'india',
 ' ',
 'megawatt',
 'indigenously',
 'developed',
 'pressurized',
 'heavy',
 'water',
 'reactor',
 ' ',
 'phwrs',
 'reach',
 'milestone',
 ' ',
 'niw',
 'sep',
 'news',
 'tweeted',
 'anil',
 'kakodkar',
 'chairman',
 'department',
 'atomic',
 'energy',
 'said',
 ' ',
 'unit',
 'design',
 'follow',
 'currently',
 'construction',
 ' ',
 'kakrapar',
 'npcils',
 'rajasthan',
 'plant',
 'followed',
 'greenfield',
 'gorakhpur',
 'site',
 'haryana',
 'planned',
 'unit',
 'fleet',
 'gorakhpur',
 'site',
 'kakrapar',
 'year',
 'past',
 ' ',
 'completion',
 'date',
 'achieving',
 'criticality',
 'july',
 '  ',
 'year',
 'construction',
 'began',
 'commercial',
 'operation',
 'slated',
 'begin',
 'march',
 'according',
 'npcils',
 'website',
 'deadline',
 'likely',
 'met',
 'india',
 'nuclear',
 'supplier',
 'feeling',
 'relie

In [None]:
df.head()

Unnamed: 0,title,content,domain,token_title,token_words_content,token_domain,token_sents_content,token_sents_content_without_process,lemma_title,lemma_content
0,qatar to slash emissions as lng expansion adva...,qatar petroleum qp is targeting aggressive cu...,energyintel,"[qatar, slash, emissions, lng, expansion, adva...","[qatar, petroleum, , qp, targeting, aggressiv...",[energyintel],[qatar petroleum qp is targeting aggressive c...,"[[""Qatar Petroleum ( QP) is targeting aggressi...","[qatar, slash, emission, lng, expansion, advance]","[qatar, petroleum, , qp, targeting, aggressiv..."
1,india launches its first mw phwr,nuclear power corp of india ltd npcil synchro...,energyintel,"[india, launches, , mw, phwr]","[nuclear, power, corp, india, ltd, , npcil, s...",[energyintel],[nuclear power corp of india ltd npcil synchr...,"[["" Nuclear Power Corp. of India Ltd. ( NPCIL)...","[india, launch, , mw, phwr]","[nuclear, power, corp, india, ltd, , npcil, s..."
2,new chapter for uschina energy trade,new us president joe biden took office this we...,energyintel,"[new, chapter, uschina, energy, trade]","[new, president, joe, biden, took, office, wee...",[energyintel],[new us president joe biden took office this w...,"[[""New US President Joe Biden took office this...","[new, chapter, uschina, energy, trade]","[new, president, joe, biden, took, office, wee..."
3,japan slow restarts cast doubt on energy plan,the slow pace of japanese reactor restarts con...,energyintel,"[japan, slow, restarts, cast, doubt, , energy...","[slow, pace, japanese, reactor, restarts, cont...",[energyintel],[the slow pace of japanese reactor restarts co...,"[[""The slow pace of Japanese reactor restarts ...","[japan, slow, restarts, cast, doubt, , energy...","[slow, pace, japanese, reactor, restarts, cont..."
4,nyc pension funds to divest fossil fuel shares,two of new york citys largest pension funds sa...,energyintel,"[nyc, pension, funds, divest, fossil, fuel, sh...","[new, york, citys, largest, pension, funds, di...",[energyintel],[two of new york citys largest pension funds s...,"[[""Two of New York City's largest pension fund...","[nyc, pension, fund, divest, fossil, fuel, share]","[new, york, city, largest, pension, fund, dive..."


In [None]:
def spacy_lemma(text):
    """Lemmatize a list of word tokens with the spaCy pipeline ``nlp``.

    The tokens are joined into one string and run through spaCy again, so the
    result follows spaCy's own tokenization of that string and is not
    guaranteed to line up one-to-one with the input list.

    Args:
        text: List of token strings (despite the name, not a single string).

    Returns:
        List of lemma strings; whitespace tokens are not included.
    """
    # Whitespace-only entries are skipped before joining: joined with " " they
    # become runs of spaces, which spaCy hands back as separate tokens.
    joined = " ".join(word for word in text if word.strip())
    return [token.lemma_ for token in nlp(joined) if not token.is_space]

In [None]:
# Lemmatization using spaCy
df["lemma_spacy_content"] = df["token_words_content"].apply(spacy_lemma)

In [None]:
df.head()

Unnamed: 0,title,content,domain,token_title,token_words_content,token_domain,token_sents_content,token_sents_content_without_process,lemma_title,lemma_content,lemma_spacy_content
0,qatar to slash emissions as lng expansion adva...,qatar petroleum qp is targeting aggressive cu...,energyintel,"[qatar, slash, emissions, lng, expansion, adva...","[qatar, petroleum, , qp, targeting, aggressiv...",[energyintel],[qatar petroleum qp is targeting aggressive c...,"[[""Qatar Petroleum ( QP) is targeting aggressi...","[qatar, slash, emission, lng, expansion, advance]","[qatar, petroleum, , qp, targeting, aggressiv...","[qatar, petroleum, , qp, target, aggressive,..."
1,india launches its first mw phwr,nuclear power corp of india ltd npcil synchro...,energyintel,"[india, launches, , mw, phwr]","[nuclear, power, corp, india, ltd, , npcil, s...",[energyintel],[nuclear power corp of india ltd npcil synchr...,"[["" Nuclear Power Corp. of India Ltd. ( NPCIL)...","[india, launch, , mw, phwr]","[nuclear, power, corp, india, ltd, , npcil, s...","[nuclear, power, corp, india, ltd, , npcil, ..."
2,new chapter for uschina energy trade,new us president joe biden took office this we...,energyintel,"[new, chapter, uschina, energy, trade]","[new, president, joe, biden, took, office, wee...",[energyintel],[new us president joe biden took office this w...,"[[""New US President Joe Biden took office this...","[new, chapter, uschina, energy, trade]","[new, president, joe, biden, took, office, wee...","[new, president, joe, biden, take, office, wee..."
3,japan slow restarts cast doubt on energy plan,the slow pace of japanese reactor restarts con...,energyintel,"[japan, slow, restarts, cast, doubt, , energy...","[slow, pace, japanese, reactor, restarts, cont...",[energyintel],[the slow pace of japanese reactor restarts co...,"[[""The slow pace of Japanese reactor restarts ...","[japan, slow, restarts, cast, doubt, , energy...","[slow, pace, japanese, reactor, restarts, cont...","[slow, pace, japanese, reactor, restart, conti..."
4,nyc pension funds to divest fossil fuel shares,two of new york citys largest pension funds sa...,energyintel,"[nyc, pension, funds, divest, fossil, fuel, sh...","[new, york, citys, largest, pension, funds, di...",[energyintel],[two of new york citys largest pension funds s...,"[[""Two of New York City's largest pension fund...","[nyc, pension, fund, divest, fossil, fuel, share]","[new, york, city, largest, pension, fund, dive...","[new, york, city, large, pension, fund, divest..."


In [None]:
df["lemma_spacy_content"][1]

['nuclear',
 'power',
 'corp',
 'india',
 'ltd',
 '  ',
 'npcil',
 'synchronize',
 'kakrapar',
 'western',
 'state',
 'gujarat',
 'grid',
 'jan',
 '  ',
 'make',
 'indias',
 '  ',
 'megawatt',
 'indigenously',
 'develop',
 'pressurize',
 'heavy',
 'water',
 'reactor',
 '  ',
 'phwrs',
 'reach',
 'milestone',
 '  ',
 'niw',
 'sep',
 'news',
 'tweet',
 'anil',
 'kakodkar',
 'chairman',
 'department',
 'atomic',
 'energy',
 'say',
 '  ',
 'unit',
 'design',
 'follow',
 'currently',
 'construction',
 '  ',
 'kakrapar',
 'npcil',
 'rajasthan',
 'plant',
 'follow',
 'greenfield',
 'gorakhpur',
 'site',
 'haryana',
 'plan',
 'unit',
 'fleet',
 'gorakhpur',
 'site',
 'kakrapar',
 'year',
 'past',
 '  ',
 'completion',
 'date',
 'achieve',
 'criticality',
 'july',
 '   ',
 'year',
 'construction',
 'begin',
 'commercial',
 'operation',
 'slate',
 'begin',
 'march',
 'accord',
 'npcil',
 'website',
 'deadline',
 'likely',
 'meet',
 'indias',
 'nuclear',
 'supplier',
 'feel',
 'relief',
 'kakrapa

### Stemming

In [None]:
sno = SnowballStemmer('english')

In [None]:
def stemmer(text):
    """Stem every token in ``text`` with the module-level Snowball stemmer ``sno``.

    Parameters
    ----------
    text : iterable
        Tokens to stem (the notebook passes each row's list of word tokens).
        Every element is handed to ``sno.stem`` individually; nothing is
        filtered out, so the result has one entry per input element.

    Returns
    -------
    list
        The stemmed tokens, in the same order as the input.
    """
    return list(map(sno.stem, text))

In [None]:
# Run the Snowball-based `stemmer` over each row's content tokens and
# keep the result in a new `stemm_content` column
df["stemm_content"] = df["token_words_content"].map(stemmer)

In [None]:
# Preview the first five rows, including the new `stemm_content` column
df.head(n=5)

Unnamed: 0,title,content,domain,token_title,token_words_content,token_domain,token_sents_content,token_sents_content_without_process,lemma_title,lemma_content,lemma_spacy_content,stemm_content
0,qatar to slash emissions as lng expansion adva...,qatar petroleum qp is targeting aggressive cu...,energyintel,"[qatar, slash, emissions, lng, expansion, adva...","[qatar, petroleum, , qp, targeting, aggressiv...",[energyintel],[qatar petroleum qp is targeting aggressive c...,"[[""Qatar Petroleum ( QP) is targeting aggressi...","[qatar, slash, emission, lng, expansion, advance]","[qatar, petroleum, , qp, targeting, aggressiv...","[qatar, petroleum, , qp, target, aggressive,...","[qatar, petroleum, , qp, target, aggress, cut..."
1,india launches its first mw phwr,nuclear power corp of india ltd npcil synchro...,energyintel,"[india, launches, , mw, phwr]","[nuclear, power, corp, india, ltd, , npcil, s...",[energyintel],[nuclear power corp of india ltd npcil synchr...,"[["" Nuclear Power Corp. of India Ltd. ( NPCIL)...","[india, launch, , mw, phwr]","[nuclear, power, corp, india, ltd, , npcil, s...","[nuclear, power, corp, india, ltd, , npcil, ...","[nuclear, power, corp, india, ltd, , npcil, s..."
2,new chapter for uschina energy trade,new us president joe biden took office this we...,energyintel,"[new, chapter, uschina, energy, trade]","[new, president, joe, biden, took, office, wee...",[energyintel],[new us president joe biden took office this w...,"[[""New US President Joe Biden took office this...","[new, chapter, uschina, energy, trade]","[new, president, joe, biden, took, office, wee...","[new, president, joe, biden, take, office, wee...","[new, presid, joe, biden, took, offic, week, u..."
3,japan slow restarts cast doubt on energy plan,the slow pace of japanese reactor restarts con...,energyintel,"[japan, slow, restarts, cast, doubt, , energy...","[slow, pace, japanese, reactor, restarts, cont...",[energyintel],[the slow pace of japanese reactor restarts co...,"[[""The slow pace of Japanese reactor restarts ...","[japan, slow, restarts, cast, doubt, , energy...","[slow, pace, japanese, reactor, restarts, cont...","[slow, pace, japanese, reactor, restart, conti...","[slow, pace, japanes, reactor, restart, contin..."
4,nyc pension funds to divest fossil fuel shares,two of new york citys largest pension funds sa...,energyintel,"[nyc, pension, funds, divest, fossil, fuel, sh...","[new, york, citys, largest, pension, funds, di...",[energyintel],[two of new york citys largest pension funds s...,"[[""Two of New York City's largest pension fund...","[nyc, pension, fund, divest, fossil, fuel, share]","[new, york, city, largest, pension, fund, dive...","[new, york, city, large, pension, fund, divest...","[new, york, citi, largest, pension, fund, dive..."


In [None]:
# Inspect the stemmed tokens stored for the row labelled 1
df.loc[1, "stemm_content"]

['nuclear',
 'power',
 'corp',
 'india',
 'ltd',
 ' ',
 'npcil',
 'synchron',
 'kakrapar',
 'western',
 'state',
 'gujarat',
 'grid',
 'jan',
 ' ',
 'make',
 'india',
 ' ',
 'megawatt',
 'indigen',
 'develop',
 'pressur',
 'heavi',
 'water',
 'reactor',
 ' ',
 'phwrs',
 'reach',
 'mileston',
 ' ',
 'niw',
 'sep',
 'news',
 'tweet',
 'anil',
 'kakodkar',
 'chairman',
 'depart',
 'atom',
 'energi',
 'said',
 ' ',
 'unit',
 'design',
 'follow',
 'current',
 'construct',
 ' ',
 'kakrapar',
 'npcil',
 'rajasthan',
 'plant',
 'follow',
 'greenfield',
 'gorakhpur',
 'site',
 'haryana',
 'plan',
 'unit',
 'fleet',
 'gorakhpur',
 'site',
 'kakrapar',
 'year',
 'past',
 ' ',
 'complet',
 'date',
 'achiev',
 'critic',
 'juli',
 '  ',
 'year',
 'construct',
 'began',
 'commerci',
 'oper',
 'slate',
 'begin',
 'march',
 'accord',
 'npcil',
 'websit',
 'deadlin',
 'like',
 'met',
 'india',
 'nuclear',
 'supplier',
 'feel',
 'relief',
 'kakrapar',
 'startup',
 'order',
 'flow',
 'depend',
 'quick',
 

In [None]:
# Show the first five rows of the frame with all derived columns
df.head(n=5)

Unnamed: 0,title,content,domain,token_title,token_words_content,token_domain,token_sents_content,token_sents_content_without_process,lemma_title,lemma_content,lemma_spacy_content,stemm_content
0,qatar to slash emissions as lng expansion adva...,qatar petroleum qp is targeting aggressive cu...,energyintel,"[qatar, slash, emissions, lng, expansion, adva...","[qatar, petroleum, , qp, targeting, aggressiv...",[energyintel],[qatar petroleum qp is targeting aggressive c...,"[[""Qatar Petroleum ( QP) is targeting aggressi...","[qatar, slash, emission, lng, expansion, advance]","[qatar, petroleum, , qp, targeting, aggressiv...","[qatar, petroleum, , qp, target, aggressive,...","[qatar, petroleum, , qp, target, aggress, cut..."
1,india launches its first mw phwr,nuclear power corp of india ltd npcil synchro...,energyintel,"[india, launches, , mw, phwr]","[nuclear, power, corp, india, ltd, , npcil, s...",[energyintel],[nuclear power corp of india ltd npcil synchr...,"[["" Nuclear Power Corp. of India Ltd. ( NPCIL)...","[india, launch, , mw, phwr]","[nuclear, power, corp, india, ltd, , npcil, s...","[nuclear, power, corp, india, ltd, , npcil, ...","[nuclear, power, corp, india, ltd, , npcil, s..."
2,new chapter for uschina energy trade,new us president joe biden took office this we...,energyintel,"[new, chapter, uschina, energy, trade]","[new, president, joe, biden, took, office, wee...",[energyintel],[new us president joe biden took office this w...,"[[""New US President Joe Biden took office this...","[new, chapter, uschina, energy, trade]","[new, president, joe, biden, took, office, wee...","[new, president, joe, biden, take, office, wee...","[new, presid, joe, biden, took, offic, week, u..."
3,japan slow restarts cast doubt on energy plan,the slow pace of japanese reactor restarts con...,energyintel,"[japan, slow, restarts, cast, doubt, , energy...","[slow, pace, japanese, reactor, restarts, cont...",[energyintel],[the slow pace of japanese reactor restarts co...,"[[""The slow pace of Japanese reactor restarts ...","[japan, slow, restarts, cast, doubt, , energy...","[slow, pace, japanese, reactor, restarts, cont...","[slow, pace, japanese, reactor, restart, conti...","[slow, pace, japanes, reactor, restart, contin..."
4,nyc pension funds to divest fossil fuel shares,two of new york citys largest pension funds sa...,energyintel,"[nyc, pension, funds, divest, fossil, fuel, sh...","[new, york, citys, largest, pension, funds, di...",[energyintel],[two of new york citys largest pension funds s...,"[[""Two of New York City's largest pension fund...","[nyc, pension, fund, divest, fossil, fuel, share]","[new, york, city, largest, pension, fund, dive...","[new, york, city, large, pension, fund, divest...","[new, york, citi, largest, pension, fund, dive..."
