In [2]:
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [3]:
data = pd.read_csv("./augmented_news_data.csv")
data.head()

Unnamed: 0,Source,Headline
0,BBC,"Israel has destroyed more than 1,500 buildings..."
1,BBC,"Israel has destroyed more than one, five hundr..."
2,BBC,Trump v the BBC: What are the hurdles for US p...
3,BBC,Trumpet v the BBC: What represent the hurdles ...
4,BBC,"Israel has destroyed more than 1,500 buildings..."


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Source    392 non-null    object
 1   Headline  392 non-null    object
dtypes: object(2)
memory usage: 6.2+ KB


In [5]:
data.duplicated()

0      False
1      False
2      False
3      False
4       True
       ...  
387    False
388    False
389    False
390    False
391    False
Length: 392, dtype: bool

In [6]:
data.duplicated().sum()

18

In [7]:
df_no_duplicates = data.drop_duplicates(keep='first')
print(df_no_duplicates)

    Source                                           Headline
0      BBC  Israel has destroyed more than 1,500 buildings...
1      BBC  Israel has destroyed more than one, five hundr...
2      BBC  Trump v the BBC: What are the hurdles for US p...
3      BBC  Trumpet v the BBC: What represent the hurdles ...
5      BBC  Israel let destroyed more than 1, 500 building...
..     ...                                                ...
387    CNN  Zillion of Ukrainians face dark and cold after...
388    CNN  Syrian migrants in Germany face uncertain futu...
389    CNN  Syrian migrants in Germany face incertain futu...
390    CNN  Qantas releases first images of jet that will ...
391    CNN  Qantas releases first images of jet that leave...

[374 rows x 2 columns]


In [8]:
df_no_duplicates['lower_headline'] = df_no_duplicates['Headline'].str.lower()
df_no_duplicates.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_duplicates['lower_headline'] = df_no_duplicates['Headline'].str.lower()


Unnamed: 0,Source,Headline,lower_headline
0,BBC,"Israel has destroyed more than 1,500 buildings...","israel has destroyed more than 1,500 buildings..."
1,BBC,"Israel has destroyed more than one, five hundr...","israel has destroyed more than one, five hundr..."
2,BBC,Trump v the BBC: What are the hurdles for US p...,trump v the bbc: what are the hurdles for us p...
3,BBC,Trumpet v the BBC: What represent the hurdles ...,trumpet v the bbc: what represent the hurdles ...
5,BBC,"Israel let destroyed more than 1, 500 building...","israel let destroyed more than 1, 500 building..."


In [9]:
def clean_text(text):
    if isinstance(text, str):
        cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
        return cleaned_text.strip()
    else:
        return ''

In [10]:
df_no_duplicates['cleaned_text'] = df_no_duplicates['lower_headline'].apply(clean_text)
df_no_duplicates.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_duplicates['cleaned_text'] = df_no_duplicates['lower_headline'].apply(clean_text)


Unnamed: 0,Source,Headline,lower_headline,cleaned_text
0,BBC,"Israel has destroyed more than 1,500 buildings...","israel has destroyed more than 1,500 buildings...",israel has destroyed more than buildings in g...
1,BBC,"Israel has destroyed more than one, five hundr...","israel has destroyed more than one, five hundr...",israel has destroyed more than one five hundre...
2,BBC,Trump v the BBC: What are the hurdles for US p...,trump v the bbc: what are the hurdles for us p...,trump v the bbc what are the hurdles for us pr...
3,BBC,Trumpet v the BBC: What represent the hurdles ...,trumpet v the bbc: what represent the hurdles ...,trumpet v the bbc what represent the hurdles f...
5,BBC,"Israel let destroyed more than 1, 500 building...","israel let destroyed more than 1, 500 building...",israel let destroyed more than buildings in ...


In [11]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saije\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saije\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [12]:
def remove_stopwords(text):
    if isinstance(text, str):
        words = word_tokenize(text)
        print(words)
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word.lower() not in stop_words]
        print(filtered_words)
        cleaned_text = ' '.join(filtered_words)
        print(cleaned_text)
        return cleaned_text.strip()
    else:
        return ''


In [13]:
df_no_duplicates.to_csv('cleaned_data.csv', index=False)

In [15]:
stop_words_removal = pd.read_csv('./cleaned_data.csv')
stop_words_removal.head()

Unnamed: 0,Source,Headline,lower_headline,cleaned_text
0,BBC,"Israel has destroyed more than 1,500 buildings...","israel has destroyed more than 1,500 buildings...",israel has destroyed more than buildings in g...
1,BBC,"Israel has destroyed more than one, five hundr...","israel has destroyed more than one, five hundr...",israel has destroyed more than one five hundre...
2,BBC,Trump v the BBC: What are the hurdles for US p...,trump v the bbc: what are the hurdles for us p...,trump v the bbc what are the hurdles for us pr...
3,BBC,Trumpet v the BBC: What represent the hurdles ...,trumpet v the bbc: what represent the hurdles ...,trumpet v the bbc what represent the hurdles f...
4,BBC,"Israel let destroyed more than 1, 500 building...","israel let destroyed more than 1, 500 building...",israel let destroyed more than buildings in ...


In [17]:
stop_words_removal['No_stopwords'] = stop_words_removal['cleaned_text'].apply(remove_stopwords)

['israel', 'has', 'destroyed', 'more', 'than', 'buildings', 'in', 'gaza', 'since', 'ceasefire']
['israel', 'destroyed', 'buildings', 'gaza', 'since', 'ceasefire']
israel destroyed buildings gaza since ceasefire
['israel', 'has', 'destroyed', 'more', 'than', 'one', 'five', 'hundred', 'construction', 'in', 'gaza', 'strip', 'since', 'ceasefire']
['israel', 'destroyed', 'one', 'five', 'hundred', 'construction', 'gaza', 'strip', 'since', 'ceasefire']
israel destroyed one five hundred construction gaza strip since ceasefire
['trump', 'v', 'the', 'bbc', 'what', 'are', 'the', 'hurdles', 'for', 'us', 'presidents', 'legal', 'argument']
['trump', 'v', 'bbc', 'hurdles', 'us', 'presidents', 'legal', 'argument']
trump v bbc hurdles us presidents legal argument
['trumpet', 'v', 'the', 'bbc', 'what', 'represent', 'the', 'hurdles', 'for', 'u', 'chief', 'executive', 's', 'legal', 'argument']
['trumpet', 'v', 'bbc', 'represent', 'hurdles', 'u', 'chief', 'executive', 'legal', 'argument']
trumpet v bbc rep