In [1]:
import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim

In [2]:
# load the raw news dataset and preview its first rows
df = pd.read_csv(filepath_or_buffer='../datasets/news_dataset.csv')
df.head(n=5)

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased
1,89ce50166b,CBC.ca,"The Sunday Magazine for June 23, 2024 - CBC.ca",https://www.cbc.ca,This week on The Sunday Magazine with host Piy...,images/89ce50166b.jpeg,Likely to be Bias,Likely to be Unbiased
2,8b320e107e,CBC News,"Conservative MP shares inaccurate, ChatGPT-gen...",https://www.cbc.ca,An Ontario Conservative MP's use of ChatGPT to...,images/8b320e107e.jpeg,Likely to be Bias,Likely to be Bias
3,7536f87654,CBC.ca,"Women's sports are more popular than ever, but...",https://www.cbc.ca,"When it comes to the study of sports, a man's ...",images/7536f87654.jpeg,Likely to be Unbiased,Likely to be Unbiased
4,c829d1f9a8,CBC.ca,June 22: Listener Question Show - CBC.ca,https://www.cbc.ca,"To wrap up the 48th season of Quirks & Quarks,...",images/c829d1f9a8.jpeg,Likely to be Bias,Likely to be Unbiased


In [3]:
# discard every row that has a missing value in any column,
# then report the remaining (rows, columns)
df = df.dropna(axis=0, how='any')
df.shape

(40945, 8)

In [4]:
# normalise the article text: lowercase it and strip every digit character.
# NOTE: digits are removed without replacement, so "48th" becomes "th".
def _lowercase_without_digits(text):
    """Return `text` lowercased with all digit characters (per str.isdigit) removed."""
    return ''.join(ch for ch in text.lower() if not ch.isdigit())


# Series.map builds the whole column in one pass instead of writing it cell
# by cell through iterrows()/df.at; it also creates the column when df is empty.
df["content_cleaned"] = df["article_text"].map(_lowercase_without_digits)

df.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...
1,89ce50166b,CBC.ca,"The Sunday Magazine for June 23, 2024 - CBC.ca",https://www.cbc.ca,This week on The Sunday Magazine with host Piy...,images/89ce50166b.jpeg,Likely to be Bias,Likely to be Unbiased,this week on the sunday magazine with host piy...
2,8b320e107e,CBC News,"Conservative MP shares inaccurate, ChatGPT-gen...",https://www.cbc.ca,An Ontario Conservative MP's use of ChatGPT to...,images/8b320e107e.jpeg,Likely to be Bias,Likely to be Bias,an ontario conservative mp's use of chatgpt to...
3,7536f87654,CBC.ca,"Women's sports are more popular than ever, but...",https://www.cbc.ca,"When it comes to the study of sports, a man's ...",images/7536f87654.jpeg,Likely to be Unbiased,Likely to be Unbiased,"when it comes to the study of sports, a man's ..."
4,c829d1f9a8,CBC.ca,June 22: Listener Question Show - CBC.ca,https://www.cbc.ca,"To wrap up the 48th season of Quirks & Quarks,...",images/c829d1f9a8.jpeg,Likely to be Bias,Likely to be Unbiased,"to wrap up the th season of quirks & quarks, w..."


In [5]:
# sentence tokenization
# build a separate frame from df and attach, per article, the list of
# sentences NLTK finds in the cleaned content
df_content = df.assign(
    content_sentence=df["content_cleaned"].map(nltk.sent_tokenize)
)
df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,content_sentence
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,[tij iginla doesn't shy away from his famous l...
1,89ce50166b,CBC.ca,"The Sunday Magazine for June 23, 2024 - CBC.ca",https://www.cbc.ca,This week on The Sunday Magazine with host Piy...,images/89ce50166b.jpeg,Likely to be Bias,Likely to be Unbiased,this week on the sunday magazine with host piy...,[this week on the sunday magazine with host pi...
2,8b320e107e,CBC News,"Conservative MP shares inaccurate, ChatGPT-gen...",https://www.cbc.ca,An Ontario Conservative MP's use of ChatGPT to...,images/8b320e107e.jpeg,Likely to be Bias,Likely to be Bias,an ontario conservative mp's use of chatgpt to...,[an ontario conservative mp's use of chatgpt t...
3,7536f87654,CBC.ca,"Women's sports are more popular than ever, but...",https://www.cbc.ca,"When it comes to the study of sports, a man's ...",images/7536f87654.jpeg,Likely to be Unbiased,Likely to be Unbiased,"when it comes to the study of sports, a man's ...","[when it comes to the study of sports, a man's..."
4,c829d1f9a8,CBC.ca,June 22: Listener Question Show - CBC.ca,https://www.cbc.ca,"To wrap up the 48th season of Quirks & Quarks,...",images/c829d1f9a8.jpeg,Likely to be Bias,Likely to be Unbiased,"to wrap up the th season of quirks & quarks, w...","[to wrap up the th season of quirks & quarks, ..."


In [6]:
# Number of consecutive sentences merged into one chunk
SENTENCES_PER_CHUNK = 5


def chunk_sentences(sentences, size=SENTENCES_PER_CHUNK):
    """Join consecutive sentences into chunks of at most `size` sentences.

    Args:
        sentences: list of sentence strings for one article.
        size: maximum number of sentences per chunk.

    Returns:
        List of strings, each the space-joined text of one chunk; empty when
        `sentences` is empty.
    """
    return [' '.join(sentences[start:start + size])
            for start in range(0, len(sentences), size)]


# One list of chunks per article, then one output row per chunk. explode()
# repeats the article's other columns for every chunk, which replaces copying
# a full row Series per chunk inside an iterrows() loop.
sentence_chunks = df_content["content_sentence"].map(chunk_sentences)

# explode() would turn an empty list into a NaN row; articles without any
# sentence produced no rows before, so they are filtered out first.
has_chunks = sentence_chunks.map(len) > 0

df_sentence = (
    df_content.assign(content_sentence=sentence_chunks)
    .loc[has_chunks.to_numpy()]
    # ignore_index=True yields a fresh 0..n-1 index
    .explode("content_sentence", ignore_index=True)
)

# Show result
df_sentence.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,content_sentence
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,tij iginla doesn't shy away from his famous la...
1,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""i mean, i'd be thrilled to go anywhere. but i..."
2,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"the flames currently hold the ninth pick, and ..."
3,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,max plante is the son of former nhl player der...
4,89ce50166b,CBC.ca,"The Sunday Magazine for June 23, 2024 - CBC.ca",https://www.cbc.ca,This week on The Sunday Magazine with host Piy...,images/89ce50166b.jpeg,Likely to be Bias,Likely to be Unbiased,this week on the sunday magazine with host piy...,this week on the sunday magazine with host piy...


In [7]:
# sanity check: (rows, columns) of the chunked frame
df_sentence.shape

(181747, 10)

In [8]:
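# NOTE: disabled alternative, kept for reference only -- one row per single
# sentence instead of the 5-sentence chunks built in df_sentence above.
# # convert each sentence to individual rows
# df_content = df_content.explode("content_sentence", ignore_index=True)
# df_content.rename(columns={"content_sentence": "sentence"}, inplace=True)
# df_content.head()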

In [9]:
# word tokenization
# start from the chunked frame and split each chunk's text into word tokens
df_content = df_sentence.assign(
    content_words=df_sentence["content_sentence"].map(nltk.word_tokenize)
)
df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,content_sentence,content_words
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,tij iginla doesn't shy away from his famous la...,"[tij, iginla, does, n't, shy, away, from, his,..."
1,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""i mean, i'd be thrilled to go anywhere. but i...","[``, i, mean, ,, i, 'd, be, thrilled, to, go, ..."
2,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"the flames currently hold the ninth pick, and ...","[the, flames, currently, hold, the, ninth, pic..."
3,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,max plante is the son of former nhl player der...,"[max, plante, is, the, son, of, former, nhl, p..."
4,89ce50166b,CBC.ca,"The Sunday Magazine for June 23, 2024 - CBC.ca",https://www.cbc.ca,This week on The Sunday Magazine with host Piy...,images/89ce50166b.jpeg,Likely to be Bias,Likely to be Unbiased,this week on the sunday magazine with host piy...,this week on the sunday magazine with host piy...,"[this, week, on, the, sunday, magazine, with, ..."


In [10]:
# remove punctuation & numbers
# Only tokens made up solely of the letters a-z are kept. Besides punctuation
# this also drops contraction pieces such as "n't" and "'d", and any token
# containing another character (hyphen, apostrophe, accented letter, ...).
_ALPHA_TOKEN = re.compile(r'^[a-z]+$')  # compiled once, reused for every token


def _keep_alpha_tokens(tokens):
    """Return the tokens that match the a-z-only pattern, in their original order."""
    return [tok for tok in tokens if _ALPHA_TOKEN.search(tok)]


# Series.map rewrites the whole column in one pass instead of iterrows()/df.at
df_content["content_words"] = df_content["content_words"].map(_keep_alpha_tokens)
df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,content_sentence,content_words
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,tij iginla doesn't shy away from his famous la...,"[tij, iginla, does, shy, away, from, his, famo..."
1,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""i mean, i'd be thrilled to go anywhere. but i...","[i, mean, i, be, thrilled, to, go, anywhere, b..."
2,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"the flames currently hold the ninth pick, and ...","[the, flames, currently, hold, the, ninth, pic..."
3,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,max plante is the son of former nhl player der...,"[max, plante, is, the, son, of, former, nhl, p..."
4,89ce50166b,CBC.ca,"The Sunday Magazine for June 23, 2024 - CBC.ca",https://www.cbc.ca,This week on The Sunday Magazine with host Piy...,images/89ce50166b.jpeg,Likely to be Bias,Likely to be Unbiased,this week on the sunday magazine with host piy...,this week on the sunday magazine with host piy...,"[this, week, on, the, sunday, magazine, with, ..."


In [11]:
# remove stopwords
# a set gives constant-time membership tests for the per-token lookup below
stop_list = set(stopwords.words('english'))


def _drop_stopwords(tokens):
    """Return `tokens` without the entries contained in `stop_list`."""
    return [tok for tok in tokens if tok not in stop_list]


# Series.map rewrites the whole column in one pass instead of iterrows()/df.at
df_content["content_words"] = df_content["content_words"].map(_drop_stopwords)

df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,content_sentence,content_words
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,tij iginla doesn't shy away from his famous la...,"[tij, iginla, shy, away, famous, last, name, i..."
1,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""i mean, i'd be thrilled to go anywhere. but i...","[mean, thrilled, go, anywhere, think, would, c..."
2,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"the flames currently hold the ninth pick, and ...","[flames, currently, hold, ninth, pick, iginla,..."
3,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,max plante is the son of former nhl player der...,"[max, plante, son, former, nhl, player, derek,..."
4,89ce50166b,CBC.ca,"The Sunday Magazine for June 23, 2024 - CBC.ca",https://www.cbc.ca,This week on The Sunday Magazine with host Piy...,images/89ce50166b.jpeg,Likely to be Bias,Likely to be Unbiased,this week on the sunday magazine with host piy...,this week on the sunday magazine with host piy...,"[week, sunday, magazine, host, piya, chattopad..."


In [12]:
# lemmatization
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the lemma of every token.

    lemmatize() is called without a part-of-speech tag, so the lemmatizer's
    default part of speech applies to every token.
    """
    return [lemmatizer.lemmatize(tok) for tok in tokens]


# Series.map rewrites the whole column in one pass instead of iterrows()/df.at
df_content["content_words"] = df_content["content_words"].map(_lemmatize_tokens)

df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,content_sentence,content_words
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,tij iginla doesn't shy away from his famous la...,"[tij, iginla, shy, away, famous, last, name, i..."
1,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""i mean, i'd be thrilled to go anywhere. but i...","[mean, thrilled, go, anywhere, think, would, c..."
2,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"the flames currently hold the ninth pick, and ...","[flame, currently, hold, ninth, pick, iginla, ..."
3,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,max plante is the son of former nhl player der...,"[max, plante, son, former, nhl, player, derek,..."
4,89ce50166b,CBC.ca,"The Sunday Magazine for June 23, 2024 - CBC.ca",https://www.cbc.ca,This week on The Sunday Magazine with host Piy...,images/89ce50166b.jpeg,Likely to be Bias,Likely to be Unbiased,this week on the sunday magazine with host piy...,this week on the sunday magazine with host piy...,"[week, sunday, magazine, host, piya, chattopad..."


In [13]:
# n-grams could be considered; they are not implemented for now because
# they would add more dimensionality to the data and could result in very
# long run times for the models

In [14]:
# rebuild the text: overwrite content_sentence with the cleaned tokens
# joined by single spaces
df_content["content_sentence"] = df_content["content_words"].map(' '.join)
df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,content_sentence,content_words
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,tij iginla shy away famous last name instead e...,"[tij, iginla, shy, away, famous, last, name, i..."
1,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,mean thrilled go anywhere think would cool pla...,"[mean, thrilled, go, anywhere, think, would, c..."
2,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,flame currently hold ninth pick iginla happens...,"[flame, currently, hold, ninth, pick, iginla, ..."
3,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,max plante son former nhl player derek mirosla...,"[max, plante, son, former, nhl, player, derek,..."
4,89ce50166b,CBC.ca,"The Sunday Magazine for June 23, 2024 - CBC.ca",https://www.cbc.ca,This week on The Sunday Magazine with host Piy...,images/89ce50166b.jpeg,Likely to be Bias,Likely to be Unbiased,this week on the sunday magazine with host piy...,week sunday magazine host piya chattopadhyay s...,"[week, sunday, magazine, host, piya, chattopad..."


In [15]:
# keep only the columns that are written to the output file
relevant_cols = [
    "outlet",
    "article_text",
    "nlp_label",
    "nlp-image_label",
    "content_sentence",
    "content_words",
]
df = df_content.loc[:, relevant_cols]
df.head()



Unnamed: 0,outlet,article_text,nlp_label,nlp-image_label,content_sentence,content_words
0,CBC.ca,Tij Iginla doesn't shy away from his famous la...,Likely to be Bias,Likely to be Unbiased,tij iginla shy away famous last name instead e...,"[tij, iginla, shy, away, famous, last, name, i..."
1,CBC.ca,Tij Iginla doesn't shy away from his famous la...,Likely to be Bias,Likely to be Unbiased,mean thrilled go anywhere think would cool pla...,"[mean, thrilled, go, anywhere, think, would, c..."
2,CBC.ca,Tij Iginla doesn't shy away from his famous la...,Likely to be Bias,Likely to be Unbiased,flame currently hold ninth pick iginla happens...,"[flame, currently, hold, ninth, pick, iginla, ..."
3,CBC.ca,Tij Iginla doesn't shy away from his famous la...,Likely to be Bias,Likely to be Unbiased,max plante son former nhl player derek mirosla...,"[max, plante, son, former, nhl, player, derek,..."
4,CBC.ca,This week on The Sunday Magazine with host Piy...,Likely to be Bias,Likely to be Unbiased,week sunday magazine host piya chattopadhyay s...,"[week, sunday, magazine, host, piya, chattopad..."


In [16]:
# sanity check: (rows, columns) of the final frame before export
df.shape

(181747, 6)

In [17]:
# write the final frame to disk as CSV, leaving out the index.
# NOTE: content_words holds Python lists; CSV stores each one as its string
# representation, so it has to be parsed again after loading.
df.to_csv(path_or_buf='../datasets/news_dataset_content_cleaned_bertopic.csv', index=False)