In [20]:
import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim

In [21]:
# Load the raw news dataset (one row per article) and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../datasets/news_dataset.csv')
df.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased
1,89ce50166b,CBC.ca,"The Sunday Magazine for June 23, 2024 - CBC.ca",https://www.cbc.ca,This week on The Sunday Magazine with host Piy...,images/89ce50166b.jpeg,Likely to be Bias,Likely to be Unbiased
2,8b320e107e,CBC News,"Conservative MP shares inaccurate, ChatGPT-gen...",https://www.cbc.ca,An Ontario Conservative MP's use of ChatGPT to...,images/8b320e107e.jpeg,Likely to be Bias,Likely to be Bias
3,7536f87654,CBC.ca,"Women's sports are more popular than ever, but...",https://www.cbc.ca,"When it comes to the study of sports, a man's ...",images/7536f87654.jpeg,Likely to be Unbiased,Likely to be Unbiased
4,c829d1f9a8,CBC.ca,June 22: Listener Question Show - CBC.ca,https://www.cbc.ca,"To wrap up the 48th season of Quirks & Quarks,...",images/c829d1f9a8.jpeg,Likely to be Bias,Likely to be Unbiased


In [22]:
# Drop every row that has a missing value in any column, then show the
# remaining (rows, columns).
df = df.dropna(axis=0, how='any')
df.shape

(40945, 8)

In [23]:
# Normalise the article text: lowercase it and strip every digit character.
# Digits are removed character by character, so they also disappear from inside
# tokens (e.g. "48th" becomes "th" and "<n>-year-old" becomes "-year-old").
def _lowercase_without_digits(text):
    """Return ``text`` lowercased with every ``str.isdigit`` character removed."""
    return ''.join(ch for ch in text.lower() if not ch.isdigit())


# A single Series.map replaces the row-by-row iterrows()/.at loop: same result,
# without building a Series per row or doing one scalar write per article.
df["content_cleaned"] = df["article_text"].map(_lowercase_without_digits)

df.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...
1,89ce50166b,CBC.ca,"The Sunday Magazine for June 23, 2024 - CBC.ca",https://www.cbc.ca,This week on The Sunday Magazine with host Piy...,images/89ce50166b.jpeg,Likely to be Bias,Likely to be Unbiased,this week on the sunday magazine with host piy...
2,8b320e107e,CBC News,"Conservative MP shares inaccurate, ChatGPT-gen...",https://www.cbc.ca,An Ontario Conservative MP's use of ChatGPT to...,images/8b320e107e.jpeg,Likely to be Bias,Likely to be Bias,an ontario conservative mp's use of chatgpt to...
3,7536f87654,CBC.ca,"Women's sports are more popular than ever, but...",https://www.cbc.ca,"When it comes to the study of sports, a man's ...",images/7536f87654.jpeg,Likely to be Unbiased,Likely to be Unbiased,"when it comes to the study of sports, a man's ..."
4,c829d1f9a8,CBC.ca,June 22: Listener Question Show - CBC.ca,https://www.cbc.ca,"To wrap up the 48th season of Quirks & Quarks,...",images/c829d1f9a8.jpeg,Likely to be Bias,Likely to be Unbiased,"to wrap up the th season of quirks & quarks, w..."


In [24]:
# Sentence tokenization.
# Build a separate frame (the article-level `df` is left untouched) that carries,
# for each article, the list of sentences found in its cleaned text.
# NOTE(review): the input is already lowercased and digit-stripped; sentence
# splitting on such text may be less accurate than on the raw text -- confirm
# this ordering is intended.
df_content = df.assign(
    content_sentence=df["content_cleaned"].apply(nltk.sent_tokenize)
)
df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,content_sentence
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,[tij iginla doesn't shy away from his famous l...
1,89ce50166b,CBC.ca,"The Sunday Magazine for June 23, 2024 - CBC.ca",https://www.cbc.ca,This week on The Sunday Magazine with host Piy...,images/89ce50166b.jpeg,Likely to be Bias,Likely to be Unbiased,this week on the sunday magazine with host piy...,[this week on the sunday magazine with host pi...
2,8b320e107e,CBC News,"Conservative MP shares inaccurate, ChatGPT-gen...",https://www.cbc.ca,An Ontario Conservative MP's use of ChatGPT to...,images/8b320e107e.jpeg,Likely to be Bias,Likely to be Bias,an ontario conservative mp's use of chatgpt to...,[an ontario conservative mp's use of chatgpt t...
3,7536f87654,CBC.ca,"Women's sports are more popular than ever, but...",https://www.cbc.ca,"When it comes to the study of sports, a man's ...",images/7536f87654.jpeg,Likely to be Unbiased,Likely to be Unbiased,"when it comes to the study of sports, a man's ...","[when it comes to the study of sports, a man's..."
4,c829d1f9a8,CBC.ca,June 22: Listener Question Show - CBC.ca,https://www.cbc.ca,"To wrap up the 48th season of Quirks & Quarks,...",images/c829d1f9a8.jpeg,Likely to be Bias,Likely to be Unbiased,"to wrap up the th season of quirks & quarks, w...","[to wrap up the th season of quirks & quarks, ..."


In [25]:
# Reshape to one row per sentence.
# - explode() emits one row per element of each article's sentence list and,
#   with ignore_index=True, renumbers the rows 0..n-1.
# - An article whose sentence list is empty comes out of explode() as a single
#   row with a missing sentence; such rows are dropped so the word tokenizer in
#   the following step only ever receives strings.
# - The frame is rebuilt with a method chain instead of an in-place rename.
df_content = (
    df_content
    .explode("content_sentence", ignore_index=True)
    .rename(columns={"content_sentence": "sentence"})
    .dropna(subset=["sentence"])
    .reset_index(drop=True)
)
df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,sentence
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,tij iginla doesn't shy away from his famous la...
1,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,the -year-old is so comfortable being jarome i...
2,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""i think there might be a little extra attenti..."
3,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"but for me, having my dad and having my family..."
4,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""obviously, if i was to go to calgary, that wo..."


In [26]:
# Word tokenization: split every sentence into a list of word/punctuation
# tokens and store it in a new "words" column.
df_content["words"] = df_content["sentence"].map(word_tokenize)
df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,sentence,words
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,tij iginla doesn't shy away from his famous la...,"[tij, iginla, does, n't, shy, away, from, his,..."
1,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,the -year-old is so comfortable being jarome i...,"[the, -year-old, is, so, comfortable, being, j..."
2,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""i think there might be a little extra attenti...","[``, i, think, there, might, be, a, little, ex..."
3,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"but for me, having my dad and having my family...","[but, for, me, ,, having, my, dad, and, having..."
4,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""obviously, if i was to go to calgary, that wo...","[``, obviously, ,, if, i, was, to, go, to, cal..."


In [27]:
# Remove punctuation & numbers: keep only tokens made up entirely of the
# lowercase letters a-z. Anything else (punctuation, quote marks, contraction
# pieces such as "n't", hyphenated tokens such as "-year-old") is discarded.
alpha_only = re.compile(r'^[a-z]+$')  # compiled once, reused for every token

# Series.apply replaces the iterrows()/.at loop: same per-row result without
# materialising a Series for each of the sentence rows.
df_content["words"] = df_content["words"].apply(
    lambda tokens: [w for w in tokens if alpha_only.search(w)]
)
df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,sentence,words
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,tij iginla doesn't shy away from his famous la...,"[tij, iginla, does, shy, away, from, his, famo..."
1,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,the -year-old is so comfortable being jarome i...,"[the, is, so, comfortable, being, jarome, igin..."
2,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""i think there might be a little extra attenti...","[i, think, there, might, be, a, little, extra,..."
3,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"but for me, having my dad and having my family...","[but, for, me, having, my, dad, and, having, m..."
4,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""obviously, if i was to go to calgary, that wo...","[obviously, if, i, was, to, go, to, calgary, t..."


In [28]:
# Remove stopwords: drop every token that appears in NLTK's English stopword
# list. The list is held as a set so each membership test is a hash lookup.
stop_list = set(stopwords.words('english'))

# Series.apply replaces the iterrows()/.at loop: same per-row result without
# materialising a Series for each of the sentence rows.
df_content["words"] = df_content["words"].apply(
    lambda tokens: [w for w in tokens if w not in stop_list]
)

df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,sentence,words
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,tij iginla doesn't shy away from his famous la...,"[tij, iginla, shy, away, famous, last, name, i..."
1,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,the -year-old is so comfortable being jarome i...,"[comfortable, jarome, iginla, son, welcomes, p..."
2,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""i think there might be a little extra attenti...","[think, might, little, extra, attention, expec..."
3,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"but for me, having my dad and having my family...","[dad, family, pros, heavily, outweigh, cons, i..."
4,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""obviously, if i was to go to calgary, that wo...","[obviously, go, calgary, would, kind, magnifie..."


In [29]:
# Lemmatization: map every remaining token to its WordNet lemma.
# lemmatize() is called without a part-of-speech tag, so tokens are treated with
# the lemmatizer's default POS; plural nouns are reduced ("pros" -> "pro") while
# inflected verbs such as "magnified" / "added" are left as they are.
lemmatizer = WordNetLemmatizer()

# Series.apply replaces the iterrows()/.at loop: same per-row result without
# materialising a Series for each of the sentence rows.
df_content["words"] = df_content["words"].apply(
    lambda tokens: [lemmatizer.lemmatize(w) for w in tokens]
)

df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,sentence,words
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,tij iginla doesn't shy away from his famous la...,"[tij, iginla, shy, away, famous, last, name, i..."
1,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,the -year-old is so comfortable being jarome i...,"[comfortable, jarome, iginla, son, welcome, po..."
2,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""i think there might be a little extra attenti...","[think, might, little, extra, attention, expec..."
3,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"but for me, having my dad and having my family...","[dad, family, pro, heavily, outweigh, con, igi..."
4,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,"""obviously, if i was to go to calgary, that wo...","[obviously, go, calgary, would, kind, magnifie..."


In [30]:
# n-grams could be added as a further feature; they are not implemented for now
# because they would increase the dimensionality of the data and could lead to
# very long run times for the models

In [31]:
# Rebuild each sentence from its cleaned tokens, separated by single spaces.
# This overwrites the original "sentence" text with the processed version.
df_content["sentence"] = df_content["words"].map(' '.join)
df_content.head()

Unnamed: 0,unique_id,outlet,headline,url,article_text,image,nlp_label,nlp-image_label,content_cleaned,sentence,words
0,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,tij iginla shy away famous last name instead e...,"[tij, iginla, shy, away, famous, last, name, i..."
1,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,comfortable jarome iginla son welcome possibil...,"[comfortable, jarome, iginla, son, welcome, po..."
2,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,think might little extra attention expectation...,"[think, might, little, extra, attention, expec..."
3,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,dad family pro heavily outweigh con iginla sai...,"[dad, family, pro, heavily, outweigh, con, igi..."
4,39e6104d56,CBC.ca,"Tij Iginla embraces family name, eager to foll...",https://www.cbc.ca,Tij Iginla doesn't shy away from his famous la...,images/39e6104d56.jpeg,Likely to be Bias,Likely to be Unbiased,tij iginla doesn't shy away from his famous la...,obviously go calgary would kind magnified added,"[obviously, go, calgary, would, kind, magnifie..."


In [32]:
# Create the final frame with only the relevant columns.
# The sentence-level columns get a "content_" prefix first; `df_content` is
# rebound to the renamed frame rather than being mutated in place.
df_content = df_content.rename(
    columns={"sentence": "content_sentence", "words": "content_words"}
)

# From here on `df` is the sentence-level frame (it no longer refers to the
# article-level frame loaded at the top of the notebook).
relevant_cols = [
    "outlet",
    "article_text",
    "nlp_label",
    "nlp-image_label",
    "content_sentence",
    "content_words",
]
df = df_content[relevant_cols]
df.head()



Unnamed: 0,outlet,article_text,nlp_label,nlp-image_label,content_sentence,content_words
0,CBC.ca,Tij Iginla doesn't shy away from his famous la...,Likely to be Bias,Likely to be Unbiased,tij iginla shy away famous last name instead e...,"[tij, iginla, shy, away, famous, last, name, i..."
1,CBC.ca,Tij Iginla doesn't shy away from his famous la...,Likely to be Bias,Likely to be Unbiased,comfortable jarome iginla son welcome possibil...,"[comfortable, jarome, iginla, son, welcome, po..."
2,CBC.ca,Tij Iginla doesn't shy away from his famous la...,Likely to be Bias,Likely to be Unbiased,think might little extra attention expectation...,"[think, might, little, extra, attention, expec..."
3,CBC.ca,Tij Iginla doesn't shy away from his famous la...,Likely to be Bias,Likely to be Unbiased,dad family pro heavily outweigh con iginla sai...,"[dad, family, pro, heavily, outweigh, con, igi..."
4,CBC.ca,Tij Iginla doesn't shy away from his famous la...,Likely to be Bias,Likely to be Unbiased,obviously go calgary would kind magnified added,"[obviously, go, calgary, would, kind, magnifie..."


In [33]:
# Size of the final sentence-level frame as (rows, columns).
df.shape

(824853, 6)

In [34]:
# Write the cleaned sentence-level dataset to CSV without the row index.
# "content_words" holds Python lists, which CSV stores as their text
# representation; a consumer has to parse that column back into lists.
df.to_csv(path_or_buf='../datasets/news_dataset_content_cleaned.csv', index=False)