In [19]:
import pandas as pd
import numpy as np

import nltk
import re

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import string

import seaborn as sns

In [2]:
# Raw CSV of the NYTimes article dataset, read straight from the project repository.
NYTIMES_DATASET_URL = 'https://raw.githubusercontent.com/yennetiuday/K2140746_Project_Dissertation/main/data/nytimes_dataset.csv'
df = pd.read_csv(NYTIMES_DATASET_URL)

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(61218, 7)

In [4]:
# Preview the first rows to see what each column holds.
df.head()

Unnamed: 0,section,headline,abstract,caption,image_url,article_url,image_id
0,Health,F.D.A. Plans to Ban Most E-Cigarette Flavors b...,The tobacco and vaping industries and conserva...,A new study by the National Institute on Drug ...,https://static01.nyt.com/images/2019/11/06/sci...,https://www.nytimes.com/2019/12/31/health/e-ci...,42d25485-0e48-50bf-8d16-948833b2a55d
1,Science,Meteor Showers in 2020 That Will Light Up Nigh...,"All year long, Earth passes through streams of...","Perseid meteors named as ""Orinoid"" streak acro...",https://static01.nyt.com/images/2020/01/01/sci...,https://www.nytimes.com/2020/01/01/science/met...,04bc90f0-b20b-511c-b5bb-3ce13194163f
2,Science,"Rocket Launches, Trips to Mars and More 2020 S...",A year full of highs and lows in space just en...,Spectators viewing the launch of a Soyuz rocke...,https://static01.nyt.com/images/2020/01/01/sci...,https://www.nytimes.com/2020/01/01/science/spa...,bd8647b3-8ec6-50aa-95cf-2b81ed12d2dd
3,Television,What's on TV Wednesday: A Linda Ronstadt Doc a...,"""Linda Ronstadt: The Sound of My Voice"" airs o...","Linda Ronstadt in ""Linda Ronstadt: The Sound o...",https://static01.nyt.com/images/2020/01/01/art...,https://www.nytimes.com/2020/01/01/arts/televi...,e6c25b53-0416-5795-b0cf-e1243924dc79
4,Travel,New Cruise Ships to Set Sail for Antarctica,Interested in the southernmost continent? Here...,"Antarctica21&rsquo;s expedition ship, Ocean No...",https://static01.nyt.com/images/2020/01/05/tra...,https://www.nytimes.com/2020/01/01/travel/anta...,98c3d182-95ce-5244-9b9e-008a3dee7354


In [5]:
# Distinct section labels, in order of first appearance.
df['section'].unique()

array(['Health', 'Science', 'Television', 'Travel', 'Movies', 'Dance',
       'Real Estate', 'Economy', 'Sports', 'Theater', 'Opinion', 'Music',
       'Books', 'Art & Design', 'Style', 'Media', 'Food', 'Well',
       'Fashion', 'Technology', 'Your Money', 'Education', 'Automobiles',
       'Global Business'], dtype=object)

In [6]:
# Number of articles per section (reveals how balanced the classes are).
df['section'].value_counts()

Health             3000
Opinion            3000
Technology         3000
Fashion            3000
Food               3000
Media              3000
Art & Design       3000
Science            3000
Music              3000
Books              3000
Theater            3000
Sports             3000
Real Estate        3000
Dance              3000
Movies             3000
Travel             3000
Television         3000
Style              2681
Automobiles        1825
Economy            1761
Your Money         1263
Global Business    1182
Education           825
Well                681
Name: section, dtype: int64

In [7]:
# Flag every column that contains at least one missing value.
df.isna().any()

section        False
headline       False
abstract       False
caption         True
image_url      False
article_url    False
image_id       False
dtype: bool

In [8]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

section        object
headline       object
abstract       object
caption        object
image_url      object
article_url    object
image_id       object
dtype: object

In [9]:
# Normalise every column to plain Python strings before any text processing.
# Missing values are replaced with '' first: a bare astype(str) would turn NaN
# into the literal text 'nan', which would then leak into the merged news text
# (and its token list) as a bogus word.
text_columns = [
    'section',
    'headline',
    'abstract',
    'caption',
    'image_url',
    'article_url',
    'image_id',
]
df[text_columns] = df[text_columns].fillna('').astype(str)

In [10]:
# Re-run the missing-value check now that the columns have been cast to strings.
df.isna().any()

section        False
headline       False
abstract       False
caption        False
image_url      False
article_url    False
image_id       False
dtype: bool

In [11]:
# Number of captions still reported as missing.
df['caption'].isna().sum()

0

In [14]:
# Build one space-separated text per article from its headline, abstract and caption.
# All three columns are already strings at this point, so plain column-wise
# concatenation yields the same text as joining each row.
df['merged_news_text'] = df['headline'] + ' ' + df['abstract'] + ' ' + df['caption']

In [15]:
# English stop-word list from NLTK. The corpus is not bundled with the package,
# so on a fresh environment the lookup raises LookupError; download it once and
# retry so the notebook survives Restart & Run All.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    stopwords = nltk.corpus.stopwords.words('english')

In [16]:
def clean_news_text(news_text, stop_words=None):
    """Lower-case, strip punctuation from, tokenise and stop-word-filter a text.

    Parameters
    ----------
    news_text : str
        Raw text to clean.
    stop_words : iterable of str, optional
        Words to drop from the result. When omitted, the notebook-level
        ``stopwords`` list is used.

    Returns
    -------
    list of str
        Lower-cased tokens with ASCII punctuation removed, and with empty
        strings and stop words filtered out.
    """
    if stop_words is None:
        stop_words = stopwords
    # Hash-based lookup: testing every token against a list is a linear scan.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    # Drop ASCII punctuation character by character, so "F.D.A." -> "fda"
    # and "E-Cigarette" -> "ecigarette".
    stripped = "".join(
        char.lower() for char in news_text if char not in string.punctuation
    )
    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    tokens = re.split(r'\W+', stripped)
    # re.split yields '' for empty input and for leading/trailing separators;
    # those are not words, so discard them together with the stop words.
    return [token for token in tokens if token and token not in stop_words]

In [17]:
# Tokenise every merged article text; the cleaner is passed directly as the callback.
df['cleaned_news_text_tokens'] = df['merged_news_text'].apply(clean_news_text)

In [18]:
# Preview the frame with the new merged-text and token columns.
df.head()

Unnamed: 0,section,headline,abstract,caption,image_url,article_url,image_id,merged_news_text,cleaned_news_text_tokens
0,Health,F.D.A. Plans to Ban Most E-Cigarette Flavors b...,The tobacco and vaping industries and conserva...,A new study by the National Institute on Drug ...,https://static01.nyt.com/images/2019/11/06/sci...,https://www.nytimes.com/2019/12/31/health/e-ci...,42d25485-0e48-50bf-8d16-948833b2a55d,F.D.A. Plans to Ban Most E-Cigarette Flavors b...,"[fda, plans, ban, ecigarette, flavors, menthol..."
1,Science,Meteor Showers in 2020 That Will Light Up Nigh...,"All year long, Earth passes through streams of...","Perseid meteors named as ""Orinoid"" streak acro...",https://static01.nyt.com/images/2020/01/01/sci...,https://www.nytimes.com/2020/01/01/science/met...,04bc90f0-b20b-511c-b5bb-3ce13194163f,Meteor Showers in 2020 That Will Light Up Nigh...,"[meteor, showers, 2020, light, night, skies, y..."
2,Science,"Rocket Launches, Trips to Mars and More 2020 S...",A year full of highs and lows in space just en...,Spectators viewing the launch of a Soyuz rocke...,https://static01.nyt.com/images/2020/01/01/sci...,https://www.nytimes.com/2020/01/01/science/spa...,bd8647b3-8ec6-50aa-95cf-2b81ed12d2dd,"Rocket Launches, Trips to Mars and More 2020 S...","[rocket, launches, trips, mars, 2020, space, a..."
3,Television,What's on TV Wednesday: A Linda Ronstadt Doc a...,"""Linda Ronstadt: The Sound of My Voice"" airs o...","Linda Ronstadt in ""Linda Ronstadt: The Sound o...",https://static01.nyt.com/images/2020/01/01/art...,https://www.nytimes.com/2020/01/01/arts/televi...,e6c25b53-0416-5795-b0cf-e1243924dc79,What's on TV Wednesday: A Linda Ronstadt Doc a...,"[whats, tv, wednesday, linda, ronstadt, doc, d..."
4,Travel,New Cruise Ships to Set Sail for Antarctica,Interested in the southernmost continent? Here...,"Antarctica21&rsquo;s expedition ship, Ocean No...",https://static01.nyt.com/images/2020/01/05/tra...,https://www.nytimes.com/2020/01/01/travel/anta...,98c3d182-95ce-5244-9b9e-008a3dee7354,New Cruise Ships to Set Sail for Antarctica In...,"[new, cruise, ships, set, sail, antarctica, in..."


In [23]:
# Section labels that are about to be encoded as integer ids.
df['section'].unique()

array(['Health', 'Science', 'Television', 'Travel', 'Movies', 'Dance',
       'Real Estate', 'Economy', 'Sports', 'Theater', 'Opinion', 'Music',
       'Books', 'Art & Design', 'Style', 'Media', 'Food', 'Well',
       'Fashion', 'Technology', 'Your Money', 'Education', 'Automobiles',
       'Global Business'], dtype=object)

In [21]:
# Encode each section name as an integer id; the fitted encoder is kept so the
# ids can be mapped back to section names later.
label_encoder = preprocessing.LabelEncoder()
section_ids = label_encoder.fit_transform(df['section'])
df['section_id'] = section_ids

In [22]:
# Preview the frame with the new section_id column.
df.head()

Unnamed: 0,section,headline,abstract,caption,image_url,article_url,image_id,merged_news_text,cleaned_news_text_tokens,section_id
0,Health,F.D.A. Plans to Ban Most E-Cigarette Flavors b...,The tobacco and vaping industries and conserva...,A new study by the National Institute on Drug ...,https://static01.nyt.com/images/2019/11/06/sci...,https://www.nytimes.com/2019/12/31/health/e-ci...,42d25485-0e48-50bf-8d16-948833b2a55d,F.D.A. Plans to Ban Most E-Cigarette Flavors b...,"[fda, plans, ban, ecigarette, flavors, menthol...",9
1,Science,Meteor Showers in 2020 That Will Light Up Nigh...,"All year long, Earth passes through streams of...","Perseid meteors named as ""Orinoid"" streak acro...",https://static01.nyt.com/images/2020/01/01/sci...,https://www.nytimes.com/2020/01/01/science/met...,04bc90f0-b20b-511c-b5bb-3ce13194163f,Meteor Showers in 2020 That Will Light Up Nigh...,"[meteor, showers, 2020, light, night, skies, y...",15
2,Science,"Rocket Launches, Trips to Mars and More 2020 S...",A year full of highs and lows in space just en...,Spectators viewing the launch of a Soyuz rocke...,https://static01.nyt.com/images/2020/01/01/sci...,https://www.nytimes.com/2020/01/01/science/spa...,bd8647b3-8ec6-50aa-95cf-2b81ed12d2dd,"Rocket Launches, Trips to Mars and More 2020 S...","[rocket, launches, trips, mars, 2020, space, a...",15
3,Television,What's on TV Wednesday: A Linda Ronstadt Doc a...,"""Linda Ronstadt: The Sound of My Voice"" airs o...","Linda Ronstadt in ""Linda Ronstadt: The Sound o...",https://static01.nyt.com/images/2020/01/01/art...,https://www.nytimes.com/2020/01/01/arts/televi...,e6c25b53-0416-5795-b0cf-e1243924dc79,What's on TV Wednesday: A Linda Ronstadt Doc a...,"[whats, tv, wednesday, linda, ronstadt, doc, d...",19
4,Travel,New Cruise Ships to Set Sail for Antarctica,Interested in the southernmost continent? Here...,"Antarctica21&rsquo;s expedition ship, Ocean No...",https://static01.nyt.com/images/2020/01/05/tra...,https://www.nytimes.com/2020/01/01/travel/anta...,98c3d182-95ce-5244-9b9e-008a3dee7354,New Cruise Ships to Set Sail for Antarctica In...,"[new, cruise, ships, set, sail, antarctica, in...",21
