In [73]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
import nltk
nltk.download('brown')
from nltk.corpus import brown
from nltk import sent_tokenize, word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from string import punctuation
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')




[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
seed_urls = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world']

In [0]:
def build_dataset(seed_urls):
    news_data = []
    for url in seed_urls:
        news_category = url.split('/')[-1]
        data = requests.get(url)
        soup = BeautifulSoup(data.content, 'html.parser')
        
        news_articles = [{'news_headline': headline.find('span', 
                                                         attrs={"itemprop": "headline"}).string,
                          'news_article': article.find('div', 
                                                       attrs={"itemprop": "articleBody"}).string,
                          'news_category': news_category}
                         
                            for headline, article in 
                             zip(soup.find_all('div', 
                                               class_=["news-card-title news-right-box"]),
                                 soup.find_all('div', 
                                               class_=["news-card-content news-right-box"]))
                        ]
        news_data.extend(news_articles)
        
    df_news =  pd.DataFrame(news_data)
    df_news = df_news[['news_headline', 'news_article', 'news_category']]
    return df_news

In [7]:
news_df = build_dataset(seed_urls)
news_df.head(12)

Unnamed: 0,news_headline,news_article,news_category
0,Harvard prof becomes billionaire from stake in...,Harvard University medical professor Timothy S...,technology
1,IIT Delhi's coronavirus testing method approve...,IIT Delhi has become the first academic instit...,technology
2,"Coronavirus outbreak is like World War, except...",Microsoft Co-founder Bill Gates has said the c...,technology
3,WhatsApp group helps airlift 'seriously ill' I...,"A WhatsApp group, led by former Supreme Court ...",technology
4,"Local is new global business plan, online-to-o...",On the announcement that Amazon India is letti...,technology
5,Twitter allows video of Trump's 'inject disinf...,Microblogging platform Twitter has said the vi...,technology
6,Porn images pop up during online training of 7...,An online training session for around 700 badm...,technology
7,More firms place curbs on use of Zoom despite ...,Even as Zoom announced it would update the app...,technology
8,"Alphabet's Pichai makes ₹2,140 crore in 2019, ...",Google parent Alphabet has said that CEO Sunda...,technology
9,IIITM-K develops AI search engine for COVID-19...,The Indian Institute of Information Technology...,technology


In [8]:
news_df.news_category.value_counts()


sports        25
technology    25
world         21
Name: news_category, dtype: int64

In [9]:
len(news_df)

71

In [12]:
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [51]:
token_firsr_sent = sent_tokenize(news_df["news_article"][0])
token_firsr_sent

["Harvard University medical professor Timothy Springer's net worth is now more than $1 billion according to Bloomberg after his early stake in biotech firm Moderna surged 162% this year through Wednesday.",
 "US-based Moderna's coronavirus vaccine is one of the first to begin human trials.",
 'He invested $5 million in Moderna and his stake is now worth more than $800 million.']

In [48]:
first_tokenized_lowered = list(map(str.lower, word_tokenize(str(token_firsr_sent))))
first_tokenized_lowered

['[',
 '``',
 'harvard',
 'university',
 'medical',
 'professor',
 'timothy',
 'springer',
 "'s",
 'net',
 'worth',
 'is',
 'now',
 'more',
 'than',
 '$',
 '1',
 'billion',
 'according',
 'to',
 'bloomberg',
 'after',
 'his',
 'early',
 'stake',
 'in',
 'biotech',
 'firm',
 'moderna',
 'surged',
 '162',
 '%',
 'this',
 'year',
 'through',
 'wednesday',
 '.',
 '``',
 ',',
 '``',
 'us-based',
 'moderna',
 "'s",
 'coronavirus',
 'vaccine',
 'is',
 'one',
 'of',
 'the',
 'first',
 'to',
 'begin',
 'human',
 'trials',
 '.',
 '``',
 ',',
 "'he",
 'invested',
 '$',
 '5',
 'million',
 'in',
 'moderna',
 'and',
 'his',
 'stake',
 'is',
 'now',
 'worth',
 'more',
 'than',
 '$',
 '800',
 'million',
 '.',
 "'",
 ']']

In [35]:
stopwords_en = stopwords.words('english')
print(stopwords_en)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [0]:
stopwords_en = set(stopwords.words('english'))

In [52]:
#try to remove the stopwords from the first article
 b = [word for word in first_tokenized_lowered if word not in stopwords_en]
 b

['[',
 '``',
 'harvard',
 'university',
 'medical',
 'professor',
 'timothy',
 'springer',
 "'s",
 'net',
 'worth',
 '$',
 '1',
 'billion',
 'according',
 'bloomberg',
 'early',
 'stake',
 'biotech',
 'firm',
 'moderna',
 'surged',
 '162',
 '%',
 'year',
 'wednesday',
 '.',
 '``',
 ',',
 '``',
 'us-based',
 'moderna',
 "'s",
 'coronavirus',
 'vaccine',
 'one',
 'first',
 'begin',
 'human',
 'trials',
 '.',
 '``',
 ',',
 "'he",
 'invested',
 '$',
 '5',
 'million',
 'moderna',
 'stake',
 'worth',
 '$',
 '800',
 'million',
 '.',
 "'",
 ']']

In [55]:
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [57]:
stopwords_en_pluspunct = stopwords_en.union(set(punctuation))
print(stopwords_en_pluspunct)

{'most', 'before', 'has', "weren't", '@', 'yourself', 'that', 'theirs', 'then', 'having', 'so', "it's", 'hers', 'the', 'but', 'hadn', '(', 'and', "isn't", "mustn't", 'who', 'here', 'them', "you're", 'ourselves', "should've", '.', 'it', 'ma', 'do', 'any', 'now', 'its', "don't", 'himself', "didn't", "you'd", 'he', "that'll", 'through', 'doesn', 't', 'under', "hadn't", ':', 'only', 'further', "shouldn't", 'to', 'too', 'was', 'down', 'can', 'd', 'above', 'against', ')', 'a', 'until', '/', '#', 'between', 'there', 'not', '>', 'being', 'own', 'did', 'such', 'other', ']', 'yours', 'with', 'will', 'than', 'itself', 'at', "you've", 'by', 'into', 'in', 'themselves', "wouldn't", 'what', 'is', "hasn't", 'you', '$', 'their', 'when', 'ain', 'of', 'our', 'herself', 'don', 'or', '~', 'were', "you'll", '%', 'doing', '"', 'my', "aren't", 'few', 'should', '&', '}', "haven't", 'each', 'for', 'we', '=', 'no', '<', 'off', "'", 'just', 'o', 's', 'mightn', '^', 'hasn', 'how', 'isn', '|', 'below', 'mustn', 'af

In [58]:
print([word for word in b if word not in stopwords_en_pluspunct])

['``', 'harvard', 'university', 'medical', 'professor', 'timothy', 'springer', "'s", 'net', 'worth', '1', 'billion', 'according', 'bloomberg', 'early', 'stake', 'biotech', 'firm', 'moderna', 'surged', '162', 'year', 'wednesday', '``', '``', 'us-based', 'moderna', "'s", 'coronavirus', 'vaccine', 'one', 'first', 'begin', 'human', 'trials', '``', "'he", 'invested', '5', 'million', 'moderna', 'stake', 'worth', '800', 'million']


In [0]:
agin = [word for word in b if word not in stopwords_en_pluspunct]

In [70]:
porter = PorterStemmer()

for word in ["walking", "walks", "walked","walkes"]:
    print(porter.stem(word),"in the rain")

walk in the rain
walk in the rain
walk in the rain
walk in the rain


In [75]:
wnl_try = WordNetLemmatizer()

for word in ["walking", "walks", "walked","walkes"]:
    print(wnl_try.lemmatize(word))


walking
walk
walked
walkes
