In [42]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import sys
from pathlib import Path

import pandas as pd

import spacy
from spacy import displacy
from textblob import TextBlob, Word
import nltk
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
# sklearn for feature extraction & modeling
from sklearn.feature_extraction.text import CountVectorizer
#from textacy.extract import ngrams, entitiespp

[nltk_data] Downloading package punkt to /Users/tonghuang/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
# Run `spacy validate` with the same interpreter that backs this kernel
# (sys.executable) so the installed pipeline packages are checked against
# the spaCy version that this notebook actually imports.
!{sys.executable} -m spacy validate

[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/Users/tonghuang/opt/anaconda3/envs/nlp/lib/python3.11/site-packages/spacy[0m

NAME             SPACY            VERSION                            
en_core_web_sm   >=3.7.2,<3.8.0   [38;5;2m3.7.1[0m   [38;5;2m✔[0m



In [9]:
# Load the `en_core_web_sm` pipeline once; `nlp` is the callable used below
# to turn raw text into a parsed document.
nlp = spacy.load('en_core_web_sm')

In [12]:
# Parse one short example sentence; `doc` holds the processed result whose
# tokens are tabulated in the following cell.
sample_text = 'Apple is looking at buying U.K. startup for $1 billion'
doc = nlp(sample_text)

In [19]:
# One row per token of `doc`, one column per token attribute.
# The attribute names are listed once; spaCy's string-valued attributes carry
# a trailing underscore, which is stripped to form the column label.
TOKEN_ATTRS = ['text', 'lemma_', 'pos_', 'tag_', 'dep_', 'shape_', 'is_alpha', 'is_stop']
pd.DataFrame(
    [[getattr(token, attr) for attr in TOKEN_ATTRS] for token in doc],
    columns=[attr.rstrip('_') for attr in TOKEN_ATTRS],
)

Unnamed: 0,text,lemma,pos,tag,dep,shape,is_alpha,is_stop
0,Apple,Apple,PROPN,NNP,nsubj,Xxxxx,True,False
1,is,be,AUX,VBZ,aux,xx,True,True
2,looking,look,VERB,VBG,ROOT,xxxx,True,False
3,at,at,ADP,IN,prep,xx,True,True
4,buying,buy,VERB,VBG,pcomp,xxxx,True,False
5,U.K.,U.K.,PROPN,NNP,dobj,X.X.,False,False
6,startup,startup,NOUN,NN,dep,xxxx,True,False
7,for,for,ADP,IN,prep,xxx,True,True
8,$,$,SYM,$,quantmod,$,False,False
9,1,1,NUM,CD,compound,d,False,False


In [30]:
# Read every *.txt file below DATA_DIR/bbc into [topic, heading, body] rows.
DATA_DIR = Path("./")
# sorted(): glob yields paths in filesystem-dependent order; sorting makes the
# row order of `doc_list` (and anything sampled from it later) reproducible.
files = sorted((DATA_DIR / 'bbc').glob('**/*.txt'))
bbc_articles = []  # never filled in this cell; kept in case other cells reference it
doc_list = []
for file in files:
    # The name of the directory containing the file is used as the topic label.
    topic = file.parts[-2]
    article = file.read_text(encoding='latin1').split('\n')
    # First line is treated as the heading; the remaining lines are stripped
    # and joined with single spaces to form the body.
    heading = article[0].strip()
    body = ' '.join(l.strip() for l in article[1:]).strip()
    doc_list.append([topic, heading, body])

In [34]:
# Turn the collected [topic, heading, body] rows into a DataFrame and print
# its structural summary (entries, columns, non-null counts, dtypes).
ARTICLE_COLUMNS = ['topic', 'heading', 'body']
docs = pd.DataFrame(data=doc_list, columns=ARTICLE_COLUMNS)
docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    2225 non-null   object
 1   heading  2225 non-null   object
 2   body     2225 non-null   object
dtypes: object(3)
memory usage: 52.3+ KB


In [40]:
# Pick one article to drive the stemming / lemmatization / sentiment cells.
# A fixed random_state makes the pick -- and every output derived from it --
# the same on each Restart & Run All; .squeeze() collapses the one-row frame
# into a Series so fields are reachable as attributes.
article = docs.sample(1, random_state=42).squeeze()
print(f'Topic:\t{article.topic.capitalize()}\n\n{article.heading}\n')
print(article.body.strip())

Topic:	Tech

Latest Opera browser gets vocal

Net browser Opera 8.0, due for official release at the end of next month, will be "the most accessible browser on the market", according to its authors.  The latest version of the net browser can be controlled by voice command and will read pages aloud. The voice features, based on IBM technology, are currently only available in the Windows version. Opera can also magnify text by up to 10 times and users can create "style sheets", its developers say. This will enable them to view pages with colours and fonts that they prefer. But the browser does not yet work well with screen reader software often used by blind people, so its accessibility features are more likely to appeal to those with some residual vision. "Our mission was always to provide the best internet experience for everyone," said Opera spokeswoman, Berit Hanson. "So we would obviously not want to exclude disabled computer users."  Another feature likely to appeal to people with 

In [43]:
# List every word of the sampled article whose Snowball stem differs from its
# lower-cased form, as (word, stem) pairs.
stemmer = SnowballStemmer('english')
parsed_body = TextBlob(article.body)
# Stem each word exactly once (lazily) instead of re-indexing the word list
# and stemming a second time inside the filter.
word_stems = ((word, stemmer.stem(word)) for word in parsed_body.words)
[(word, stem) for word, stem in word_stems if word.lower() != stem]

[('official', 'offici'),
 ('release', 'releas'),
 ('accessible', 'access'),
 ('according', 'accord'),
 ('its', 'it'),
 ('authors', 'author'),
 ('controlled', 'control'),
 ('voice', 'voic'),
 ('pages', 'page'),
 ('voice', 'voic'),
 ('features', 'featur'),
 ('based', 'base'),
 ('technology', 'technolog'),
 ('currently', 'current'),
 ('only', 'onli'),
 ('available', 'avail'),
 ('Windows', 'window'),
 ('magnify', 'magnifi'),
 ('times', 'time'),
 ('users', 'user'),
 ('create', 'creat'),
 ('sheets', 'sheet'),
 ('its', 'it'),
 ('developers', 'develop'),
 ('enable', 'enabl'),
 ('pages', 'page'),
 ('colours', 'colour'),
 ('fonts', 'font'),
 ('does', 'doe'),
 ('software', 'softwar'),
 ('used', 'use'),
 ('people', 'peopl'),
 ('its', 'it'),
 ('accessibility', 'access'),
 ('features', 'featur'),
 ('likely', 'like'),
 ('residual', 'residu'),
 ('always', 'alway'),
 ('provide', 'provid'),
 ('experience', 'experi'),
 ('everyone', 'everyon'),
 ('obviously', 'obvious'),
 ('exclude', 'exclud'),
 ('disable

In [44]:
# Fetch the WordNet corpus before the lemmatization cell below runs
# (TextBlob's Word.lemmatize() is WordNet-based).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tonghuang/nltk_data...


True

In [45]:
# List every word of the sampled article whose WordNet lemma differs from the
# word itself, as (word, lemma) pairs. Each word is lemmatized once.
# NOTE: lemmatize() without a POS argument treats every word as a noun, which
# is why verbs and function words are mangled in the output
# ('was' -> 'wa', 'does' -> 'doe', 'as' -> 'a'). Pass a POS tag to
# lemmatize() if verb-aware lemmas are needed.
word_lemmas = ((word, word.lemmatize()) for word in parsed_body.words)
[(word, lemma) for word, lemma in word_lemmas if word != lemma]

[('its', 'it'),
 ('authors', 'author'),
 ('pages', 'page'),
 ('features', 'feature'),
 ('times', 'time'),
 ('users', 'user'),
 ('sheets', 'sheet'),
 ('its', 'it'),
 ('developers', 'developer'),
 ('pages', 'page'),
 ('colours', 'colour'),
 ('fonts', 'font'),
 ('does', 'doe'),
 ('its', 'it'),
 ('features', 'feature'),
 ('was', 'wa'),
 ('users', 'user'),
 ('pages', 'page'),
 ('points', 'point'),
 ('says', 'say'),
 ('features', 'feature'),
 ('was', 'wa'),
 ('makes', 'make'),
 ('does', 'doe'),
 ('commands', 'command'),
 ('users', 'user'),
 ('comes', 'come'),
 ('as', 'a'),
 ('telecoms', 'telecom'),
 ('systems', 'system'),
 ('platforms', 'platform')]

In [46]:
# Document-level sentiment of the sampled article as reported by TextBlob:
# a (polarity, subjectivity) pair.
parsed_body.sentiment

Sentiment(polarity=0.18816326530612246, subjectivity=0.439591836734694)

# Document matrix

In [59]:
# Relative frequency of each topic in the corpus, displayed as percentages
# with two decimals. The column is labelled 'count' but holds proportions.
topic_share = docs['topic'].value_counts(normalize=True)
topic_share.to_frame('count').style.format({'count': '{:.2%}'})

Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
sport,22.97%
business,22.92%
politics,18.74%
tech,18.02%
entertainment,17.35%
