# Text Vectorization and Feature Engineering Assignment

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

import requests
from bs4 import BeautifulSoup

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [None]:
articles = ['http://lite.cnn.io/en/article/h_eac18760a7a7f9a1bf33616f1c4a336d',
            'http://lite.cnn.io/en/article/h_de3f82f17d289680dd2b47c6413ebe7c',
            'http://lite.cnn.io/en/article/h_72f4dc9d6f35458a89af014b62e625ad',
            'http://lite.cnn.io/en/article/h_aa21fe6bf176071cb49e09d422c3adf0',
            'http://lite.cnn.io/en/article/h_8ad34a532921c9076cdc9d7390d2f1bc',
            'http://lite.cnn.io/en/article/h_84422c79110d9989177cfaf1c5f45fe7',
            'http://lite.cnn.io/en/article/h_d010d9580abac3a44c6181ec6fb63d58',
            'http://lite.cnn.io/en/article/h_fb11f4e9d7c5323e75b337d9e9e5e368',
            'http://lite.cnn.io/en/article/h_7b27f0b131067f8ece6238ac559670ab',
            'http://lite.cnn.io/en/article/h_8cae7f735fa9573d470f802063ceffe2',
            'http://lite.cnn.io/en/article/h_72c3668280e82576fcc2602b0fa70c14',
            'http://lite.cnn.io/en/article/h_d20658fb0e20212051cda0e0a7248c8a',
            'http://lite.cnn.io/en/article/h_56611c43d7928120d2ae21666ccc7417',
            'http://lite.cnn.io/en/article/h_bda0394e3c5ee7054ee65c022bca7695']

In [None]:
PATH = '/content/drive/MyDrive/cnn_articles/'

for i, url in enumerate(articles):
    response = requests.get(url)
    soup = BeautifulSoup(response.text)
    text = soup.find('div', {'class':'afe4286c'}).text
    with open(PATH + f'article_{i}.txt', 'wb') as f:
        f.write(text.encode())

In [None]:
DOC_PATTERN = r'.*\.txt'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [None]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

In [None]:
docs[0]

'Pink taking a break to focus on familyUpdated 10:18 AM ET, Thu November 14, 2019(CNN) - Pink has been working pretty hard and it sounds like she will be taking a step back in 2020.Speaking with "Entertainment Tonight" on the Country Music Association Awards red carpet, the singer was joined by her husband, Carey Hart, and their kids Willow, 8, and Jameson, 2.Pink was there to perform her song "Love Me Anyway" with country star Chris Stapleton, and she  talked about how hectic things have been. "We did two and a half years of [music] and Willow\'s back in school now, Jameson\'s going to start pre-school soon," Pink said. "It\'s kind of the year of the family."The star also praised her husband, with whom she will celebrate 14 years of marriage in January."Carey has a lot going on as well," she said of Hart, who went from being a professional motocross competitor to racing off-road trucks. "He\'s super supportive, he follows me around the world and now it\'s his turn."According to Billbo

In [None]:
len(docs)

14

### Preprocess and clean the documents according to the steps below.

- Word Tokenize
- Lowercase
- Remove Stopwords
- Remove Punctuation
- Lemmatize
- Stem

In [None]:
#word tokenization
word_tk = [word_tokenize(doc) for doc in docs]
word_tk[0][:12]

['Pink',
 'taking',
 'a',
 'break',
 'to',
 'focus',
 'on',
 'familyUpdated',
 '10:18',
 'AM',
 'ET',
 ',']

In [None]:
no_stopwords = [[token.lower() for token in doc if token.lower() not in stopwords.words('english')] for doc in word_tk]
no_stopwords[0][:12]

['pink',
 'taking',
 'break',
 'focus',
 'familyupdated',
 '10:18',
 'et',
 ',',
 'thu',
 'november',
 '14',
 ',']

In [None]:
#no punctuation
no_punct = [[token.lower() for token in doc if token.isalpha()] for doc in no_stopwords]
no_punct[0][:12]

['pink',
 'taking',
 'break',
 'focus',
 'familyupdated',
 'et',
 'thu',
 'november',
 'cnn',
 'pink',
 'working',
 'pretty']

In [None]:
#lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized = [[lemmatizer.lemmatize(token) for token in doc] for doc in no_punct]

lemmatized[0][:20]

['pink',
 'taking',
 'break',
 'focus',
 'familyupdated',
 'et',
 'thu',
 'november',
 'cnn',
 'pink',
 'working',
 'pretty',
 'hard',
 'sound',
 'like',
 'taking',
 'step',
 'back',
 'entertainment',
 'tonight']

In [None]:
#stemming
stemmer = SnowballStemmer('english')
stemmed = [[stemmer.stem(token) for token in doc] for doc in lemmatized]
stemmed[0][:12]

['pink',
 'take',
 'break',
 'focus',
 'familyupd',
 'et',
 'thu',
 'novemb',
 'cnn',
 'pink',
 'work',
 'pretti']

### Count vectorize the preprocessed documents.

In [None]:
new_docs = [' '.join(word for word in doc) for doc in stemmed]
new_docs[0]

'pink take break focus familyupd et thu novemb cnn pink work pretti hard sound like take step back entertain tonight countri music associ award red carpet singer join husband carey hart kid willow jameson perform song love anyway countri star chris stapleton talk hectic thing two half year music willow back school jameson go start soon pink said kind year famili star also prais husband celebr year marriag januari carey lot go well said hart went profession motocross competitor race truck super support follow around world turn accord billboard pink beauti trauma tour rank tour time earn million'

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(new_docs)

In [None]:
count_vec = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())
count_vec.head()

Unnamed: 0,abc,abet,abil,abl,abort,absolut,abund,abus,accent,accept,accid,accomplic,accomplish,accord,account,accus,acknowledg,across,act,action,activ,activist,actor,actual,ad,adam,add,addit,address,administr,admir,admiss,adopt,ador,advanc,advantag,advic,advis,aerial,affair,...,window,wit,withheld,withhold,within,without,wolfson,woman,wonder,wood,word,work,worker,world,worm,worri,wors,worst,would,wound,wow,wowtaylor,wrap,write,wrong,wrongdo,wrote,xinhua,ya,yawnther,year,yell,yes,yesterday,yet,york,yorker,young,younger,zelenski
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,5,0,0,0,0,2,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,2,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,2,0,0,1,0
4,1,0,1,0,0,0,0,2,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,3,1,0,0,1,2,0,0,0,0,1,0,0,2,0,0,...,0,5,3,1,1,1,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,7


### One hot vectorize the preprocessed documents.

In [None]:
#one hot
vecorizer = CountVectorizer(binary=True) #binary set to True
vectors = vectorizer.fit_transform(new_docs)

one_hot = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())
one_hot.head()

Unnamed: 0,abc,abet,abil,abl,abort,absolut,abund,abus,accent,accept,accid,accomplic,accomplish,accord,account,accus,acknowledg,across,act,action,activ,activist,actor,actual,ad,adam,add,addit,address,administr,admir,admiss,adopt,ador,advanc,advantag,advic,advis,aerial,affair,...,window,wit,withheld,withhold,within,without,wolfson,woman,wonder,wood,word,work,worker,world,worm,worri,wors,worst,would,wound,wow,wowtaylor,wrap,write,wrong,wrongdo,wrote,xinhua,ya,yawnther,year,yell,yes,yesterday,yet,york,yorker,young,younger,zelenski
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,5,0,0,0,0,2,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,2,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,2,0,0,1,0
4,1,0,1,0,0,0,0,2,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,3,1,0,0,1,2,0,0,0,0,1,0,0,2,0,0,...,0,5,3,1,1,1,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,7


### TF-IDF vectorize the preprocessed documents.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(new_docs)

tfidf = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())
tfidf.head()

Unnamed: 0,abc,abet,abil,abl,abort,absolut,abund,abus,accent,accept,accid,accomplic,accomplish,accord,account,accus,acknowledg,across,act,action,activ,activist,actor,actual,ad,adam,add,addit,address,administr,admir,admiss,adopt,ador,advanc,advantag,advic,advis,aerial,affair,...,window,wit,withheld,withhold,within,without,wolfson,woman,wonder,wood,word,work,worker,world,worm,worri,wors,worst,would,wound,wow,wowtaylor,wrap,write,wrong,wrongdo,wrote,xinhua,ya,yawnther,year,yell,yes,yesterday,yet,york,yorker,young,younger,zelenski
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052723,0.0,0.081021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050598,0.050598,0.038965,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050711,0.0,0.0,0.0,0.0,0.0,0.0,0.076066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043793,0.0,0.0,0.0,0.136661,0.0,0.0,0.0,0.0,0.059146,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03384,0.0,0.0,0.0,0.0,0.0,0.028414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03384,0.03384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.145844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037438,0.0,0.048615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048615,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.033159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038312,0.0,0.0,0.0,0.0,0.0,0.0,0.029504,0.0,0.0,...,0.0,0.0,0.0,0.0,0.026668,0.0,0.076623,0.0,0.0,0.0,0.0,0.019199,0.0,0.0,0.0,0.0,0.0,0.0,0.038397,0.0,0.0,0.0,0.0,0.0,0.022392,0.0,0.0,0.0,0.0,0.0,0.020695,0.0,0.0,0.0,0.0,0.044785,0.0,0.0,0.038312,0.0
4,0.023741,0.0,0.020548,0.0,0.0,0.0,0.0,0.033051,0.0,0.0,0.023741,0.0,0.0,0.013876,0.0,0.0,0.0,0.0,0.018283,0.0,0.0,0.0,0.0,0.016526,0.049577,0.016526,0.0,0.0,0.023741,0.033051,0.0,0.0,0.0,0.0,0.020548,0.0,0.0,0.036566,0.0,0.0,...,0.0,0.091414,0.071223,0.020548,0.016526,0.018283,0.0,0.0,0.0,0.0,0.0,0.047588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023741,0.0,0.0,0.0,0.020548,0.0,0.0,0.0,0.023741,0.0,0.0,0.020548,0.0,0.0,0.013876,0.0,0.0,0.0,0.143838


In [None]:
tfidf.shape[1]

2068

### Use Doc2Vec to vectorize the preprocessed documents.

Set the size of the vectors to be the same size as those of the other methods using the `vector_size` argument.

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(new_docs)]
documents[0]

TaggedDocument(words='pink take break focus familyupd et thu novemb cnn pink work pretti hard sound like take step back entertain tonight countri music associ award red carpet singer join husband carey hart kid willow jameson perform song love anyway countri star chris stapleton talk hectic thing two half year music willow back school jameson go start soon pink said kind year famili star also prais husband celebr year marriag januari carey lot go well said hart went profession motocross competitor race truck super support follow around world turn accord billboard pink beauti trauma tour rank tour time earn million', tags=[0])

In [None]:
model = Doc2Vec(documents, vector_size=tfidf.shape[1])
doc2vec = pd.DataFrame([list(model[document]) for document in range(len(documents))])

In [None]:
doc2vec.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050,2051,2052,2053,2054,2055,2056,2057,2058,2059,2060,2061,2062,2063,2064,2065,2066,2067
0,-0.007631,0.006005,0.009475,0.00365,-0.004089,-0.003712,0.001394,-0.003281,-0.011028,0.005238,0.014233,0.011425,0.014261,-0.004799,-0.005316,-0.012596,0.002167,0.000155,0.000252,0.009134,-0.01348,0.001572,-0.001877,0.006405,-0.006217,-0.000382,0.010487,-0.000421,-0.016603,-0.002154,0.013707,-0.005498,0.002233,0.007472,0.000658,0.011201,0.019628,-0.00994,0.009133,-0.012483,...,0.012489,0.010397,-0.007625,0.006762,-0.003308,0.009922,0.007905,0.0175,0.005175,0.003696,0.002738,0.00915,-0.004021,-0.003042,-0.005925,-0.005859,0.005276,0.007416,-0.006204,-0.001264,0.003365,0.006162,0.016145,0.002875,0.004354,-0.000478,-0.00276,0.004886,-0.006677,-0.012795,0.002186,-0.001071,-0.016517,-0.009668,-0.00933,0.002357,0.014837,0.001956,0.014436,0.006098
1,-0.018372,0.015032,0.023063,0.008612,-0.009528,-0.00843,0.00311,-0.008768,-0.028096,0.012467,0.034962,0.027765,0.035525,-0.012269,-0.012762,-0.030813,0.005752,0.000506,-0.000102,0.021921,-0.033755,0.003556,-0.005473,0.016115,-0.015098,-0.000916,0.026283,-0.000484,-0.040533,-0.005188,0.034355,-0.01281,0.005278,0.018179,0.001202,0.027144,0.048267,-0.024878,0.022641,-0.030473,...,0.030443,0.026114,-0.018858,0.016151,-0.007502,0.023765,0.019032,0.04237,0.012435,0.009512,0.006739,0.022776,-0.009233,-0.00709,-0.014391,-0.014931,0.0137,0.018431,-0.015074,-0.003541,0.008436,0.014685,0.03947,0.006767,0.011381,-0.001421,-0.007261,0.011671,-0.015763,-0.031527,0.005524,-0.002855,-0.040349,-0.023457,-0.02256,0.005271,0.036973,0.004284,0.034803,0.015305
2,-0.019725,0.016239,0.024529,0.009104,-0.010408,-0.009274,0.003467,-0.009038,-0.029579,0.013217,0.037175,0.029659,0.037634,-0.012949,-0.01368,-0.032895,0.005857,0.000549,-7.3e-05,0.023625,-0.035725,0.003676,-0.005679,0.016981,-0.016096,-0.001087,0.028052,-0.000293,-0.043403,-0.005579,0.036452,-0.013749,0.005606,0.01953,0.001498,0.028734,0.051014,-0.02618,0.024086,-0.032018,...,0.032501,0.02775,-0.020045,0.016983,-0.00802,0.025394,0.020111,0.04525,0.013279,0.01013,0.006916,0.023914,-0.009925,-0.00766,-0.015485,-0.015925,0.014306,0.019586,-0.016201,-0.00363,0.008598,0.015675,0.041865,0.007367,0.012,-0.001265,-0.007699,0.012513,-0.016643,-0.033504,0.00548,-0.002704,-0.043033,-0.025421,-0.024057,0.00599,0.039146,0.004626,0.037477,0.016601
3,-0.020697,0.017193,0.026192,0.009311,-0.010738,-0.009771,0.00374,-0.009696,-0.031679,0.014177,0.039053,0.031107,0.040013,-0.013716,-0.01433,-0.034407,0.006428,0.000589,-2e-05,0.024513,-0.037478,0.003999,-0.005639,0.018014,-0.017164,-0.000961,0.029283,-0.00072,-0.045532,-0.005627,0.038482,-0.014514,0.005622,0.020195,0.001435,0.029693,0.052933,-0.027316,0.024852,-0.033809,...,0.034255,0.029005,-0.021272,0.017985,-0.00835,0.026748,0.021565,0.047803,0.013996,0.010708,0.007273,0.025441,-0.010597,-0.007837,-0.016245,-0.01671,0.014942,0.020455,-0.016636,-0.003936,0.00915,0.01661,0.044181,0.007455,0.012414,-0.001683,-0.007912,0.012982,-0.017819,-0.035065,0.005943,-0.002773,-0.045337,-0.026596,-0.025527,0.006168,0.041215,0.004743,0.039207,0.017334
4,-0.019685,0.016218,0.024868,0.008966,-0.010193,-0.009163,0.003409,-0.009406,-0.029616,0.013649,0.037367,0.029703,0.038131,-0.012883,-0.013781,-0.033067,0.005936,0.000856,4.7e-05,0.023576,-0.035764,0.0034,-0.005853,0.017227,-0.016292,-0.001048,0.02816,-0.000402,-0.043674,-0.005175,0.036753,-0.013737,0.005522,0.019614,0.001489,0.02859,0.05069,-0.026113,0.023941,-0.032117,...,0.032144,0.027303,-0.019969,0.016781,-0.008143,0.025282,0.019979,0.044775,0.012821,0.00979,0.006823,0.023823,-0.010093,-0.007515,-0.015293,-0.015872,0.014345,0.019326,-0.016009,-0.003532,0.008744,0.015847,0.042004,0.007255,0.011862,-0.001415,-0.007563,0.012166,-0.016803,-0.033232,0.005452,-0.00303,-0.042608,-0.024949,-0.024045,0.005627,0.038797,0.004456,0.037166,0.016471


In [None]:
doc2vec.shape

(14, 2068)